# 将 TSV 中的 mapping 结合起来 (Combine the mappings from the TSV files)

In [2]:
# Paths of the two source mapping tables (tab-separated files under data/).
app_tsv, bibli_tsv = (
    "data/app_body_mapping.tsv",
    "data/bibli_mapping.tsv",
)

In [4]:
# Shell preview: print the first lines of every file matching data/*_mapping.tsv
# to inspect the raw two-column, tab-separated layout before loading it.
!head  data/*_mapping.tsv

==> data/app_body_mapping.tsv <==
application-body/@lang	pat:ApplicationBody/@com:languageCode
application-body/@file	pat:ApplicationBody/@com:documentFileName
application-body/@file-reference-id	pat:ApplicationBody/@com:fileReferenceIdentifier
application-body/@country	pat:ApplicationBody/@com:receivingOffice
application-body/@status	pat:ApplicationBody/@pat:applicationBodyStatus
application-body/doc-page	pat:ApplicationBody/pat:DocumentURI
application-body/description	pat:ApplicationBody/pat:Description
application-body/description/@id	pat:ApplicationBody/pat:Description/@com:id
application-body/description/@lang	pat:ApplicationBody/pat:Description/@com:languageCode
application-body/description/doc-page	pat:ApplicationBody/pat:Description/pat:DocumentURI

==> data/bibli_mapping.tsv <==
bibliographic-data/@id	pat:BibliographicData/@com:id
bibliographic-data/@country	pat:BibliographicData/@com:officeCode
bibliographic-data/plain-language-designation	pat:BibliographicData

In [5]:
import os

import pandas as pd

In [6]:
# Load the application-body mapping. The file is read as tab-separated with
# no header row, so the two columns are named explicitly.
app_df = pd.read_csv(
    app_tsv,
    sep="\t",
    header=None,
    names=["xpath_from", "xpath_to"],
)

In [7]:
# Show the first five rows of the freshly loaded application-body mapping.
app_df.head(n=5)

Unnamed: 0,xpath_from,xpath_to
0,application-body/@lang,pat:ApplicationBody/@com:languageCode
1,application-body/@file,pat:ApplicationBody/@com:documentFileName
2,application-body/@file-reference-id,pat:ApplicationBody/@com:fileReferenceIdentifier
3,application-body/@country,pat:ApplicationBody/@com:receivingOffice
4,application-body/@status,pat:ApplicationBody/@pat:applicationBodyStatus


In [8]:
# Rewrite every occurrence of "pat:ApplicationBody" in the target XPaths to
# "pat:PatentPublication". regex=False forces a literal substring replacement,
# so the result does not depend on the pandas version's default for `regex`
# (the default changed from True to False in pandas 2.0).
app_df['xpath_to'] = app_df['xpath_to'].str.replace(
    "pat:ApplicationBody", "pat:PatentPublication", regex=False
)

In [10]:
# Rewrite every occurrence of "application-body" in the source XPaths to
# "patent-document". regex=False forces a literal substring replacement,
# so the result does not depend on the pandas version's default for `regex`
# (the default changed from True to False in pandas 2.0).
app_df['xpath_from'] = app_df['xpath_from'].str.replace(
    "application-body", "patent-document", regex=False
)

In [11]:
# Display the application-body mapping after both XPath columns were rewritten.
app_df

Unnamed: 0,xpath_from,xpath_to
0,patent-document/@lang,pat:PatentPublication/@com:languageCode
1,patent-document/@file,pat:PatentPublication/@com:documentFileName
2,patent-document/@file-reference-id,pat:PatentPublication/@com:fileReferenceIdenti...
3,patent-document/@country,pat:PatentPublication/@com:receivingOffice
4,patent-document/@status,pat:PatentPublication/@pat:applicationBodyStatus
...,...,...
892,state,com:GeographicRegionName
893,pobox,com:AddressLineText
894,room,com:AddressLineText
895,county,com:GeographicRegionName


In [12]:
# Load the bibliographic-data mapping with the same layout as app_df:
# tab-separated, no header row, columns named explicitly.
bb_df = pd.read_csv(
    bibli_tsv,
    sep="\t",
    header=None,
    names=["xpath_from", "xpath_to"],
)

In [13]:
# Display the bibliographic-data mapping as loaded (no rewriting applied).
bb_df

Unnamed: 0,xpath_from,xpath_to
0,bibliographic-data/@id,pat:BibliographicData/@com:id
1,bibliographic-data/@country,pat:BibliographicData/@com:officeCode
2,bibliographic-data/plain-language-designation,pat:BibliographicData/pat:PlainLanguageDesigna...
3,bibliographic-data/plain-language-designation/...,pat:BibliographicData/pat:PlainLanguageDesigna...
4,bibliographic-data/publication-reference,pat:BibliographicData/pat:PatentPublicationIde...
...,...,...
530,address-floor,com:AddressLineText
531,building,com:AddressLineText
532,street,com:AddressLineText
533,address-4,com:AddressLineText


In [14]:
# Extra mapping row (source XPath, tab, target XPath) that links the
# bibliographic-data element to its parent on both sides of the mapping.
first_row = "\t".join(
    [
        "patent-document/bibliographic-data",
        "pat:PatentPublication/pat:BibliographicData",
    ]
)

In [15]:
from io import StringIO

In [16]:
# Persist the extra row as a one-line TSV so it can be concatenated in front
# of the other mapping files; print() appends the terminating newline.
with open("data/head_mapping.tsv", 'w') as head_file:
    print(first_row, file=head_file)

In [17]:
# Build element_mapping.tsv by concatenating, in this order: the one-line head
# mapping, the bibliographic mapping, and the application-body mapping.
# The raw TSV files are used here, not the app_df / bb_df frames loaded above.
# NOTE(review): cat joins the files byte-for-byte; if an input file does not end
# with a newline, its last row merges with the first row of the next file.
!cat data/head_mapping.tsv data/bibli_mapping.tsv data/app_body_mapping.tsv > element_mapping.tsv

In [18]:
# Build element_value_mapping.tsv by concatenating the bibliographic and the
# application-body value-mapping files, in that order.
# NOTE(review): cat joins the files byte-for-byte; if the first file does not
# end with a newline, its last row merges with the first row of the second.
!cat data/bibli_mapping_value.tsv data/app_body_mapping_value.tsv > element_value_mapping.tsv

In [19]:
# Shell preview: print the first lines of both merged files to check the
# concatenation result.
!head  element_mapping.tsv element_value_mapping.tsv

==> element_mapping.tsv <==
patent-document/bibliographic-data	pat:PatentPublication/pat:BibliographicData
bibliographic-data/@id	pat:BibliographicData/@com:id
bibliographic-data/@country	pat:BibliographicData/@com:officeCode
bibliographic-data/plain-language-designation	pat:BibliographicData/pat:PlainLanguageDesignationText
bibliographic-data/plain-language-designation/@lang	pat:BibliographicData/pat:PlainLanguageDesignationText/@com:languageCode
bibliographic-data/publication-reference	pat:BibliographicData/pat:PatentPublicationIdentification
bibliographic-data/classification-ipc	pat:BibliographicData/pat:PatentClassificationBag/pat:IPCClassification
bibliographic-data/classifications-ipcr	pat:BibliographicData/pat:PatentClassificationBag/pat:IPCRClassificationBag
bibliographic-data/classifications-ipcr/classification-ipcr	pat:BibliographicData/pat:PatentClassificationBag/pat:IPCRClassificationBag/pat:IPCRClassification
bibliographic-data/classification-national	pat:Bibliog

In [20]:
# Load the merged element mapping (head row + bibliographic + application-body)
# as a two-column frame; the file has no header row.
df = pd.read_csv(
    "element_mapping.tsv",
    sep="\t",
    header=None,
    names=["xpath_from", "xpath_to"],
)

In [21]:
# The merged file was concatenated from the raw TSVs, so the root rewrite has
# to be applied to the merged frame here: "pat:ApplicationBody" becomes
# "pat:PatentPublication" in the target XPaths and "application-body" becomes
# "patent-document" in the source XPaths.
# regex=False forces literal substring replacement, so the result does not
# depend on the pandas version's default for `regex` (changed in pandas 2.0).
df['xpath_to'] = df['xpath_to'].str.replace(
    "pat:ApplicationBody", "pat:PatentPublication", regex=False
)
df['xpath_from'] = df['xpath_from'].str.replace(
    "application-body", "patent-document", regex=False
)

In [22]:
# Display the merged element mapping after the XPath rewrite.
df


Unnamed: 0,xpath_from,xpath_to
0,patent-document/bibliographic-data,pat:PatentPublication/pat:BibliographicData
1,bibliographic-data/@id,pat:BibliographicData/@com:id
2,bibliographic-data/@country,pat:BibliographicData/@com:officeCode
3,bibliographic-data/plain-language-designation,pat:BibliographicData/pat:PlainLanguageDesigna...
4,bibliographic-data/plain-language-designation/...,pat:BibliographicData/pat:PlainLanguageDesigna...
...,...,...
1428,state,com:GeographicRegionName
1429,pobox,com:AddressLineText
1430,room,com:AddressLineText
1431,county,com:GeographicRegionName


In [25]:
# Load the merged value mapping: tab-separated, no header row, four columns
# named explicitly (source/target element and source/target value).
value_df = pd.read_csv(
    "element_value_mapping.tsv",
    sep="\t",
    header=None,
    names=[
        "from_element",
        "to_element",
        "from_value",
        "to_value",
    ],
)

In [28]:
# Export the value mapping as CSV without the index column.
# The output directory is created first so the export also works when
# data/mapping_table/ does not exist yet (to_csv does not create directories).
os.makedirs("data/mapping_table", exist_ok=True)
value_df.to_csv("data/mapping_table/element_value_mapping.csv", index=False)


In [29]:
# Export the merged element mapping as CSV without the index column.
# The output directory is created first so the export also works when
# data/mapping_table/ does not exist yet (to_csv does not create directories).
os.makedirs("data/mapping_table", exist_ok=True)
df.to_csv("data/mapping_table/element_mapping.csv", index=False)