In [1]:
import pandas as pd

### Load data

In [2]:
inventor = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_inventor_not_disambiguated.tsv.zip", sep="\t", dtype=str, compression="zip")
attorney = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_attorney_not_disambiguated.tsv.zip", sep="\t", dtype=str, compression="zip")
location = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_location_not_disambiguated.tsv.zip", sep="\t", dtype=str, compression="zip")
patent = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_patent.tsv.zip", sep="\t", dtype=str, compression="zip")
application = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_application.tsv.zip", sep="\t", dtype=str, compression="zip")
assignee = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_assignee_not_disambiguated.tsv.zip", sep="\t", dtype=str, compression="zip")
cpc_current = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_cpc_current.tsv.zip", sep="\t", dtype=str, compression="zip")
persistent_inventor = pd.read_csv("https://s3.amazonaws.com/data.patentsview.org/download/g_persistent_inventor.tsv.zip", sep="\t", dtype=str, compression="zip")

In [3]:
from pv_evaluation.benchmark import load_binette_2022_inventors_benchmark

binette_benchmark = load_binette_2022_inventors_benchmark()

Add "mention_id"

In [4]:
persistent_inventor['mention_id'] = "US" + persistent_inventor["patent_id"] + "-" + persistent_inventor["sequence"]
inventor['mention_id'] = "US" + inventor["patent_id"] + "-" + inventor["inventor_sequence"]

Add block IDs

In [5]:
inventor["block"] = inventor["inventor_id"].str.replace("-[1-9]+", "", regex=True)

### Subset inventor mentions to blocks which intersect benchmark

In [6]:
pv_disamb = persistent_inventor.set_index("mention_id")["disamb_inventor_id_20211230"]

intersecting_blocks = inventor.set_index("mention_id").loc[binette_benchmark.index.values, "block"]

# Subset to sampled blocks
inventor_subset = inventor[inventor["block"].isin(intersecting_blocks)]

# Subset to inventor mentions which appear in the 2022/12/31 disambiguation
inventor_subset = inventor_subset[inventor_subset["mention_id"].isin(pv_disamb.index)]

### Add features

#### Ground truth

In [7]:
# Add ground truth clusters as "unique_id"
inventor_subset = inventor_subset.merge(binette_benchmark.reset_index(), on="mention_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,
...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,


#### Location

In [8]:
inventor_subset = inventor_subset.merge(location, on="rawlocation_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,location_id,raw_city,raw_state,raw_country
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,1d2251c8-16c8-11ed-9b5f-1234bde3cd05,Yokohama,,JPX
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,13f05eea-16c8-11ed-9b5f-1234bde3cd05,San Jose,CA,US
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,3eea8bf8-16c8-11ed-9b5f-1234bde3cd05,Suwon-si,,KR
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,1aeb3c64-16c8-11ed-9b5f-1234bde3cd05,Seoul,,KR
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,210d529a-16c8-11ed-9b5f-1234bde3cd05,Narashino,,JP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,11f50cd8-16c8-11ed-9b5f-1234bde3cd05,Susono,,JP
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,9df01a7b-16c8-11ed-9b5f-1234bde3cd05,Akashi,,JP
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,42f90495-16c8-11ed-9b5f-1234bde3cd05,Tokyo,,JP
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,93778f03-16c8-11ed-9b5f-1234bde3cd05,Shenzhen,,CN


#### Attorney

In [9]:
attorney_subset = attorney[attorney["patent_id"].isin(inventor_subset["patent_id"])]
attorney_by_patent = attorney_subset.groupby("patent_id").agg({
    "raw_attorney_name_first": list,
    "raw_attorney_name_last": list,
    "raw_attorney_organization": list,
    "attorney_country": list,
    "attorney_sequence": list
})
inventor_subset = inventor_subset.merge(attorney_by_patent, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,location_id,raw_city,raw_state,raw_country,raw_attorney_name_first,raw_attorney_name_last,raw_attorney_organization,attorney_country,attorney_sequence
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,1d2251c8-16c8-11ed-9b5f-1234bde3cd05,Yokohama,,JPX,[nan],[nan],"[Fitzpatrick, Cella, Harper & Scinto]",[nan],[0]
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,13f05eea-16c8-11ed-9b5f-1234bde3cd05,San Jose,CA,US,"[nan, Steve]","[nan, Mendelsohn]","[Mendelsohn, Drucker & Associates, P.C., nan]","[nan, nan]","[0, 1]"
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,3eea8bf8-16c8-11ed-9b5f-1234bde3cd05,Suwon-si,,KR,[nan],[nan],[K&L Gates LLP],[nan],[0]
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,1aeb3c64-16c8-11ed-9b5f-1234bde3cd05,Seoul,,KR,[nan],[nan],[Cantor Colburn LLP],[nan],[0]
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,210d529a-16c8-11ed-9b5f-1234bde3cd05,Narashino,,JP,[nan],[nan],"[Sughrue, Mion, Zinn, Macpeak & Seas]",[nan],[0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,11f50cd8-16c8-11ed-9b5f-1234bde3cd05,Susono,,JP,[nan],[nan],"[Cushman, Darby & Cushman]",[nan],[0]
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,9df01a7b-16c8-11ed-9b5f-1234bde3cd05,Akashi,,JP,[nan],[nan],"[Oblon, Fisher, Spivak, McClelland & Maier]",[nan],[0]
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,42f90495-16c8-11ed-9b5f-1234bde3cd05,Tokyo,,JP,[nan],[nan],[Duane Morris LLP],[nan],[0]
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,93778f03-16c8-11ed-9b5f-1234bde3cd05,Shenzhen,,CN,[nan],[nan],"[Conley Rose, P.C.]",[nan],[0]


#### Patent information

In [10]:
patent_subset = patent[patent["patent_id"].isin(inventor_subset["patent_id"])]
inventor_subset = inventor_subset.merge(patent_subset, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,...,attorney_country,attorney_sequence,patent_type,patent_date,patent_title,patent_abstract,wipo_kind,num_claims,withdrawn,filename
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,...,[nan],[0],utility,1998-10-27,Recording apparatus with compensation for vari...,A recording apparatus employs a control unit ...,A,8,0,pftaps19981027_wk43.zip
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,...,"[nan, nan]","[0, 1]",utility,2011-10-04,Frequency-based approach for detection and cla...,"In a hard-disc drive read channel, frequency-b...",B2,23,0,ipg111004.xml
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,...,[nan],[0],utility,2020-06-23,Cryogenic cooling apparatus and connecting str...,The present invention relates to a cryogenic c...,B2,20,0,ipg200623.xml
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,...,[nan],[0],utility,2011-07-12,Liquid crystals and liquid crystal display app...,A liquid crystal includes about 50 wt % to abo...,B2,14,0,ipg110712.xml
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,...,[nan],[0],utility,1991-12-17,Method for joining metallic members,A method for joining a porous metallic member...,A,2,0,pftaps19911217_wk51.zip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,...,[nan],[0],utility,1988-12-27,Electromagnetic coupling,An electromagnetic coupling selectively trans...,A,8,0,pftaps19881227_wk52.zip
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,...,[nan],[0],utility,1987-06-16,Method of analyzing oxygen or nitrogen contain...,A method of quantitatively analyzing oxygen o...,A,4,0,pftaps19870616_wk24.zip
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,...,[nan],[0],utility,2017-08-22,"Information input/output device, and medium, u...",To achieve an information output device in whi...,B2,11,0,ipg170822.xml
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,...,[nan],[0],utility,2019-01-08,Network security method and device,A network security method and a device relatin...,B2,14,0,ipg190108.xml


#### Application

In [11]:
application_subset = application[application["patent_id"].isin(inventor_subset["patent_id"])]
inventor_subset = inventor_subset.merge(application_subset, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,...,patent_abstract,wipo_kind,num_claims,withdrawn,filename,application_id,patent_application_type,filing_date,series_code,rule_47_flag
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,...,A recording apparatus employs a control unit ...,A,8,0,pftaps19981027_wk43.zip,8,08,1994-12-06,08,0.0
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,...,"In a hard-disc drive read channel, frequency-b...",B2,23,0,ipg111004.xml,2010/12707820,12,2010-02-18,12,0.0
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,...,The present invention relates to a cryogenic c...,B2,20,0,ipg200623.xml,2014/15526035,15,2014-11-11,15,0.0
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,...,A liquid crystal includes about 50 wt % to abo...,B2,14,0,ipg110712.xml,2008/12346218,12,2008-12-30,12,0.0
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,...,A method for joining a porous metallic member...,A,2,0,pftaps19911217_wk51.zip,07/417945,07,1989-10-06,07,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,...,An electromagnetic coupling selectively trans...,A,8,0,pftaps19881227_wk52.zip,07/080007,07,1987-07-31,07,0.0
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,...,A method of quantitatively analyzing oxygen o...,A,4,0,pftaps19870616_wk24.zip,06/739271,06,1985-05-30,06,0.0
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,...,To achieve an information output device in whi...,B2,11,0,ipg170822.xml,2016/15054599,15,2016-02-26,15,0.0
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,...,A network security method and a device relatin...,B2,14,0,ipg190108.xml,2016/15153195,15,2016-05-12,15,0.0


#### Assignees

In [12]:
assignee_subset = assignee[assignee["patent_id"].isin(inventor_subset["patent_id"])]

assignee_by_patent = assignee_subset.groupby("patent_id").agg({
    "assignee_sequence":list,
    "raw_assignee_individual_name_first":list,
    "raw_assignee_individual_name_last":list,
    "raw_assignee_organization": list,
    "assignee_type":list
})
inventor_subset = inventor_subset.merge(assignee_by_patent, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,...,application_id,patent_application_type,filing_date,series_code,rule_47_flag,assignee_sequence,raw_assignee_individual_name_first,raw_assignee_individual_name_last,raw_assignee_organization,assignee_type
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,...,8,08,1994-12-06,08,0.0,[0],[nan],[nan],[Canon Kabushiki Kaisha],[3.0]
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,...,2010/12707820,12,2010-02-18,12,0.0,[0],[nan],[nan],[LSI Corporation],[2.0]
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,...,2014/15526035,15,2014-11-11,15,0.0,[0],[nan],[nan],[LS CABLE & SYSTEM LTD.],[3.0]
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,...,2008/12346218,12,2008-12-30,12,0.0,[0],[nan],[nan],"[Samsung Electronics Co., Ltd.]",[3.0]
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,...,07/417945,07,1989-10-06,07,0.0,[0],[nan],[nan],"[NDC Co., Ltd.]",[3.0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,...,07/080007,07,1987-07-31,07,0.0,"[0, 2, 1]","[nan, nan, nan]","[nan, nan, nan]","[Nippondenso Co., Ltd., Bridgestone Corporatio...","[3.0, 3.0, 3.0]"
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,...,06/739271,06,1985-05-30,06,0.0,[0],[nan],[nan],[Kabushiki Kaisha Kobe Seiko Sho],[3.0]
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,...,2016/15054599,15,2016-02-26,15,0.0,,,,,
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,...,2016/15153195,15,2016-05-12,15,0.0,[0],[nan],[nan],"[HUAWEI TECHNOLOGIES CO., LTD.]",[3.0]


#### CPC

In [13]:
cpc_current_subset = cpc_current[cpc_current["patent_id"].isin(inventor_subset["patent_id"])]
cpc_by_patent = cpc_current_subset.groupby("patent_id").agg({
    "cpc_sequence":list,
    "cpc_section": list,
    "cpc_class": list,
    "cpc_subclass": list,
    "cpc_group": list,
    "cpc_type": list,
})
inventor_subset = inventor_subset.merge(cpc_by_patent, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,...,raw_assignee_individual_name_first,raw_assignee_individual_name_last,raw_assignee_organization,assignee_type,cpc_sequence,cpc_section,cpc_class,cpc_subclass,cpc_group,cpc_type
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,...,[nan],[nan],[Canon Kabushiki Kaisha],[3.0],"[6, 2, 0, 1, 5, 9, 8, 7, 4, 10, 3]","[H, H, H, H, H, H, H, H, H, H, H]","[H04, H04, H04, H04, H04, H04, H04, H04, H04, ...","[H04N, H04N, H04N, H04N, H04N, H04N, H04N, H04...","[H04N2201/04724, H04N1/12, H04N1/506, H04N1/04...","[additional, additional, inventional, inventio..."
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,...,[nan],[nan],[LSI Corporation],[2.0],"[2, 1, 0]","[G, G, G]","[G11, G11, G11]","[G11B, G11B, G11B]","[G11B27/36, G11B19/041, G11B20/1816]","[inventional, inventional, inventional]"
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,...,[nan],[nan],[LS CABLE & SYSTEM LTD.],[3.0],"[5, 2, 1, 0, 3, 4]","[Y, F, H, F, F, H]","[Y02, F25, H01, F25, F25, H01]","[Y02E, F25B, H01B, F25B, F25D, H01F]","[Y02E40/60, F25B9/00, H01B12/16, F25B9/002, F2...","[additional, inventional, inventional, inventi..."
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,...,[nan],[nan],"[Samsung Electronics Co., Ltd.]",[3.0],"[3, 2, 9, 4, 7, 8, 11, 1, 0, 6, 10, 12, 5]","[C, G, C, C, C, C, G, C, C, C, G, G, C]","[C09, G02, C09, C09, C09, C09, G02, C09, C09, ...","[C09K, G02F, C09K, C09K, C09K, C09K, G02F, C09...","[C09K2019/0407, G02F1/137, C09K2323/03, C09K20...","[additional, inventional, additional, addition..."
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,...,[nan],[nan],"[NDC Co., Ltd.]",[3.0],"[0, 1]","[B, B]","[B23, B23]","[B23K, B23K]","[B23K11/20, B23K11/14]","[inventional, inventional]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,...,"[nan, nan, nan]","[nan, nan, nan]","[Nippondenso Co., Ltd., Bridgestone Corporatio...","[3.0, 3.0, 3.0]","[2, 4, 1, 0, 3]","[F, F, F, F, F]","[F16, F16, F16, F16, F16]","[F16D, F16D, F16D, F16D, F16D]","[F16D27/112, F16D2027/008, F16D3/68, F16D3/76,...","[inventional, additional, inventional, inventi..."
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,...,[nan],[nan],[Kabushiki Kaisha Kobe Seiko Sho],[3.0],[0],[G],[G01],[G01N],[G01N33/2025],[inventional]
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,...,,,,,"[12, 4, 17, 15, 5, 18, 9, 13, 0, 10, 11, 19, 1...","[G, A, A, H, A, A, G, H, G, G, G, A, A, G, G, ...","[G07, A63, A63, H04, A63, A63, G06, H04, G06, ...","[G07F, A63F, A63F, H04N, A63F, A63F, G06K, H04...","[G07F17/322, A63F13/213, A63F2001/008, H04N5/2...","[inventional, inventional, additional, inventi..."
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,...,[nan],[nan],"[HUAWEI TECHNOLOGIES CO., LTD.]",[3.0],"[0, 1, 2, 3]","[H, H, H, H]","[H04, H04, H04, H04]","[H04L, H04L, H04L, H04L]","[H04L63/20, H04L63/0218, H04L63/0236, H04L63/104]","[inventional, inventional, inventional, additi..."


#### Co-inventors

In [14]:
coinventors_subset = inventor[inventor["patent_id"].isin(inventor_subset["patent_id"])]
coinventors_by_patent = coinventors_subset.groupby("patent_id").agg({
    "inventor_sequence":list,
    "raw_inventor_name_first": list,
    "raw_inventor_name_last": list,
})
coinventors_by_patent = coinventors_by_patent.rename(columns={
    "inventor_sequence": "coinventor_sequence",
    "raw_inventor_name_first": "coinventor_name_first",
    "raw_inventor_name_last": "coinventor_name_last"
    })
inventor_subset = inventor_subset.merge(coinventors_by_patent, on="patent_id", how="left")
inventor_subset

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,mention_id,block,unique_id,...,assignee_type,cpc_sequence,cpc_section,cpc_class,cpc_subclass,cpc_group,cpc_type,coinventor_sequence,coinventor_name_first,coinventor_name_last
0,5828387,4,fl:ha_ln:takahashi-18,Haruhiko,Takahashi,FALSE,up2wnx1jqb31andxw48p7h0w5,US5828387-4,fl:ha_ln:takahashi,,...,[3.0],"[6, 2, 0, 1, 5, 9, 8, 7, 4, 10, 3]","[H, H, H, H, H, H, H, H, H, H, H]","[H04, H04, H04, H04, H04, H04, H04, H04, H04, ...","[H04N, H04N, H04N, H04N, H04N, H04N, H04N, H04...","[H04N2201/04724, H04N1/12, H04N1/506, H04N1/04...","[additional, additional, inventional, inventio...","[4, 5, 2, 3, 0, 1, 6]","[Haruhiko, Makoto, Hidejiro, Ken, Masafumi, To...","[Takahashi, Takamiya, Kadowaki, Tsuchii, Watay..."
1,8031420,4,fl:yu_ln:lee-197,Yuan Xing,Lee,FALSE,1lost5de91g151xmycuyw2o6k,US8031420-4,fl:yu_ln:lee,,...,[2.0],"[2, 1, 0]","[G, G, G]","[G11, G11, G11]","[G11B, G11B, G11B]","[G11B27/36, G11B19/041, G11B20/1816]","[inventional, inventional, inventional]","[4, 3, 1, 2, 0]","[Yuan Xing, Zongwang, Yang, Shaohua, George]","[Lee, Li, Han, Yang, Mathew]"
2,10692631,0,fl:se_ln:lee-473,Seok Ju,Lee,FALSE,qguhzxr55o7oegny2obuiuwac,US10692631-0,fl:se_ln:lee,,...,[3.0],"[5, 2, 1, 0, 3, 4]","[Y, F, H, F, F, H]","[Y02, F25, H01, F25, F25, H01]","[Y02E, F25B, H01B, F25B, F25D, H01F]","[Y02E40/60, F25B9/00, H01B12/16, F25B9/002, F2...","[additional, inventional, inventional, inventi...","[0, 3, 1, 2, 4]","[Seok Ju, Heo Gyung, Han Joong, Chang Yeol, Ji...","[Lee, Sung, Kim, Choi, Na]"
3,7976910,2,fl:se_ln:lee-37,Seong-Nam,Lee,FALSE,ccsmmyiqezysgon3iczipzgg7,US7976910-2,fl:se_ln:lee,,...,[3.0],"[3, 2, 9, 4, 7, 8, 11, 1, 0, 6, 10, 12, 5]","[C, G, C, C, C, C, G, C, C, C, G, G, C]","[C09, G02, C09, C09, C09, C09, G02, C09, C09, ...","[C09K, G02F, C09K, C09K, C09K, C09K, G02F, C09...","[C09K2019/0407, G02F1/137, C09K2323/03, C09K20...","[additional, inventional, additional, addition...","[2, 0, 1, 3]","[Seong-Nam, Seon-Ah, Jang-Hyun, Jeong-Uk]","[Lee, Cho, Kim, Heo]"
4,5073693,0,fl:hi_ln:kikuchi-44,Hiroyoshi,Kikuchi,FALSE,2kl0hp6x059e2k3ta5yzvzlu2,US5073693-0,fl:hi_ln:kikuchi,,...,[3.0],"[0, 1]","[B, B]","[B23, B23]","[B23K, B23K]","[B23K11/20, B23K11/14]","[inventional, inventional]","[0, 1, 2]","[Hiroyoshi, Hiroto, Syunsuke]","[Kikuchi, Kosuge, Suzuki]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133536,4793455,2,fl:ki_ln:kato-42,Kichiro,Kato,FALSE,auiqhejy00iud5vbq6tzb38p7,US4793455-2,fl:ki_ln:kato,,...,"[3.0, 3.0, 3.0]","[2, 4, 1, 0, 3]","[F, F, F, F, F]","[F16, F16, F16, F16, F16]","[F16D, F16D, F16D, F16D, F16D]","[F16D27/112, F16D2027/008, F16D3/68, F16D3/76,...","[inventional, additional, inventional, inventi...","[3, 0, 1, 2]","[Tomoyuki, Yasuo, Masao, Kichiro]","[Kurata, Tabuchi, Nakano, Kato]"
133537,4673655,2,fl:hi_ln:yamada-29,Hiromichi,Yamada,FALSE,fzfi1c54nh8bzsf9cnthxxb80,US4673655-2,fl:hi_ln:yamada,,...,[3.0],[0],[G],[G01],[G01N],[G01N33/2025],[inventional],"[3, 4, 1, 0, 2]","[Hiromi, Masami, Kiyoshi, Masayoshi, Hiromichi]","[Umeda, Tomimoto, Matsuda, Okamura, Yamada]"
133538,9740948,0,fl:ke_ln:yoshida-46,Kenji,Yoshida,FALSE,zy8l73yf68ua6rjprzjkiicg6,US9740948-0,fl:ke_ln:yoshida,,...,,"[12, 4, 17, 15, 5, 18, 9, 13, 0, 10, 11, 19, 1...","[G, A, A, H, A, A, G, H, G, G, G, A, A, G, G, ...","[G07, A63, A63, H04, A63, A63, G06, H04, G06, ...","[G07F, A63F, A63F, H04N, A63F, A63F, G06K, H04...","[G07F17/322, A63F13/213, A63F2001/008, H04N5/2...","[inventional, inventional, additional, inventi...",[0],[Kenji],[Yoshida]
133539,10178129,2,fl:ju_ln:li-112,Jun,Li,FALSE,j95fb46xte3bfd4183j6np00o,US10178129-2,fl:ju_ln:li,,...,[3.0],"[0, 1, 2, 3]","[H, H, H, H]","[H04, H04, H04, H04]","[H04L, H04L, H04L, H04L]","[H04L63/20, H04L63/0218, H04L63/0236, H04L63/104]","[inventional, inventional, inventional, additi...","[0, 1, 2]","[Yibin, Bing, Jun]","[Xu, Sun, Li]"


## Export

#### Base data

In [15]:
data_columns = [
    "mention_id",
    "block",
    "patent_id",
    "inventor_sequence",
    "raw_inventor_name_first",
    "raw_inventor_name_last",
    "deceased_flag",
    "raw_city",
    "raw_state",
    "raw_country",
    "raw_attorney_name_first",
    "raw_attorney_name_last",
    "raw_attorney_organization",
    "attorney_country",
    "attorney_sequence",
    "patent_type",
    "patent_date",
    "patent_title",
    "patent_abstract",
    "wipo_kind",
    "num_claims",
    "withdrawn",
    "patent_application_type",
    "filing_date",
    "series_code",
    "rule_47_flag",
    "assignee_sequence",
    "raw_assignee_individual_name_first",
    "raw_assignee_individual_name_last",
    "raw_assignee_organization",
    "assignee_type",
    "cpc_sequence",
    "cpc_section",
    "cpc_class",
    "cpc_subclass",
    "cpc_group",
    "cpc_type",
    "coinventor_sequence",
    "coinventor_name_first",
    "coinventor_name_last"
]

inventor_subset[data_columns].to_parquet("pv-data.parquet", engine="pyarrow", index=False)

#### Reference disambiguation

In [16]:
inventor_subset[["mention_id", "unique_id"]].to_parquet("pv-reference.parquet", engine="pyarrow", index=False)

#### Predictions

In [17]:
persistent_inventor_subset = persistent_inventor[persistent_inventor["mention_id"].isin(inventor_subset["mention_id"])]

In [18]:
persistent_inventor_subset.to_parquet("pv-predictions.parquet", engine="pyarrow", index=False)