In [1]:
import pandas as pd

### Load data

In [2]:
def _read_zipped_tsv(url):
    """Read a zip-compressed, tab-separated file into a DataFrame.

    Every column is parsed with ``dtype=str``.
    """
    return pd.read_csv(url, sep="\t", dtype=str, compression="zip")


# One frame per PatentsView bulk-download table used below.
inventor = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_inventor_not_disambiguated.tsv.zip")
attorney = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_attorney_not_disambiguated.tsv.zip")
location = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_location_not_disambiguated.tsv.zip")
patent = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_patent.tsv.zip")
application = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_application.tsv.zip")
assignee = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_assignee_not_disambiguated.tsv.zip")
cpc_current = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_cpc_current.tsv.zip")
persistent_inventor = _read_zipped_tsv("https://s3.amazonaws.com/data.patentsview.org/download/g_persistent_inventor.tsv.zip")

In [None]:
from pv_evaluation.benchmark import load_binette_2022_inventors_benchmark

# Load the Binette 2022 inventors benchmark. Later cells use its index as
# mention IDs and merge it (after reset_index) on "mention_id".
binette_benchmark = load_binette_2022_inventors_benchmark()

Add a "mention_id" column (`US<patent_id>-<sequence>`) to the persistent inventor and inventor tables

In [None]:
def _make_mention_id(patent_ids, sequences):
    """Build mention IDs of the form ``US<patent_id>-<sequence>`` element-wise."""
    return "US" + patent_ids + "-" + sequences


# The sequence column is named differently in the two tables.
persistent_inventor["mention_id"] = _make_mention_id(
    persistent_inventor["patent_id"], persistent_inventor["sequence"]
)
inventor["mention_id"] = _make_mention_id(
    inventor["patent_id"], inventor["inventor_sequence"]
)

Add block IDs (the inventor ID with its "-<number>" part removed)

In [None]:
# Derive each mention's block by removing the "-<number>" part of inventor_id.
# The digit class must be [0-9]: with [1-9] the match stops at the first zero,
# so an id ending in "-10" would turn into "...0" instead of losing the suffix.
inventor["block"] = inventor["inventor_id"].str.replace(r"-[0-9]+", "", regex=True)

### Subset inventor mentions to blocks which intersect benchmark

In [None]:
# "disamb_inventor_id_20211230" values of persistent_inventor, keyed by mention_id.
pv_disamb = persistent_inventor.set_index("mention_id")["disamb_inventor_id_20211230"]

# Blocks of the mentions listed in the benchmark's index.
benchmark_mention_ids = binette_benchmark.index.values
intersecting_blocks = inventor.set_index("mention_id").loc[benchmark_mention_ids, "block"]

# Keep every inventor mention that falls in one of those blocks.
in_sampled_block = inventor["block"].isin(intersecting_blocks)
inventor_subset = inventor.loc[in_sampled_block]

# Keep only mentions whose mention_id is present in the index of pv_disamb.
# NOTE(review): the original comment here referred to the 2022/12/31
# disambiguation, while the selected column is dated 20211230 and rows with a
# missing value in that column are not dropped -- confirm which is intended.
has_disamb_entry = inventor_subset["mention_id"].isin(pv_disamb.index)
inventor_subset = inventor_subset.loc[has_disamb_entry]

### Add features

#### Ground truth

In [None]:
# Left-join the benchmark on "mention_id" so the ground truth clusters are
# available as "unique_id"; every row of inventor_subset is kept.
benchmark_clusters = binette_benchmark.reset_index()
inventor_subset = pd.merge(inventor_subset, benchmark_clusters, how="left", on="mention_id")
inventor_subset

#### Location

In [None]:
# Attach the location table's columns to each mention via "rawlocation_id".
inventor_subset = pd.merge(inventor_subset, location, how="left", on="rawlocation_id")
inventor_subset

#### Attorney

In [None]:
# Attorney columns collected into one list per patent.
attorney_list_columns = [
    "raw_attorney_name_first",
    "raw_attorney_name_last",
    "raw_attorney_organization",
    "attorney_country",
    "attorney_sequence",
]

# Only attorneys on patents that appear in the inventor subset.
attorney_on_subset_patent = attorney["patent_id"].isin(inventor_subset["patent_id"])
attorney_subset = attorney.loc[attorney_on_subset_patent]

attorney_by_patent = attorney_subset.groupby("patent_id").agg(
    {column: list for column in attorney_list_columns}
)
inventor_subset = pd.merge(inventor_subset, attorney_by_patent, how="left", on="patent_id")
inventor_subset

#### Patent information

In [None]:
# Restrict the patent table to patents in the inventor subset, then left-join it.
patent_in_subset = patent["patent_id"].isin(inventor_subset["patent_id"])
patent_subset = patent.loc[patent_in_subset]
inventor_subset = pd.merge(inventor_subset, patent_subset, how="left", on="patent_id")
inventor_subset

#### Application

In [None]:
# Restrict the application table to patents in the inventor subset, then left-join it.
application_in_subset = application["patent_id"].isin(inventor_subset["patent_id"])
application_subset = application.loc[application_in_subset]
inventor_subset = pd.merge(inventor_subset, application_subset, how="left", on="patent_id")
inventor_subset

#### Assignees

In [None]:
# Assignee columns collected into one list per patent.
assignee_list_columns = [
    "assignee_sequence",
    "raw_assignee_individual_name_first",
    "raw_assignee_individual_name_last",
    "raw_assignee_organization",
    "assignee_type",
]

# Only assignees on patents that appear in the inventor subset.
assignee_on_subset_patent = assignee["patent_id"].isin(inventor_subset["patent_id"])
assignee_subset = assignee.loc[assignee_on_subset_patent]

assignee_by_patent = assignee_subset.groupby("patent_id").agg(
    {column: list for column in assignee_list_columns}
)
inventor_subset = pd.merge(inventor_subset, assignee_by_patent, how="left", on="patent_id")
inventor_subset

#### CPC

In [None]:
# CPC columns collected into one list per patent.
cpc_list_columns = [
    "cpc_sequence",
    "cpc_section",
    "cpc_class",
    "cpc_subclass",
    "cpc_group",
    "cpc_type",
]

# Only CPC rows for patents that appear in the inventor subset.
cpc_on_subset_patent = cpc_current["patent_id"].isin(inventor_subset["patent_id"])
cpc_current_subset = cpc_current.loc[cpc_on_subset_patent]

cpc_by_patent = cpc_current_subset.groupby("patent_id").agg(
    {column: list for column in cpc_list_columns}
)
inventor_subset = pd.merge(inventor_subset, cpc_by_patent, how="left", on="patent_id")
inventor_subset

#### Co-inventors

In [None]:
# Source column in the inventor table -> name of the per-patent list column.
coinventor_column_names = {
    "inventor_sequence": "coinventor_sequence",
    "raw_inventor_name_first": "coinventor_name_first",
    "raw_inventor_name_last": "coinventor_name_last",
}

# All inventor rows (from the full table) on patents present in the subset.
on_subset_patent = inventor["patent_id"].isin(inventor_subset["patent_id"])
coinventors_subset = inventor.loc[on_subset_patent]

# One list per patent and column, then rename to the coinventor_* names.
coinventors_by_patent = coinventors_subset.groupby("patent_id").agg(
    {column: list for column in coinventor_column_names}
)
coinventors_by_patent = coinventors_by_patent.rename(columns=coinventor_column_names)

inventor_subset = pd.merge(inventor_subset, coinventors_by_patent, how="left", on="patent_id")
inventor_subset

## Export

#### Base data

In [None]:
# Exported columns, grouped by the table they were merged in from.
mention_columns = ["mention_id", "block", "patent_id"]
inventor_columns = [
    "inventor_sequence",
    "raw_inventor_name_first",
    "raw_inventor_name_last",
    "deceased_flag",
]
location_columns = ["raw_city", "raw_state", "raw_country"]
attorney_columns = [
    "raw_attorney_name_first",
    "raw_attorney_name_last",
    "raw_attorney_organization",
    "attorney_country",
    "attorney_sequence",
]
patent_columns = [
    "patent_type",
    "patent_date",
    "patent_title",
    "patent_abstract",
    "wipo_kind",
    "num_claims",
    "withdrawn",
]
application_columns = [
    "patent_application_type",
    "filing_date",
    "series_code",
    "rule_47_flag",
]
assignee_columns = [
    "assignee_sequence",
    "raw_assignee_individual_name_first",
    "raw_assignee_individual_name_last",
    "raw_assignee_organization",
    "assignee_type",
]
cpc_columns = [
    "cpc_sequence",
    "cpc_section",
    "cpc_class",
    "cpc_subclass",
    "cpc_group",
    "cpc_type",
]
coinventor_columns = [
    "coinventor_sequence",
    "coinventor_name_first",
    "coinventor_name_last",
]

# Concatenation order fixes the column order of the exported file.
data_columns = [
    *mention_columns,
    *inventor_columns,
    *location_columns,
    *attorney_columns,
    *patent_columns,
    *application_columns,
    *assignee_columns,
    *cpc_columns,
    *coinventor_columns,
]

# Write the selected columns to Parquet without the DataFrame index.
base_data = inventor_subset[data_columns]
base_data.to_parquet("pv-data.parquet", engine="pyarrow", index=False)

#### Reference disambiguation

In [None]:
# Export the mention -> "unique_id" mapping, without the DataFrame index.
reference_columns = ["mention_id", "unique_id"]
inventor_subset[reference_columns].to_parquet("pv-reference.parquet", engine="pyarrow", index=False)

#### Predictions

In [None]:
# Rows of persistent_inventor whose mention_id appears in the inventor subset.
is_exported_mention = persistent_inventor["mention_id"].isin(inventor_subset["mention_id"])
persistent_inventor_subset = persistent_inventor.loc[is_exported_mention]

In [None]:
# Write the persistent_inventor rows selected above to Parquet, without the index.
persistent_inventor_subset.to_parquet(
    "pv-predictions.parquet",
    engine="pyarrow",
    index=False,
)