# Patent Paper Pair Matching

## Imports and important funcs

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import time
import json
import pyalex
import numpy as np
import pandas as pd
import requests_cache
import dask.dataframe as dd

from functools import reduce
from operator import ior
from tqdm import tqdm
from typing import List, Tuple, Dict, Any
from json import JSONDecodeError
from collections import defaultdict
from datetime import datetime, timedelta
from pyalex import Institutions

# Root folder for every input and output of this notebook. Paths are built by
# plain string concatenation (e.g. data_folder + "start/openalex.duckdb"), so
# the value has to end with a path separator.
data_folder = '/Folder/To/Your/Data/'

In [None]:
def batchify(iterable, n=50):
    """
    Split a sized, sliceable sequence into consecutive chunks.

    Yields slices of `iterable` in their original order. Every slice holds `n`
    items except possibly the last one, which carries the remainder. Nothing
    is yielded for an empty input.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        if stop > total:
            # Last chunk: do not reach past the end of the sequence.
            stop = total
        yield iterable[start:stop]


def convert_date(date_string, format="%Y-%m-%d"):
    """
    Parse `date_string` with the `strptime` pattern `format` and return the date.

    The default pattern is YYYY-MM-DD. The result is a `datetime.date`; any
    time-of-day component produced by the pattern is discarded.
    """
    parsed = datetime.strptime(date_string, format)
    return parsed.date()


In [6]:
from src.openalex_db import OpenAlexLocalDB
openalex_db = OpenAlexLocalDB(data_folder + "start/openalex.duckdb")

In [7]:
# Cache HTTP responses in a local SQLite file with no expiry; handy for long
# fetching loops when the same query has to be run again.
openalex_cache_path = data_folder + "cache/openalex/openalex.sqlite"
requests_cache.install_cache(openalex_cache_path, backend="sqlite", expire_after=None)

In [8]:
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 200

# Patents

In [10]:
# Full gov list of patents
patents = dd.read_parquet(data_folder + "start/dask/patents.parquet/")

In [None]:
# Drop rows containing any missing value, order by filing date, then keep a
# single row per (title, assignee) pair. Sorting first is meant to make
# keep='first' retain the earliest filing.
patents = patents.dropna()
patents = patents.sort_values("patent_filing_date")
patents = patents.drop_duplicates(subset=['patent_title', 'patent_assignee'], keep='first')

In [83]:
# Keep only patents with assignee country JP
patents = patents[patents["patent_assignee_country"] == "JP"]

In [84]:
# The year is the first '-'-separated field of the filing date string.
filing_date_parts = patents['patent_filing_date'].str.split('-')
patents['patent_filing_year'] = filing_date_parts.str[0].astype(int)

In [None]:
# Restrict to patents filed within [min_year, max_year], both ends inclusive.
# `patents` is still a lazy dask DataFrame at this point, so `patents.shape[0]`
# is an unevaluated placeholder rather than a number. `len()` forces the row
# count to be computed (at the price of one pass over the data per call).
print(f"Num patents before date filtering: {len(patents)}")
min_year = 2004
max_year = 2018
filing_year = patents['patent_filing_year']
patents = patents[(filing_year >= min_year) & (filing_year <= max_year)]
f"Num patents after date filtering: {len(patents)}"

### Filter by assignee

In [87]:
# Our list of 35 national research institutions to consider
research_institutions_list = None # Provide your own list of research institutions here
# Remove RIKEN (will be handled manually)
research_institutions_list = [name for name in research_institutions_list if name != "RIKEN"]
research_institutions = {name: name.lower() for name in research_institutions_list}

In [88]:
universities_exclude_list = [
    "Polytechnic University",
    "Tama University",
    "Ohu University",
]

In [89]:
assignees_to_exclude_list = [
    "Toyota Technical Institute At Chicago",
]

In [None]:
universities_df = None # Provide your own list of universities here
print(f"Number of universities in file: {len(universities_df)}")
# Drop the universities that are explicitly excluded by English name
is_excluded = universities_df['name_en'].isin(universities_exclude_list)
universities_df = universities_df[~is_excluded]
print(f"Number of universities after excluding: {len(universities_df)}")
# English name -> lowercase English name (used for substring matching)
universities = {name_en: name_en.lower() for name_en in universities_df['name_en']}
# English name -> Japanese name
universities_jp_name_mapping = dict(zip(universities_df['name_en'], universities_df['name_jp']))

In [91]:
entity_names = research_institutions | universities

In [None]:
import re

# Keep patents whose assignee mentions one of our entities (case-insensitive
# substring match). Every name is regex-escaped so that characters such as
# '(' or '.' are matched literally, like the regex=False matching that later
# assigns `entity_name`.
assignee_lower = patents["patent_assignee"].str.lower()
entity_pattern = '|'.join(re.escape(name) for name in entity_names.values())
entity_mask = assignee_lower.str.contains(entity_pattern, na=False, regex=True)
# use exact match for RIKEN
riken_mask = assignee_lower == "riken"
patents = patents[entity_mask | riken_mask].compute()

# exclude the assignees in exclusion list
# Both sides are lowercased: the assignee column is compared in lowercase, so
# the mixed-case names of the exclusion list would otherwise never match.
assignees_to_exclude_lower = [name.lower() for name in assignees_to_exclude_list]
patents = patents[~patents["patent_assignee"].str.lower().isin(assignees_to_exclude_lower)]

f"Num patents after assignee filtering: {patents.shape[0]}"

In [93]:
# Entity name -> "institution" / "university". A name present in both lists
# ends up as "university", and RIKEN is always an institution.
entity_type = {
    **dict.fromkeys(research_institutions, "institution"),
    **dict.fromkeys(universities, "university"),
    "RIKEN": "institution",
}

In [94]:
# add jp entity name
# The lookup key is the raw `patent_assignee` string, while the mapping is keyed
# by the universities' English names (`name_en`). Only assignees spelled exactly
# like a `name_en` entry get a value; every other row becomes NaN.
# NOTE(review): this presumably should map the normalised `entity_name` column
# (created in a later cell) instead of the raw assignee - confirm.
patents["entity_name_jp"] = patents["patent_assignee"].map(universities_jp_name_mapping)

In [None]:
# Label each patent with the entity whose lowercase name occurs in its
# lowercased assignee string; patents matching nothing stay "Unknown".
patents["entity_name"] = "Unknown"

assignee_lower = patents["patent_assignee"].str.lower()
for entity, entity_lower in entity_names.items():
    entity_mask = assignee_lower.str.contains(entity_lower, na=False, regex=False)
    # An all-False mask simply assigns nothing, so no emptiness check is needed.
    # When several entities match, the one iterated last wins.
    patents.loc[entity_mask, "entity_name"] = entity

# RIKEN is matched on the exact assignee string only
patents.loc[patents['patent_assignee'] == "RIKEN", "entity_name"] = "RIKEN"

patents["entity_name"].value_counts(dropna=False)

In [None]:
# add entity type
patents["entity_type"] = patents["entity_name"].map(entity_type)
patents.loc[patents['entity_name'] == "RIKEN", "entity_type"] = "institution"
patents["entity_type"].value_counts(dropna=False)

In [98]:
def _count_patents_by_assignee(frame):
    """Count rows per (entity_name, patent_assignee), largest counts first within each entity."""
    counts = frame.groupby(["entity_name", "patent_assignee"]).size().reset_index(name='counts')
    return counts.sort_values(["entity_name", "counts"], ascending=[True, False])


# CSV with the number of patents per entity_name + assignee pair
patent_counts = _count_patents_by_assignee(patents)
patent_counts.to_csv(data_folder + "result/patent_counts_by_entity_assignee.csv", index=False)

# Same table restricted to universities, and restricted to institutions
patent_counts_university = _count_patents_by_assignee(patents[patents["entity_type"] == "university"])
patent_counts_university.to_csv(data_folder + "result/patent_counts_by_university_assignee.csv", index=False)
patent_counts_institution = _count_patents_by_assignee(patents[patents["entity_type"] == "institution"])
patent_counts_institution.to_csv(data_folder + "result/patent_counts_by_institution_assignee.csv", index=False)

In [99]:
# Patents per entity, both as a table and as a plain name -> count lookup
entity_counts = patents.groupby("entity_name").size().reset_index(name="num_patents")
entity_counts_mapping = dict(zip(entity_counts["entity_name"], entity_counts["num_patents"]))

In [100]:
# Patent count for every known entity; entities without any patent get 0
entities_df = pd.DataFrame({"entity_name": [*entity_names.keys(), "RIKEN"]})
entities_df['entity_type'] = entities_df['entity_name'].map(entity_type)
num_patents = entities_df['entity_name'].map(entity_counts_mapping)
entities_df['num_patents'] = num_patents.fillna(0).astype(int)
# Most patents first, ties broken alphabetically
entities_df = entities_df.sort_values(["num_patents", "entity_name"], ascending=[False, True])
entities_df.to_csv(data_folder + "result/entity_patent_counts.csv", index=False)

# The same table split into a universities-only and an institutions-only file
for kind, file_name in (
    ("university", "result/university_patent_counts.csv"),
    ("institution", "result/institution_patent_counts.csv"),
):
    entities_df[entities_df["entity_type"] == kind].to_csv(data_folder + file_name, index=False)


In [103]:
# patents checkpoint
patents.to_parquet(data_folder + "/start/parquet/patents_cleaned_assignee_filtered.parquet")

In [None]:
# patents checkpoint load
patents = pd.read_parquet(data_folder + "/start/parquet/patents_cleaned_assignee_filtered.parquet")
f"Num patents loaded: {patents.shape[0]}"

In [None]:
# patents of each assignee type
patents["patent_assignee"].value_counts()

In [None]:
patents['entity_type']

In [None]:
# Show counts of unique assignees and entity names by entity type
print("Number of unique assignees by entity type:")
print(patents.groupby("entity_type")["patent_assignee"].nunique())
print("\nNumber of unique entity names by entity type:")
print(patents.groupby("entity_type")["entity_name"].nunique())

In [None]:
patents[['entity_name', 'patent_assignee', 'patent_id']].head(10)

In [None]:
patents.shape

In [None]:
# Search OpenAlex for each entity name (lowercased) and record one row per
# institution returned by the search call.
institutions_to_search = patents["entity_name"].unique().tolist()
data = [
    {
        'entity_name': entity_name,
        'institution_display_name': match['display_name'],
        'institution_id': match['id'],
    }
    for entity_name in tqdm(institutions_to_search)
    for match in Institutions().search(entity_name.lower()).get()
]

df = pd.DataFrame(data).sort_values('entity_name')

df.to_csv(data_folder + "start/csv/entities_institution_ids.csv", index=False)

In [37]:
# Load the entity -> OpenAlex institution id table. The file is opened with an
# explicit UTF-8 encoding instead of the platform default, so non-ASCII
# institution names are decoded the same way on every machine.
with open(data_folder + "start/csv/entities_institution_ids.csv", "r", encoding="utf-8") as f:
    entities_institution_ids = pd.read_csv(f)

### Patent Inventors

In [22]:
inventors = dd.read_parquet(data_folder + "start/dask/inventors.parquet")

In [None]:
# Keep only the inventors of patents that survived the filtering above. The
# patent ids are de-duplicated first, so a patent that occupies several rows of
# `patents` cannot multiply its inventor rows in the join.
our_patent_ids = patents[['patent_id']].drop_duplicates()
inventors = inventors.merge(our_patent_ids, on='patent_id', how='inner').compute()
f"Num inventors associated with our patents: {inventors.shape[0]}"

In [25]:
inventors["inventor_full_name"] = inventors["inventor_first_name"] + " " + inventors["inventor_last_name"]

### Patent References

In [26]:
# Download your own version of marx patent data from the original source
marx_references = dd.read_parquet(data_folder + "start/marx_patent_references.parquet")

In [27]:
patents["patent_id_us"] = "US-" + patents.patent_id.astype(str)

In [28]:
patent_references = marx_references.merge(patents, how="inner", left_index=True, right_on="patent_id_us")

In [29]:
patent_references = patent_references[['patent_id', 'doi', 'patent_id_us']].drop_duplicates()

# Works (papers)
We will fetch works (papers) from OpenAlex based on candidacy. All papers by authors with a matching name will be considered as potential matches.

## Author candidates

In [30]:
def handle_inventor_name(inventor_name: Any) -> Any:
    """
    Cut an inventor name at its first comma.

    Strings containing a comma are reduced to the text before the first comma
    (surrounding whitespace is left as it is). Every other value, including
    non-string inputs such as missing values, is returned unchanged.
    """
    if isinstance(inventor_name, str) and "," in inventor_name:
        return inventor_name.split(",")[0]
    return inventor_name

inventor_list = [handle_inventor_name(inventor_name) for inventor_name in inventors["inventor_full_name"].unique().tolist()]

In [None]:
# filter out non-string and empty
inventor_list = [inventor for inventor in inventor_list if type(inventor) == str and inventor]
f"Number of unique inventor names: {len(inventor_list)}"

In [None]:
candidates = openalex_db.get_candidates(inventor_list)
candidates.shape

In [None]:
# Keep only candidates affiliated with at least one institution of our entity list
valid_institution_ids = set(entities_institution_ids["institution_id"].to_list())


def does_candidate_have_institution(row: pd.Series) -> bool:
    """
    Tell whether a candidate row lists at least one valid institution id.

    `row["institution_ids"]` is decoded with `json.loads` and compared against
    the module-level `valid_institution_ids` set.
    """
    candidate_institution_ids = json.loads(row["institution_ids"])
    return not valid_institution_ids.isdisjoint(candidate_institution_ids)


candidate_has_institution = candidates.apply(does_candidate_have_institution, axis=1)
candidates_filtered = candidates[candidate_has_institution]
candidates_filtered.shape

In [39]:
candidates = candidates_filtered

## Work candidates

Based on the filtered author candidate list we now pull candidates' works from openalex.

In [None]:
author_list = candidates["author_id"].unique().tolist()
len(author_list)

In [41]:
author_list = sorted(author_list)

In [None]:
works, authorships = openalex_db.get_works_by_author_ids(
    author_list,
    limit_author_position=["first", "last"],
    work_min_date="2000-01-01",
    work_max_date="2022-12-31",
)
f"Got {len(works)} works and {len(authorships)} authorships"

In [44]:
works = works.rename(columns={
    "doi": "work_doi",
    "title": "work_title",
    "abstract": "work_abstract",
    "publication_date": "work_publication_date",
    "referenced_works": "work_referenced_works",
    "related_works": "work_related_works",
})

In [None]:
works.shape, authorships.shape

In [46]:
# Decode the JSON-encoded list of related work ids. Values that already are
# lists are kept as they are, so re-running this cell is harmless; anything
# else (e.g. a missing value) becomes an empty list instead of raising.
works['work_related_works'] = works['work_related_works'].apply(
    lambda x: json.loads(x) if isinstance(x, str) else (x if isinstance(x, list) else [])
)

In [47]:
# Decode the JSON-encoded list of referenced work ids. Values that already are
# lists are kept as they are, so re-running this cell is harmless; anything
# else (e.g. a missing value) becomes an empty list instead of raising.
works['work_referenced_works'] = works['work_referenced_works'].apply(
    lambda x: json.loads(x) if isinstance(x, str) else (x if isinstance(x, list) else [])
)

In [None]:
works = works.dropna(subset=["work_title"]).drop_duplicates(subset=["work_id"])
works.shape

In [None]:
# Decode the JSON-encoded institution lists. Already-decoded lists are kept, so
# running this cell a second time does not wipe them; any other non-string
# value (e.g. a missing value) becomes an empty list.
authorships['institutions'] = authorships['institutions'].map(
    lambda x: json.loads(x) if isinstance(x, str) else (x if isinstance(x, list) else [])
)
authorships

In [None]:
# Keep the authorships whose institution list contains at least one id from
# our entity list; the works filter that follows is based on these rows.
def _has_valid_institution(institution_ids):
    """True when any id in `institution_ids` belongs to `valid_institution_ids`."""
    return any(institution_id in valid_institution_ids for institution_id in institution_ids)


has_valid_institution = authorships['institutions'].map(_has_valid_institution)
authorships_with_valid_institution_ids = authorships[has_valid_institution]
authorships_with_valid_institution_ids

In [None]:
works = works[works['work_id'].isin(authorships_with_valid_institution_ids['work_id'].unique())]
works.shape

In [None]:
# how many of these works have the word review in the title
# Matched case-insensitively, like the filter that removes review papers from
# the final results, so this count also covers titles such as "A Review of ...".
works[works['work_title'].str.contains("review", case=False, na=False)].shape[0]

In [None]:
authorships = authorships[authorships['work_id'].isin(works['work_id'].unique())]
authorships.shape

In [51]:
# We use a mapping of related works for reproducibility and speed purposes when re-running the code
# Use the existing mapping when available; on a first run, when the registry
# file does not exist yet, start from an empty mapping so later cells still
# find `related_works_mapping` defined.
related_works_mapping_path = data_folder + "registry/related_works_mapping.json"
try:
    with open(related_works_mapping_path, "r", encoding="utf-8") as f:
        related_works_mapping = json.load(f)
except FileNotFoundError:
    related_works_mapping = {}

In [None]:
# One row per (work, related work) pair, built only from works that list at
# least one related work.
has_related_works = works['work_related_works'].apply(lambda related: len(related) > 0)
works_related = works[['work_id', 'work_related_works']][has_related_works]
works_related = (
    works_related
    .explode(column='work_related_works')
    .rename(columns={'work_related_works': 'work_related_work_id'})
    .drop_duplicates()
)
f"Num related works: {len(works_related)}"

In [None]:
# show a graphical distribution of the number of related works per work_id, using pandas plot
works_related.groupby("work_id").size().plot(kind="hist", bins=100)

In [54]:
# Take one random related work for each work in works_related
# Set the seed for reproducibility
# Every group is sampled on its own with random_state=42. The result is a
# plain dict: work_id -> the sampled work_related_work_id.
# NOTE(review): `.set_index("work_id")` relies on `groupby(...).apply` keeping
# `work_id` as a regular column of the sampled rows; newer pandas versions
# change how grouping columns are passed to `apply` - verify on upgrade.
works_related_random_sample = works_related.groupby("work_id").apply(lambda x: x.sample(1, random_state=42)).set_index("work_id").to_dict()["work_related_work_id"]

In [None]:
# Pick the related (negative) work of every work: a choice already stored in
# the registry mapping wins, otherwise this run's random sample is used. Works
# found in neither get None.
# `.copy()` makes works_related an independent frame, so adding a column to it
# does not trigger pandas' SettingWithCopyWarning.
works_related = works[['work_id']].copy()
works_related['work_related_work_id'] = works_related['work_id'].apply(
    lambda x: related_works_mapping[x] if x in related_works_mapping else works_related_random_sample.get(x)
)
works_related.shape

In [None]:
work_ids_to_fetch = works_related['work_related_work_id'].dropna().unique().tolist()
len(work_ids_to_fetch)

In [None]:
works_related_pulled, authorships_related = openalex_db.get_works_by_work_ids(work_ids_to_fetch)
works_related_pulled.shape, authorships_related.shape

In [61]:
works_related_pulled = works_related_pulled.rename(columns={
    "doi": "work_doi",
    "title": "work_title",
    "abstract": "work_abstract",
    "publication_date": "work_publication_date",
    "referenced_works": "work_referenced_works",
    "related_works": "work_related_works",
})

In [None]:
# Attach the pulled metadata of the chosen related work to each source work.
# The left join keeps every source work, so the number of related works that
# were actually found is counted from the right-hand key before that helper
# column is dropped (the row count alone would just be the number of works).
works_related = works_related.merge(works_related_pulled, left_on="work_related_work_id", right_on="work_id", how="left", suffixes=("", "_related"))
num_pulled_related_works = int(works_related["work_id_related"].notna().sum())
works_related = works_related.drop(columns=["work_id_related"])
print("Num pulled related works: ", num_pulled_related_works)

In [63]:
# Record this run's sampled related works in the registry mapping without
# overwriting entries that were already there, then write the mapping back.
for work_id, related_work_id in works_related_random_sample.items():
    related_works_mapping.setdefault(work_id, related_work_id)
with open(data_folder + "registry/related_works_mapping.json", "w") as f:
    json.dump(related_works_mapping, f)

# Matching

In this phase we will first generate valid patent-work pairs based on time range. Then, we fetch DOIs for all work references belonging to valid match pairs. Finally, we can compute the scores for each pair.

## Get patent - work pairs
Match patents to publications (works), and also find an appropriate negative match by looking at related works.

In [None]:
# Candidate pairs come from a chain of inner joins:
# Patent -> Inventor -> Candidate -> Authorship -> Work
patent_inventors = patents.merge(right=inventors, on="patent_id", how="inner")
inventor_candidates = patent_inventors.merge(right=candidates, on="inventor_full_name", how="inner")
candidate_authorships = inventor_candidates.merge(
    right=authorships[['author_id', 'work_id', 'author_position']],
    on="author_id",
    how="inner",
)
# A patent can reach the same work through several inventors; keep one row per pair
potential_matches = (
    candidate_authorships
    .merge(right=works, on="work_id", how="inner")
    .sort_values(["patent_id", "work_id"])
    .drop_duplicates(subset=["patent_id", "work_id"], keep="first")
)
potential_matches.shape

In [69]:
def _days_from_filing_to_publication(row):
    """Days from the patent filing date to the work's publication date (negative when published earlier)."""
    publication_day = row["work_publication_date"].to_pydatetime().date()
    filing_day = convert_date(row["patent_filing_date"])
    return (publication_day - filing_day).days


potential_matches["relative_filing_date"] = potential_matches.apply(
    _days_from_filing_to_publication,
    axis=1,
)

In [70]:
# Flag pairs whose work was published strictly less than 365 days before and
# strictly less than 2 * 365 days after the patent filing date.
days_after_filing = potential_matches["relative_filing_date"]
not_too_late = days_after_filing < 365 * 2
not_too_early = days_after_filing > -365
potential_matches["is_valid_range"] = not_too_late & not_too_early

In [None]:
# How many are valid?
potential_matches["is_valid_range"].value_counts()

In [None]:
potential_matches = potential_matches[potential_matches["is_valid_range"]]
f"Num valid potential matches: {len(potential_matches)}"

## Pull work reference dois from Openalex

In [None]:
potential_matches[["work_id", "work_referenced_works"]].sample(10)

In [None]:
# Keep only the works_related rows whose work_id still occurs in potential_matches
print(f"Num potential matches: {len(potential_matches)}")
print(f"Num works related: {len(works_related)}")
matched_work_ids = potential_matches["work_id"].unique()
works_related = works_related[works_related["work_id"].isin(matched_work_ids)]
print(f"Num works related after narrowing down: {len(works_related)}")

In [None]:
# Unique ids of all works referenced by the matched works ...
work_reference_ids = potential_matches["work_referenced_works"].explode().unique().tolist()
# ... and by their related works, whose reference lists are still JSON strings
# here (non-string values count as "no references").
related_referenced_works = works_related["work_referenced_works"].map(
    lambda x: json.loads(x) if type(x) == str else []
)
related_work_reference_ids = related_referenced_works.explode().unique().tolist()
len(work_reference_ids), len(related_work_reference_ids)

In [None]:
# Drop anything that is not a non-empty string containing a '/'
# (e.g. the NaN that explode() yields for empty reference lists).
work_reference_ids = [
    x for x in work_reference_ids if x and type(x) == str and '/' in x
]
related_work_reference_ids = [
    x for x in related_work_reference_ids if x and type(x) == str and '/' in x
]
len(work_reference_ids), len(related_work_reference_ids)

In [None]:
works_references = openalex_db.get_work_dois(work_reference_ids)
related_works_references = openalex_db.get_work_dois(related_work_reference_ids)

In [86]:
# Reduce DOI URLs to the bare DOI by removing a leading "https://doi.org/"
# (16 characters). Anchoring on the literal prefix instead of cutting a fixed
# number of characters leaves values without that prefix untouched.
doi_url_prefix = r"^https://doi\.org/"
works_references["doi"] = works_references["doi"].str.replace(doi_url_prefix, "", regex=True)
related_works_references["doi"] = related_works_references["doi"].str.replace(doi_url_prefix, "", regex=True)

In [None]:
# work_id -> DOI lookup covering the references of matched works; entries from
# the related works' references are merged in and win on conflicts.
work_id_to_doi = dict(zip(works_references["work_id"], works_references["doi"]))
work_id_to_doi.update(
    zip(related_works_references["work_id"], related_works_references["doi"])
)
print(len(work_id_to_doi))

In [None]:
# Attach each work's related (negative) work; columns that clash with existing
# ones receive the "_negative" suffix. One row per (patent, work) pair is kept.
print(potential_matches.shape)
with_negative = potential_matches.merge(
    right=works_related,
    on="work_id",
    how="left",
    suffixes=("", "_negative"),
)
potential_matches = with_negative.drop_duplicates(subset=["patent_id", "work_id"])
potential_matches.shape

In [94]:
def _dois_of_referenced_works(referenced_work_ids):
    """Known, non-empty DOIs of the given work ids; [] when the input is not a list."""
    if type(referenced_work_ids) != list:
        return []
    return [doi for doi in map(work_id_to_doi.get, referenced_work_ids) if doi]


potential_matches["referenced_works_dois"] = potential_matches[
    "work_referenced_works"
].map(_dois_of_referenced_works)

In [96]:
def _dois_of_negative_referenced_works(referenced_works_json):
    """Known, non-empty DOIs of a JSON-encoded work id list; [] when the input is not a string."""
    if type(referenced_works_json) != str:
        return []
    referenced_work_ids = json.loads(referenced_works_json)
    return [doi for doi in map(work_id_to_doi.get, referenced_work_ids) if doi]


potential_matches["work_referenced_works_dois_negative"] = potential_matches[
    "work_referenced_works_negative"
].map(_dois_of_negative_referenced_works)

In [None]:
# Collect the DOIs cited by each patent into one list per patent_id (computed
# from the dask frame), then attach it as "patent_references_dois".
dois_per_patent = (
    patent_references.groupby("patent_id")["doi"]
    .apply(list)
    .to_frame()
    .reset_index()
    .compute()
)
potential_matches = potential_matches.merge(right=dois_per_patent, on="patent_id", how="left")
potential_matches = potential_matches.rename(columns={"doi": "patent_references_dois"})

In [None]:
authorships.rename(columns={"full_name": "author_full_name"}, inplace=True)
authorships_related.rename(columns={"full_name": "author_full_name"}, inplace=True)

In [102]:
def _names_per_key(frame, key, name_column, list_column):
    """Group `frame` by `key` and gather `name_column` into a list column named `list_column`."""
    grouped = frame[[key, name_column]].groupby(key).agg(list)
    return grouped.rename(columns={name_column: list_column})


# Add author list
author_lists = _names_per_key(authorships, "work_id", "author_full_name", "author_list")
potential_matches = potential_matches.merge(right=author_lists, on="work_id", how="left")

# Add negative author list (looked up through the related work's id)
negative_author_lists = _names_per_key(
    authorships_related, "work_id", "author_full_name", "author_list_negative"
)
potential_matches = potential_matches.merge(
    right=negative_author_lists,
    left_on="work_related_work_id",
    right_index=True,
    how="left",
)

# Add inventor list
inventor_lists = _names_per_key(inventors, "patent_id", "inventor_full_name", "inventor_list")
potential_matches = potential_matches.merge(right=inventor_lists, on="patent_id", how="left")


## Compute scores


In [120]:
from src.scores import inventor_overlap_score, doi_overlap_score, semantic_similarity_score_spacy, semantic_similarity_score_word_overlap

In [None]:
potential_matches.columns

In [106]:
# Inventor/author overlap for the matched work and for its negative counterpart
for score_column, author_column in (
    ("inventor_score", "author_list"),
    ("inventor_score_negative", "author_list_negative"),
):
    potential_matches[score_column] = potential_matches.apply(
        lambda row, author_column=author_column: inventor_overlap_score(
            row["inventor_list"], row[author_column]
        ),
        axis=1,
    )

In [None]:
# Here we remove those matches where the inventor score is 0
# This situation probably means that openalex fetched some works where the author name doesn't exactly match
# `.copy()` detaches the filtered frame from the original, so the score columns
# assigned in later cells are written to a real frame rather than to a slice
# (which would raise pandas' SettingWithCopyWarning).
potential_matches = potential_matches[potential_matches["inventor_score"] > 0].copy()
potential_matches.shape

In [111]:
# DOI overlap between the patent's references and the references of the
# matched work, then of the negative work.
for score_column, work_dois_column in (
    ("doi_overlap_score", "referenced_works_dois"),
    ("doi_overlap_score_negative", "work_referenced_works_dois_negative"),
):
    potential_matches.loc[:, score_column] = potential_matches.apply(
        lambda row, work_dois_column=work_dois_column: doi_overlap_score(
            row["patent_references_dois"], row[work_dois_column]
        ),
        axis=1,
    )

In [113]:
# Replace missing titles/abstracts with empty strings before scoring text similarity
text_columns = [
    "patent_title",
    "patent_abstract",
    "work_title",
    "work_abstract",
    "work_title_negative",
    "work_abstract_negative",
]
potential_matches.loc[:, text_columns] = potential_matches[text_columns].fillna("")

In [None]:
# spaCy similarity between patent text and work text, for the matched work and
# for its negative counterpart: (progress message, target column, patent column, work column)
spacy_score_specs = [
    ("Calculating title semantic scores", "title_semantic_score", "patent_title", "work_title"),
    ("Calculating title semantic scores negative", "title_semantic_score_negative", "patent_title", "work_title_negative"),
    ("Calculating abstract semantic scores", "abstract_semantic_score", "patent_abstract", "work_abstract"),
    ("Calculating abstract semantic scores negative", "abstract_semantic_score_negative", "patent_abstract", "work_abstract_negative"),
]
for message, score_column, patent_column, work_column in spacy_score_specs:
    print(message)
    potential_matches.loc[:, score_column] = potential_matches.apply(
        lambda row, patent_column=patent_column, work_column=work_column: semantic_similarity_score_spacy(
            row[patent_column], row[work_column]
        ),
        axis=1,
    )

# Mean of the title and abstract scores: (progress message, target column, title score, abstract score)
spacy_mean_specs = [
    ("Calculating semantic score titleabstract mean", "semantic_score_titleabstract_mean", "title_semantic_score", "abstract_semantic_score"),
    ("Calculating semantic score titleabstract mean negative", "semantic_score_titleabstract_mean_negative", "title_semantic_score_negative", "abstract_semantic_score_negative"),
]
for message, mean_column, title_column, abstract_column in spacy_mean_specs:
    print(message)
    potential_matches.loc[:, mean_column] = potential_matches.apply(
        lambda row, title_column=title_column, abstract_column=abstract_column: (
            row[title_column] + row[abstract_column]
        ) / 2,
        axis=1,
    )


In [None]:
# SBERT scores
# Fills the six "*_sbert*" columns: title and abstract similarity for the
# matched work and for its negative counterpart, plus their title/abstract means.
# NOTE(review): every call below uses `semantic_similarity_score_spacy`, so the
# "*_sbert*" columns currently hold the same values as the spaCy columns.
# Presumably an SBERT-based scorer was meant to be called here; none is
# imported in this notebook - confirm which function should be used.
print(f"Calculating title semantic scores sbert")
potential_matches.loc[:,"title_semantic_score_sbert"] = potential_matches.apply(
    lambda row: semantic_similarity_score_spacy(row["patent_title"], row["work_title"]),
    axis=1
)

print(f"Calculating title semantic scores sbert negative")
potential_matches.loc[:,"title_semantic_score_sbert_negative"] = potential_matches.apply(
    lambda row: semantic_similarity_score_spacy(row["patent_title"], row["work_title_negative"]),
    axis=1,
)

print(f"Calculating abstract semantic scores sbert")
potential_matches.loc[:,"abstract_semantic_score_sbert"] = potential_matches.apply(
    lambda row: semantic_similarity_score_spacy(row["patent_abstract"], row["work_abstract"]),
    axis=1,
)

print(f"Calculating abstract semantic scores sbert negative")
potential_matches.loc[:,"abstract_semantic_score_sbert_negative"] = potential_matches.apply(
    lambda row: semantic_similarity_score_spacy(row["patent_abstract"], row["work_abstract_negative"]),
    axis=1,
)

# Plain average of the title and abstract scores computed above
print(f"Calculating semantic score titleabstract sbert mean")
potential_matches.loc[:,"semantic_score_titleabstract_sbert_mean"] = potential_matches.apply(
    lambda row: (row['title_semantic_score_sbert'] + row['abstract_semantic_score_sbert']) / 2,
    axis=1,
)

print(f"Calculating semantic score titleabstract sbert mean negative")
potential_matches.loc[:,"semantic_score_titleabstract_sbert_mean_negative"] = potential_matches.apply(
    lambda row: (row['title_semantic_score_sbert_negative'] + row['abstract_semantic_score_sbert_negative']) / 2,
    axis=1,
)

In [None]:

# Word-overlap similarity between patent text and work text, for the matched
# work and its negative counterpart: (progress message, target column, patent column, work column)
overlap_score_specs = [
    ("Calculating title overlap score", "title_overlap_score", "patent_title", "work_title"),
    ("Calculating title overlap score negative", "title_overlap_score_negative", "patent_title", "work_title_negative"),
    ("Calculating abstract overlap score", "abstract_overlap_score", "patent_abstract", "work_abstract"),
    ("Calculating abstract overlap score negative", "abstract_overlap_score_negative", "patent_abstract", "work_abstract_negative"),
]
for message, score_column, patent_column, work_column in overlap_score_specs:
    print(message)
    potential_matches[score_column] = potential_matches.apply(
        lambda row, patent_column=patent_column, work_column=work_column: semantic_similarity_score_word_overlap(
            row[patent_column], row[work_column]
        ),
        axis=1,
    )

# Mean of the title and abstract scores: (progress message, target column, title score, abstract score)
overlap_mean_specs = [
    ("Calculating overlap score titleabstract mean", "overlap_score_titleabstract_mean", "title_overlap_score", "abstract_overlap_score"),
    ("Calculating overlap score titleabstract mean negative", "overlap_score_titleabstract_mean_negative", "title_overlap_score_negative", "abstract_overlap_score_negative"),
]
for message, mean_column, title_column, abstract_column in overlap_mean_specs:
    print(message)
    potential_matches[mean_column] = potential_matches.apply(
        lambda row, title_column=title_column, abstract_column=abstract_column: (
            row[title_column] + row[abstract_column]
        ) / 2,
        axis=1,
    )


In [123]:
# Number of reference rows per patent, computed from the dask frame and joined on patent_id
patent_num_references = (
    patent_references.groupby("patent_id").size().astype(int).rename("patent_num_references").compute()
)
potential_matches = potential_matches.merge(
    right=patent_num_references,
    left_on="patent_id",
    right_index=True,
    how="left",
)

In [125]:
# Number of inventor rows per patent, joined on patent_id
patent_num_inventors = inventors.groupby("patent_id").size().astype(int).rename("patent_num_inventors")
potential_matches = potential_matches.merge(
    right=patent_num_inventors,
    left_on="patent_id",
    right_index=True,
    how="left",
)

In [127]:
# Number of authorship rows per work, joined on work_id
paper_num_authors = authorships.groupby("work_id").size().astype(int).rename("paper_num_authors")
potential_matches = potential_matches.merge(
    right=paper_num_authors,
    left_on="work_id",
    right_index=True,
    how="left",
)

In [131]:
# Add the relative publication date for the negative matches
def relative_filing_date_negative(row):
    """
    Days from the patent filing date to the negative work's publication date.

    `row["work_publication_date_negative"]` may be a timestamp/date object or
    a "YYYY-MM-DD" string; `row["patent_filing_date"]` is a "YYYY-MM-DD"
    string. Returns the signed number of days (negative when the work was
    published before the filing), or None when the publication date is
    missing (None, NaN/NaT or an empty string).
    """
    publication_date = row["work_publication_date_negative"]
    if isinstance(publication_date, str):
        if not publication_date:
            return None
        publication_day = datetime.strptime(publication_date, "%Y-%m-%d").date()
    elif publication_date is None or pd.isna(publication_date):
        # Rows without a negative work carry a missing value after the left join.
        return None
    else:
        # Timestamp / datetime / date objects: keep only the calendar date.
        publication_day = pd.Timestamp(publication_date).date()
    filing_day = datetime.strptime(row["patent_filing_date"], "%Y-%m-%d").date()
    return (publication_day - filing_day).days
    
potential_matches["relative_filing_date_negative"] = potential_matches.apply(
    relative_filing_date_negative,
    axis=1,
)

In [134]:
final_results = potential_matches.sort_values(by=["patent_id", "work_id"])

In [135]:
def _days_from_patent_date_to_publication(row):
    """Days from `patent_date` to the work's publication date (negative when published earlier)."""
    publication_day = row["work_publication_date"].to_pydatetime().date()
    patent_day = convert_date(row["patent_date"])
    return (publication_day - patent_day).days


final_results["relative_granted_date"] = final_results.apply(
    _days_from_patent_date_to_publication,
    axis=1,
)

In [None]:
# Drop matches whose work title contains "review" (in any letter case)
print(f'Num matches before filtering review papers: {len(final_results)}')
is_review_title = final_results['work_title'].str.contains('review', case=False)
final_results = final_results[~is_review_title]
print(f'Num matches after filtering review papers: {len(final_results)}')

In [137]:
final_results.to_parquet(data_folder + "result/final_results.parquet")

In [None]:
final_results.to_excel(data_folder + "result/final_results.xlsx", engine="xlsxwriter")