In [2]:
import pandas as pd
import numpy as np

import wget
import zipfile
import os

# Download and unpack the PatentsView raw inventor table only when there is
# no local copy yet; the downloaded archive is deleted after extraction.
if not os.path.isfile("rawinventor.tsv"):
    archive_name = "rawinventor.tsv.zip"
    wget.download("https://s3.amazonaws.com/data.patentsview.org/download/rawinventor.tsv.zip")
    with zipfile.ZipFile(archive_name, mode="r") as archive:
        archive.extractall(path=".")
    os.remove(archive_name)

rawinventor = pd.read_csv("rawinventor.tsv", sep="\t")

# A mention ID has the form "US<patent_id>-<sequence>".
rawinventor["mention_id"] = (
    "US"
    + rawinventor["patent_id"].astype(str)
    + "-"
    + rawinventor["sequence"].astype(str)
)

# Series indexed by mention ID whose values are the assigned inventor IDs.
disambiguation = rawinventor.set_index("mention_id")["inventor_id"]

In [2]:
# Tab-separated benchmark file: missing cells become "" and only the first
# 20 rows are kept.
benchmark = (
    pd.read_csv("inventor_benchmark_binette_20.csv", sep="\t")
    .fillna("")
    .iloc[:20]
)

In [3]:
def _parse_mention_list(raw):
    """Return the sorted, unique, non-empty items of a comma-separated cell.

    Items are stripped of surrounding whitespace, so "a,b", "a, b" and
    " a ,b " all yield ["a", "b"]. A missing cell (None/NaN) yields [].
    """
    if not isinstance(raw, str):
        raw = "" if pd.isna(raw) else str(raw)
    stripped = (item.strip() for item in raw.split(","))
    return sorted({item for item in stripped if item})


def lambd(x, membership=None):
    """Build the corrected cluster of mention IDs for one benchmark row.

    The cluster starts from every mention that ``membership`` assigns to
    ``x.inventor_id``, is extended with the mentions listed in ``x["add"]``
    and finally has the mentions listed in ``x["remove"]`` taken out.

    Parameters
    ----------
    x : pandas.Series
        Benchmark row providing ``inventor_id`` and the comma-separated
        ``add`` and ``remove`` cells.
    membership : pandas.Series, optional
        Series indexed by mention ID with inventor IDs as values. Defaults to
        the notebook-level ``disambiguation`` Series.

    Returns
    -------
    numpy.ndarray
        Object array of unique mention IDs. When removals are applied the
        result is sorted; otherwise the mentions from ``membership`` come
        first, followed by the added mentions in sorted order.

    Raises
    ------
    AssertionError
        If a mention listed in ``remove`` is not part of the cluster.
    """
    if membership is None:
        # Backward-compatible default: the notebook-level predicted clustering.
        membership = disambiguation

    predicted = membership[membership == x.inventor_id].index

    # ``add`` has to be read with brackets: ``x.add`` is the Series.add method.
    to_add = _parse_mention_list(x["add"])
    to_remove = _parse_mention_list(x["remove"])

    # dict.fromkeys drops mentions that are listed twice (e.g. an "add" entry
    # already present in the cluster) while keeping first-seen order.
    cluster = list(dict.fromkeys([*predicted, *to_add]))

    if to_remove:
        present = set(cluster)
        unknown = [mention for mention in to_remove if mention not in present]
        if unknown:
            # Raised explicitly rather than via `assert` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError(
                f"mentions to remove are not in the cluster of {x.inventor_id}: {unknown}"
            )
        removed = set(to_remove)
        cluster = sorted(mention for mention in cluster if mention not in removed)

    return np.array(cluster, dtype=object)

In [4]:
# One array of mention IDs per benchmark row.
true_clusters = benchmark.apply(lambd, axis=1)

# Long format: one row per mention, indexed by mention ID, with the benchmark
# inventor ID as the value.
reference = (
    pd.concat(
        {"inventor_id": benchmark["inventor_id"], "mention_id": true_clusters},
        axis=1,
    )
    .explode("mention_id")
    .set_index("mention_id")["inventor_id"]
)

In [5]:
from pv_evaluation.estimators import pairwise_precision_estimator, pairwise_precision_std

# Pairwise precision estimate of `disambiguation` against `reference`, using
# cluster-block sampling and cluster-size weights.
pairwise_precision_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.9759856470977679

In [6]:
# Same inputs as the precision estimate; presumably the standard deviation of
# that estimator (confirm against the pv_evaluation documentation).
pairwise_precision_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.020129014582105607

## Emma Results 2022-06-22

In [7]:
# Read the benchmark spreadsheet and replace missing cells with empty strings.
benchmark = pd.read_excel("2022-06-22-Emma_pantentSample.xlsx")
benchmark = benchmark.fillna("")

In [8]:
# Apply the add/remove corrections row by row to get one mention array per row.
true_clusters = benchmark.apply(lambd, axis=1)

# Flatten to a Series indexed by mention ID with the benchmark inventor ID as value.
reference = (
    pd.concat(
        {"inventor_id": benchmark["inventor_id"], "mention_id": true_clusters},
        axis=1,
    )
    .explode("mention_id")
    .set_index("mention_id")["inventor_id"]
)

In [9]:
from pv_evaluation.estimators import pairwise_precision_estimator, pairwise_precision_std

# Precision estimate for the current `reference` (cluster-block sampling,
# cluster-size weights).
pairwise_precision_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.8608282020727221

In [10]:
# `pairwise_precision_std` called with the same arguments as the precision
# estimate; presumably its standard deviation (verify in pv_evaluation).
pairwise_precision_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.055797239309423874

In [11]:
from pv_evaluation.estimators import pairwise_recall_estimator, pairwise_recall_std

# Pairwise recall estimate of `disambiguation` against `reference`, using
# cluster-block sampling and cluster-size weights.
pairwise_recall_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.9421315090119152

In [12]:
# Same inputs as the recall estimate; presumably the standard deviation of
# that estimator (confirm against the pv_evaluation documentation).
pairwise_recall_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.021200869250555428

## Aida Results 2022-06-22

In [13]:
# Read the benchmark spreadsheet; missing cells are turned into empty strings.
benchmark = pd.read_excel("2022-06-22-Aida-Patent_samples.xlsx")
benchmark = benchmark.fillna("")

In [14]:
# Corrected cluster (array of mention IDs) for every benchmark row.
true_clusters = benchmark.apply(lambd, axis=1)

# One row per mention: index is the mention ID, value is the benchmark inventor ID.
reference = (
    pd.concat(
        {"inventor_id": benchmark["inventor_id"], "mention_id": true_clusters},
        axis=1,
    )
    .explode("mention_id")
    .set_index("mention_id")["inventor_id"]
)

In [15]:
# Sanity check: show every entry whose mention ID repeats an earlier index
# value; an empty Series means the reference index is unique.
reference.loc[reference.index.duplicated()]

Series([], Name: inventor_id, dtype: object)

In [16]:
from pv_evaluation.estimators import pairwise_precision_estimator, pairwise_precision_std

# Estimate pairwise precision of `disambiguation` relative to `reference`
# (sampling_type "cluster_block", weights "cluster_size").
pairwise_precision_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.8757777024662486

In [17]:
# Companion `_std` call for the precision estimate, same arguments;
# presumably its standard deviation (verify in pv_evaluation).
pairwise_precision_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.05581127800247002

In [18]:
from pv_evaluation.estimators import pairwise_recall_estimator, pairwise_recall_std

# Recall estimate for the current `reference` (cluster-block sampling,
# cluster-size weights).
pairwise_recall_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.9465825412741339

In [19]:
# `pairwise_recall_std` called with the same arguments as the recall estimate;
# presumably its standard deviation (verify in pv_evaluation).
pairwise_recall_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.020033335501181773

## Aida Results 2022-07-25

In [19]:
# Load the benchmark spreadsheet, with missing cells replaced by empty strings.
benchmark = pd.read_excel("2022-07-25-Aida-patent-samples-part-2.xlsx")
benchmark = benchmark.fillna("")

# One corrected array of mention IDs per benchmark row.
true_clusters = benchmark.apply(lambd, axis=1)

# Flatten to a Series indexed by mention ID with the benchmark inventor ID as value.
reference = (
    pd.concat(
        {"inventor_id": benchmark["inventor_id"], "mention_id": true_clusters},
        axis=1,
    )
    .explode("mention_id")
    .set_index("mention_id")["inventor_id"]
)

In [20]:
# Display entries whose mention ID duplicates an earlier index value
# (an empty Series means there are none).
reference.loc[reference.index.duplicated()]

Series([], Name: inventor_id, dtype: object)

In [21]:
from pv_evaluation.estimators import pairwise_precision_estimator, pairwise_precision_std

# Pairwise precision of `disambiguation` measured against `reference`,
# with cluster-block sampling and cluster-size weights.
pairwise_precision_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.8730249942340461

In [22]:
# Identical arguments to the precision estimate; presumably returns that
# estimator's standard deviation (confirm in pv_evaluation).
pairwise_precision_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.03618313103980752

In [23]:
from pv_evaluation.estimators import pairwise_recall_estimator, pairwise_recall_std

# Pairwise recall of `disambiguation` measured against `reference`,
# with cluster-block sampling and cluster-size weights.
pairwise_recall_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.9574971592453838

In [24]:
# Identical arguments to the recall estimate; presumably returns that
# estimator's standard deviation (confirm in pv_evaluation).
pairwise_recall_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.010205193669155926

## Emma Results 2022-07-25

In [28]:
# Read the benchmark spreadsheet and blank out missing cells.
benchmark = pd.read_excel("2022-07-25-Emma-patent-samples-part-2.xlsx")
benchmark = benchmark.fillna("")

# Array of mention IDs for each benchmark row after the add/remove corrections.
true_clusters = benchmark.apply(lambd, axis=1)

# One row per mention: index is the mention ID, value is the benchmark inventor ID.
reference = (
    pd.concat(
        {"inventor_id": benchmark["inventor_id"], "mention_id": true_clusters},
        axis=1,
    )
    .explode("mention_id")
    .set_index("mention_id")["inventor_id"]
)

In [29]:
# List any entry whose mention ID already appeared earlier in the index;
# an empty result confirms the index is unique.
reference.loc[reference.index.duplicated()]

Series([], Name: inventor_id, dtype: object)

In [30]:
from pv_evaluation.estimators import pairwise_precision_estimator, pairwise_precision_std

# Precision estimate: `disambiguation` versus `reference`, sampled by cluster
# block and weighted by cluster size.
pairwise_precision_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.8791258159485422

In [31]:
# `_std` counterpart of the precision estimate with unchanged arguments;
# presumably its standard deviation (check pv_evaluation).
pairwise_precision_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.03391972460493254

In [32]:
from pv_evaluation.estimators import pairwise_recall_estimator, pairwise_recall_std

# Recall estimate: `disambiguation` versus `reference`, sampled by cluster
# block and weighted by cluster size.
pairwise_recall_estimator(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.9490388690316084

In [33]:
# `_std` counterpart of the recall estimate with unchanged arguments;
# presumably its standard deviation (check pv_evaluation).
pairwise_recall_std(
    disambiguation,
    reference,
    sampling_type="cluster_block",
    weights="cluster_size",
)

0.011276852957939726