In [2]:
import pandas as pd

from pv_evaluation.benchmark import inspect_clusters_to_merge
from pv_evaluation.benchmark import inspect_clusters_to_split
from pv_evaluation.metrics import cluster_precision_recall

In [3]:
# Load the reference clustering: a headerless, tab-separated file with one
# (mention, cluster) pair per row.
ref_raw = pd.read_csv(
    'data/eval_als.txt',
    sep='\t',
    header=None,
    names=['mention', 'cluster'],
    dtype={'mention': 'string', 'cluster': 'int16'},
)

# Prefix each mention with "US" in a single vectorised operation rather than a
# row-wise apply, which builds one Series per row. The cast to object keeps the
# resulting index a plain object index.
us_mention = ("US" + ref_raw['mention']).astype(object)

#convert to series
# Built without in-place mutation so re-running the cell always starts from the
# freshly loaded frame; `ref` ends up indexed by mention with a single
# `cluster` column.
ref = ref_raw.assign(mention=us_mention).set_index('mention')
ref_series = ref['cluster']
ref_series

mention
US5294443-0       0
US6207855-1       1
US5767288-2       2
US4996481-1       3
US6727070-0       4
               ... 
US5150706-0     493
US7556935-2    1516
US8405044-1    2534
US7088104-0    1517
US6068972-0     528
Name: cluster, Length: 41347, dtype: int16

In [4]:
#loading result and id datasets for merge
pred_result = pd.read_csv('data/autosequence_cleaned.csv', index_col = 0)
pred_result['sequence'] = pd.to_numeric(pred_result['sequence'], errors='coerce').astype(pd.Int16Dtype())
pred_id = pd.read_csv('data/patents_2005_012.tsv', sep='\t', usecols=['id11', 'patent', 'fname', 'mname', 'lname', 'suffix'])
pred_id.drop_duplicates(inplace=True)

#join
pred = pd.merge(pred_result, pred_id, on=['patent', 'fname', 'mname', 'lname', 'suffix'])

#convert to series
pred.set_index('mention', inplace=True)
pred_series = pred['id11']
pred_series.info()

<class 'pandas.core.series.Series'>
Index: 142400 entries, US6205043-1 to US6830895-1
Series name: id11
Non-Null Count   Dtype
--------------   -----
142400 non-null  int64
dtypes: int64(1)
memory usage: 2.2+ MB


In [14]:
# Spot-check one mention: show the full predicted record (name fields,
# sequence and id11) for US6495023-0.
pred.loc['US6495023-0', :]

patent      6495023
fname       Gregory
mname            J.
lname        Zeikus
suffix            &
sequence          0
id11          60228
Name: US6495023-0, dtype: object

In [15]:
# Spot-check a second mention with the same last name, to compare its name
# fields and id11 with the record for US6495023-0.
pred.loc['US4352885-0', :]

patent      4352885
fname        Joseph
mname       Gregory
lname        Zeikus
suffix            &
sequence          0
id11          60231
Name: US4352885-0, dtype: object

In [5]:
# Count repeated mention labels in each index; both counts should be zero
# before the two series are aligned on mention.
pred_index_dupes = pred_series.index.duplicated().sum()
ref_index_dupes = ref_series.index.duplicated().sum()

print("Predicted index duplicates:", pred_index_dupes)
print("Reference index duplicates:", ref_index_dupes)

Predicted index duplicates: 0
Reference index duplicates: 0


In [11]:
# Build the inner join here, before it is measured: this cell previously
# relied on `join` having been created by a cell further down, which fails
# with a NameError when the notebook is run top to bottom on a fresh kernel.
join = pd.concat({"pred": pred_series, "ref": ref_series}, axis=1, join="inner")

# Compare how many mentions each source has with how many they share.
print("Pred length:", len(pred_series))
print("Ref length:", len(ref_series))
print("Join length:", len(join))

Pred length: 142400
Ref length: 41347
Join length: 22650


In [5]:
# Align predicted and reference labels on the mentions present in both series;
# the resulting frame has one column per source, named "pred" and "ref".
join = pd.concat(
    [pred_series, ref_series],
    keys=["pred", "ref"],
    axis=1,
    join="inner",
)

# Score the predicted clustering against the reference on the shared mentions.
# The printed tuple is presumably (precision, recall) - confirm against the
# pv_evaluation documentation.
print(cluster_precision_recall(join["pred"], join["ref"]))

(0.9997167941093175, 0.9963057686842853)


In [6]:
# Number of mentions shared by the prediction and the reference.
join.shape[0]

22650

In [13]:
# `inspect_clusters_to_merge` is imported in the import cell at the top of the
# notebook, together with the other pv_evaluation helpers.
# Judging by the displayed table, this lists mentions that share a reference
# cluster but carry different predicted IDs - confirm against the
# pv_evaluation documentation.
inspect_clusters_to_merge(join["pred"], join["ref"])

Unnamed: 0_level_0,reference,prediction
mention,Unnamed: 1_level_1,Unnamed: 2_level_1
US5656497-0,25,60231
US4737459-0,25,60231
US5980890-1,25,60231
US5908924-1,25,60231
US6495023-0,25,60228
...,...,...
US5459317-0,3061,49630
US6534062-2,3097,44221
US6686462-4,3097,44222
US6111071-0,3262,17617
