In [23]:
%load_ext autoreload
%autoreload 2

import pandas as pd

import sys
sys.path.append("..")

from heritageconnector.disambiguation.helpers import load_training_data
from heritageconnector.disambiguation.pipelines import Disambiguator
from heritageconnector.disambiguation.postprocessing import filter_cased_wikidata_labels, remove_wikidata_items_with_min_claims, filter_max_wikidata_links

# Lift pandas' display limits: never truncate cell contents (max_colwidth)
# and never elide rows (max_rows) when a DataFrame is rendered.
for _display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Load data and classifier

In [27]:
# Directory of the test set to evaluate. Keep exactly ONE `test_dir` line
# uncommented: a second active assignment is silently overwritten by the one
# below it. The trailing slash is required because the export path is later
# built by plain string concatenation onto `test_dir`.
# test_dir = "/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/test_computing_space/"
# test_dir = "/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/test_photographic_aeronautics/"
# test_dir = "/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/test_art_locomotives_and_rolling_stock/"
test_dir = "/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/test_locomotives_and_rolling_stock/"

# `X` and `pairs` are passed to the classifier's prediction step below;
# `pids` is used as the feature names when printing the tree.
X, pairs, pids = load_training_data(test_dir)

In [28]:
# Location of the pickled classifier to restore.
classifier_path = "/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/clf.pkl"

# Create an 'OBJECT' disambiguator, load the saved classifier into it and
# print its tree, labelling the features with the names in `pids`.
d = Disambiguator('OBJECT')
d.load_classifier_from_disk(classifier_path)
d.print_tree(feature_names=pids)

|--- label <= 0.97
|   |--- label <= 0.80
|   |   |--- class: False
|   |--- label >  0.80
|   |   |--- label <= 0.81
|   |   |   |--- class: True
|   |   |--- label >  0.81
|   |   |   |--- label <= 0.81
|   |   |   |   |--- class: False
|   |   |   |--- label >  0.81
|   |   |   |   |--- class: False
|--- label >  0.97
|   |--- P31 <= 0.13
|   |   |--- P31 <= 0.00
|   |   |   |--- class: True
|   |   |--- P31 >  0.00
|   |   |   |--- class: True
|   |--- P31 >  0.13
|   |   |--- P31 <= 0.75
|   |   |   |--- class: True
|   |   |--- P31 >  0.75
|   |   |   |--- class: True



## 2. Predict SMG-Wikidata links using classifier

In [29]:
# Build the predictions table for every candidate pair.
# NOTE(review): threshold=0.9 is presumably the probability cut-off used to
# derive `y_pred` from `y_pred_proba` - confirm against Disambiguator.
pairs_pred = d.get_predictions_table(
    X,
    pairs,
    threshold=0.9,
)

# Cell output: the five rows with the highest predicted probability.
pairs_pred.sort_values(by='y_pred_proba', ascending=False).head(5)

Unnamed: 0,internal_id,wikidata_id,is_type,y_pred_proba,y_pred
978,https://collection.sciencemuseumgroup.org.uk/objects/co8060525,Q785745,True,0.972222,True
1678,https://collection.sciencemuseumgroup.org.uk/objects/co205761,Q4231608,True,0.972222,True
3145,https://collection.sciencemuseumgroup.org.uk/objects/co205759,Q24294815,True,0.972222,True
3521,https://collection.sciencemuseumgroup.org.uk/objects/co8060526,Q26251835,True,0.972222,True
139,https://collection.sciencemuseumgroup.org.uk/objects/co207717,Q19842071,True,0.972222,True


## 3. Filter results

In [30]:
# Restrict the table to rows whose prediction is True.
is_positive = pairs_pred['y_pred'].eq(True)
pairs_pred_positive = pairs_pred.loc[is_positive]

# Cell output: number of positive predictions before any filtering.
len(pairs_pred_positive)

141

In [31]:
# Run the positive predictions through the three postprocessing filters, in
# the same order as before; each step receives the previous step's result as
# its first argument.
pairs_pred_positive_filtered = (
    pairs_pred_positive
    .pipe(filter_cased_wikidata_labels)
    .pipe(remove_wikidata_items_with_min_claims, 1)
    .pipe(filter_max_wikidata_links)
)

print(f"{len(pairs_pred_positive_filtered)} matches after filtering")

37 matches after filtering


In [32]:
# Cell output: up to 20 filtered matches, highest predicted probability first.
(
    pairs_pred_positive_filtered
    .sort_values(by='y_pred_proba', ascending=False)
    .head(n=20)
)

Unnamed: 0,internal_id,wikidata_id,is_type,y_pred_proba,y_pred
54,https://collection.sciencemuseumgroup.org.uk/objects/co8247941,Q735464,True,0.972222,True
753,https://collection.sciencemuseumgroup.org.uk/objects/co8087889,Q16240986,True,0.972222,True
3216,https://collection.sciencemuseumgroup.org.uk/objects/co8014638,Q27908860,True,0.972222,True
2705,https://collection.sciencemuseumgroup.org.uk/objects/co205948,Q96748816,True,0.972222,True
1649,https://collection.sciencemuseumgroup.org.uk/objects/co205736,Q23710199,True,0.972222,True
1245,https://collection.sciencemuseumgroup.org.uk/objects/co206601,Q1011643,True,0.972222,True
2499,https://collection.sciencemuseumgroup.org.uk/objects/co207720,Q2393776,True,0.972222,True
226,https://collection.sciencemuseumgroup.org.uk/objects/co8180085,Q2123438,True,0.972222,True
866,https://collection.sciencemuseumgroup.org.uk/objects/co205775,Q11288598,True,0.966258,True
2830,https://collection.sciencemuseumgroup.org.uk/objects/co205752,Q1126984,True,0.966258,True


## 4. Export results

In [33]:
# The results file is written inside the test directory; `test_dir` must end
# with a path separator because the file name is appended to it directly.
export_path = f"{test_dir}preds_positive.csv"
print(export_path)

/Volumes/Kalyan_SSD/SMG/disambiguation/objects_131120/test_locomotives_and_rolling_stock/preds_positive.csv


In [34]:
# Persist the filtered positive predictions as CSV at `export_path`
# (default to_csv settings, so the DataFrame index is written as well).
pairs_pred_positive_filtered.to_csv(path_or_buf=export_path)