# Reconciling Object & Organisation Types


In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")
import os

from heritageconnector.config import config
from heritageconnector.utils.data_transformation import transform_series_str_to_list
from heritageconnector.entity_matching.reconciler import reconciler

from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

  from pandas import Panel


## Organisations

In [2]:
df = pd.read_pickle("../GITIGNORE_DATA/results/filtering_people_orgs_result.pkl")
# df_people = df[df['GENDER'].isin(["M", "F"])]
df_orgs = df[df['GENDER'] == "N"]

In [3]:
org_type_col = "OCCUPATION"

In [4]:
df_orgs.loc[:, org_type_col] = df_orgs.loc[:, org_type_col].str.replace("'", "")
df_orgs.loc[:, org_type_col] = transform_series_str_to_list(df_orgs[org_type_col], separator=";")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [5]:
rec = reconciler(df_orgs, table="ORGANISATION")
rec.process_column(org_type_col,
                    multiple_vals=True, 
                    class_include=["Q43229", "Q28640"], 
                    text_similarity_thresh=95,
                    search_limit_per_item=1000,
                    field_exists_filter="claims.P279")

2020-10-06 10:17:32,831 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata qcodes for items on Elasticsearch Wikidata dump


100%|██████████| 2603/2603 [13:10<00:00,  3.29it/s]

2020-10-06 10:30:43,802 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of ['Q43229', 'Q28640']



100%|██████████| 11/11 [00:05<00:00,  1.87it/s]


In [7]:
rec.export_map_df()

  from pandas import Panel


In [6]:
rec.import_map_df("../GITIGNORE_DATA/reconciliation_ORGANISATION_20201006.csv")

In [7]:
#df_orgs[[org_type_col, "OCCUPATION_resolved"]]#.head()
rec.multiple_vals = True
df_orgs["OCCUPATION_resolved"] = rec.create_column_from_map_df("OCCUPATION")
print(f"{int((df_orgs['OCCUPATION_resolved'].apply(len) > 0).sum() / len(df_orgs) * 100)}% of records have at least one resolved type")

100%|██████████| 7743/7743 [00:03<00:00, 2555.93it/s]

32% of records have at least one resolved type



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
df_orgs[["OCCUPATION", "OCCUPATION_resolved"]]

Unnamed: 0,OCCUPATION,OCCUPATION_resolved
0,[manufacturer of mathematical instruments],[]
6,[railway board],[]
8,[supplier],[Q7644488]
12,[training establishment],[]
14,[manufacturer of electrical equipment],[]
...,...,...
18053,[hospital],"[Q180370, Q16917]"
18061,[],[]
18067,"[designer, manufacturer]",[]
18068,[manufacturer],[]


In [10]:
# df_orgs.to_pickle("../GITIGNORE_DATA/organisations_with_types.pkl")

## Objects

In [11]:
objects = pd.read_csv("../GITIGNORE_DATA/smg-datasets-private/mimsy-catalogue-export.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
objects.loc[:, "ITEM_NAME"] = transform_series_str_to_list(objects["ITEM_NAME"], separator=";")

In [4]:
rec_o = reconciler(objects, table="OBJECT")
# physical object
rec_o.process_column("ITEM_NAME", 
                     multiple_vals=True, 
                     class_include="Q223557", 
                     class_exclude=["Q5", "Q43229", "Q28640"], 
                     text_similarity_thresh=90,
                     search_limit_per_item=1000,
                     field_exists_filter="claims.P279")

2020-10-06 11:37:17,009 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata qcodes for items on Elasticsearch Wikidata dump


100%|██████████| 23600/23600 [1:29:23<00:00,  4.40it/s]  


2020-10-06 13:06:40,519 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557


100%|██████████| 118/118 [02:24<00:00,  1.22s/it]


In [5]:
rec_o.export_map_df()

2020-10-06 13:09:05,126 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv
2020-10-06 13:09:05,126 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv


In [15]:
rec_o.multiple_vals = True # hacky
rec_o.import_map_df("../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv")
objects["ITEM_NAME_resolved"] = rec_o.create_column_from_map_df("ITEM_NAME")

100%|██████████| 282259/282259 [01:37<00:00, 2894.75it/s]


In [16]:
print(f"{int((objects['ITEM_NAME_resolved'].apply(len) > 0).sum() / len(objects) * 100)}% of records have at least one resolved type")

51% of records have at least one resolved type


In [17]:
# objects.to_pickle("../GITIGNORE_DATA/objects_with_types.pkl")

## Test data

In [13]:
data = pd.DataFrame.from_dict({"item_name": ["photograph", "camera", "model", "bottle", "bottles"]})

r = reconciler(data, table="OBJECT")
r.process_column("item_name",
                 multiple_vals=False, 
                 class_include="Q223557", 
                 class_exclude=["Q5", "Q43229", "Q28640"], 
                 text_similarity_thresh=90,
                 search_limit_per_item=1000,
                 field_exists_filter="claims.P279")

r.create_column_from_map_df("item_name")

2020-10-06 15:13:31,286 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata qcodes for items on Elasticsearch Wikidata dump
2020-10-06 15:13:31,286 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata qcodes for items on Elasticsearch Wikidata dump
2020-10-06 15:13:31,286 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata qcodes for items on Elasticsearch Wikidata dump


100%|██████████| 5/5 [00:00<00:00,  6.98it/s]

2020-10-06 15:13:32,008 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557
2020-10-06 15:13:32,008 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557
2020-10-06 15:13:32,008 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557



100%|██████████| 1/1 [00:00<00:00,  1.29it/s]




100%|██████████| 5/5 [00:00<00:00, 4468.68it/s]


0              [Q125191]
1    [Q15328, Q97301845]
2            [Q57312861]
3               [Q80228]
4               [Q80228]
Name: item_name, dtype: object