# Reconciling Object & Organisation Types


In [13]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")
import os

from heritageconnector.config import config
from heritageconnector.utils.data_transformation import transform_series_str_to_list
from heritageconnector.entity_matching.reconciler import Reconciler, export_map_df_to_csv, import_map_df_from_csv, create_column_from_map_df

from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Organisations

In [2]:
df = pd.read_pickle("../GITIGNORE_DATA/results/filtering_people_orgs_result.pkl")
# df_people = df[df['GENDER'].isin(["M", "F"])]
df_orgs = df[df['GENDER'] == "N"]

In [3]:
org_type_col = "OCCUPATION"

In [4]:
df_orgs.loc[:, org_type_col] = df_orgs.loc[:, org_type_col].str.replace("'", "")
df_orgs.loc[:, org_type_col] = transform_series_str_to_list(df_orgs[org_type_col], separator=";")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [21]:
rec = Reconciler()
map_df = rec.process_column(
                            df_orgs[org_type_col],
                            multiple_vals=True, 
                            class_include=["Q43229", "Q28640"], 
                            search_args={
                                "search_limit_per_item":1000,
                                "field_exists_filter": "claims.P279",
                                "text_similarity_thresh": 90,
                            },
                        )

2021-05-13 17:01:55,265 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata QIDs against Elasticsearch Wikidata dump
2021-05-13 17:01:55,265 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata QIDs against Elasticsearch Wikidata dump


100%|██████████| 92/92 [03:21<00:00,  2.20s/it]

2021-05-13 17:05:17,268 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of ['Q43229', 'Q28640']
2021-05-13 17:05:17,268 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of ['Q43229', 'Q28640']



100%|██████████| 2/2 [00:00<00:00,  2.33it/s]


In [22]:
map_df.head()

Unnamed: 0,count,qids,filtered_qids
manufacturer of mathematical instruments,1,[],[]
railway board,1,[],[]
supplier,1,[Q7644488],[]
training establishment,1,[],[]
manufacturer of electrical equipment,4,[],[]


In [21]:
export_map_df_to_csv(map_df, "./temp_map_df.csv")

2021-05-13 16:19:51,947 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ./temp_map_df.csv
2021-05-13 16:19:51,947 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ./temp_map_df.csv


In [11]:
imported_map_df = import_map_df_from_csv("../GITIGNORE_DATA/reconciliation_ORGANISATION_20201006.csv")

In [15]:
#df_orgs[[org_type_col, "OCCUPATION_resolved"]]#.head()
rec.multiple_vals = True
df_orgs["OCCUPATION_resolved"] = create_column_from_map_df(df_orgs["OCCUPATION"], imported_map_df, multiple_vals=True)
print(f"{int((df_orgs['OCCUPATION_resolved'].apply(len) > 0).sum() / len(df_orgs) * 100)}% of records have at least one resolved type")

100%|██████████| 7743/7743 [00:01<00:00, 5415.85it/s]

32% of records have at least one resolved type



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orgs["OCCUPATION_resolved"] = create_column_from_map_df(df_orgs["OCCUPATION"], imported_map_df, multiple_vals=True)


In [9]:
df_orgs[["OCCUPATION", "OCCUPATION_resolved"]]

Unnamed: 0,OCCUPATION,OCCUPATION_resolved
0,[manufacturer of mathematical instruments],[]
6,[railway board],[]
8,[supplier],[Q7644488]
12,[training establishment],[]
14,[manufacturer of electrical equipment],[]
...,...,...
18053,[hospital],"[Q180370, Q16917]"
18061,[],[]
18067,"[designer, manufacturer]",[]
18068,[manufacturer],[]


In [10]:
# df_orgs.to_pickle("../GITIGNORE_DATA/organisations_with_types.pkl")

## Objects

In [25]:
objects = pd.read_csv("../GITIGNORE_DATA/smg-datasets-private/mimsy-catalogue-export.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [26]:
objects.loc[:, "ITEM_NAME"] = transform_series_str_to_list(objects["ITEM_NAME"], separator=";")

In [27]:
# physical object
map_df = rec.process_column(objects["ITEM_NAME"].head(10),
                     multiple_vals=True, 
                     class_include="Q223557", 
                     class_exclude=["Q5", "Q43229", "Q28640"],
                     search_args={
                                "search_limit_per_item":1000,
                                "field_exists_filter": "claims.P279",
                                "text_similarity_thresh": 90,
                            },
                    )

2021-05-13 17:14:58,362 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata QIDs against Elasticsearch Wikidata dump
2021-05-13 17:14:58,362 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata QIDs against Elasticsearch Wikidata dump
2021-05-13 17:14:58,362 - heritageconnector.entity_matching.reconciler - INFO - Looking up Wikidata QIDs against Elasticsearch Wikidata dump


100%|██████████| 14/14 [00:39<00:00,  2.85s/it]

2021-05-13 17:15:38,269 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557
2021-05-13 17:15:38,269 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557
2021-05-13 17:15:38,269 - heritageconnector.entity_matching.reconciler - INFO - Filtering to values in subclass tree of Q223557



100%|██████████| 1/1 [00:00<00:00,  1.03it/s]


In [5]:
export_map_df_to_csv(map_df, "./temp_map_df.csv")

2020-10-06 13:09:05,126 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv
2020-10-06 13:09:05,126 - heritageconnector.entity_matching.reconciler - INFO - Dataframe of value to entity mappings exported to ../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv


In [31]:
map_df_imported = import_map_df_from_csv("../GITIGNORE_DATA/reconciliation_OBJECT_20201006-1309.csv")
objects["ITEM_NAME_resolved"] = create_column_from_map_df(objects["ITEM_NAME"], map_df_imported, multiple_vals=True)

100%|██████████| 282259/282259 [00:48<00:00, 5867.15it/s]


In [32]:
print(f"{int((objects['ITEM_NAME_resolved'].apply(len) > 0).sum() / len(objects) * 100)}% of records have at least one resolved type")

51% of records have at least one resolved type


In [17]:
# objects.to_pickle("../GITIGNORE_DATA/objects_with_types.pkl")