# Config-Driven Search
We want the `search` step of the disambiguation driven by a SPARQL query which is automatically generated given the collection schema.

In [34]:
import sys
sys.path.append('../..')

from heritageconnector.config import config
from heritageconnector.utils.sparql import get_sparql_results

import os

import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

## 1. import data

In [100]:
df = pd.read_csv(os.path.join("..", config.MIMSY_PEOPLE_PATH))
df['FREETEXT'] = df['DESCRIPTION'].astype(str) + " " + (df['NOTE']).astype(str)

for col in ['FIRSTMID_NAME', 'LASTSUFF_NAME']:
    df[col] = df[col].fillna("").astype(str)

print(df.columns)
df.head(2)

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE', 'FREETEXT'],
      dtype='object')


Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,NOTE,BIRTH_DATE,BIRTH_PLACE,DEATH_DATE,DEATH_PLACE,CAUSE_OF_DEATH,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE,FREETEXT
0,10243,Brooklyn Arms Company,,,Brooklyn Arms Company,,,N,Y,,object record: 1987-1020,c. 1870,"Brooklyn, New York, New York state, United States",,,,American,manufacturer of mathematical instruments,,,,,,N,28-JAN-98,06-NOV-18,nan object record: 1987-1020
1,10245,"Zenthon, Edward Rupert",,Edward Rupert,Zenthon,,,M,Y,REF: http://www.iwm.org.uk/collections/item/object/1030031461,,1920-07,"London, Greater London, England, United Kingdom",c. 2002,,,British,engineer,,,,,,N,28-JAN-98,05-AUG-15,REF: http://www.iwm.org.uk/collections/item/object/1030031461 nan


In [109]:
people_df = df[df['GENDER'].isin(['M','F'])].copy()
org_df = df[df['GENDER'] == 'N'].copy()

people_df.loc[:, 'TYPE'] = 'Q5'
org_df.loc[:, 'TYPE'] = 'Q43229'

people_df.loc[:, 'GENDER_QID'] = people_df.loc[:, 'GENDER'].map({"M": "Q6581097", "F": "Q6581072"})
people_df.loc[:, 'JOINED_NAME'] = people_df['FIRSTMID_NAME'] + " " + people_df['LASTSUFF_NAME']

## 2. build query driven from schema
- label col used in EntitySearch
- type & categorical cols used as filters on results (as we already have values for these)
- other cols used as filters to narrow down list of entity candidates. Scoring function depends on their type

**This query should get a list of entities matching the search term and filtered by any known QIDs, as well as all the other 
properties specified.**

In [None]:
sum(w * score(col))

In [117]:
# people
cols_use = ['JOINED_NAME',
'FIRSTMID_NAME',
'LASTSUFF_NAME',
'GENDER',
'FREETEXT',
'BIRTH_DATE',
'BIRTH_PLACE',
'DEATH_DATE',
'DEATH_PLACE',
'NATIONALITY',
'OCCUPATION']

# to query on
label_col = 'JOINED_NAME' # WD label/alias
type_col = 'TYPE' # instance or subclass of
subclass_of = False

# to return
desc_cols = ['FREETEXT'] # WD description
str_cols = {'FIRSTMID_NAME': 'P735', 'LASTSUFF_NAME': 'P734'} # short str for Levenshtein matching
date_cols = {'BIRTH_DATE': 'P569', 'DEATH_DATE': 'P570'} # to be converted to date (should have already been converted)
place_cols = {'BIRTH_PLACE': 'P19', 'DEATH_PLACE': 'P20'} # could be treated as location or string
cat_cols = {'GENDER': 'P21', 'NATIONALITY': 'P27', 'OCCUPATION': 'P106'} # to be treated as qcode or string
return_dict = {**str_cols, **date_cols, **place_cols, **cat_cols}

config_dict = {}
config_dict['label'] = label_col
config_dict['type'] = type_col
config_dict['subclass_flag'] = subclass_of
config_dict['return'] = return_dict

# functions to generate query subsets
def generate_return_expressions(return_dict):
    select_slug = "?" + " ?".join(return_dict.keys())
    body_exp = "\n".join([f"OPTIONAL{{ ?item wdt:{v} ?{k} .}}" for k,v in return_dict.items()])
    
    return select_slug, body_exp

generate_return_expressions(return_dict)

('?FIRSTMID_NAME ?LASTSUFF_NAME ?BIRTH_DATE ?DEATH_DATE ?BIRTH_PLACE ?DEATH_PLACE ?GENDER ?NATIONALITY ?OCCUPATION',
 'OPTIONAL{ ?item wdt:P735 ?FIRSTMID_NAME .}\nOPTIONAL{ ?item wdt:P734 ?LASTSUFF_NAME .}\nOPTIONAL{ ?item wdt:P569 ?BIRTH_DATE .}\nOPTIONAL{ ?item wdt:P570 ?DEATH_DATE .}\nOPTIONAL{ ?item wdt:P19 ?BIRTH_PLACE .}\nOPTIONAL{ ?item wdt:P20 ?DEATH_PLACE .}\nOPTIONAL{ ?item wdt:P21 ?GENDER .}\nOPTIONAL{ ?item wdt:P27 ?NATIONALITY .}\nOPTIONAL{ ?item wdt:P106 ?OCCUPATION .}')

In [128]:
# TODO: figure out a better way of doing string search - CONTAINS / remove punctuation?

def run_query(row, config_dict):
    endpoint_url = "https://query.wikidata.org/sparql"
    
    label_val = row[config_dict['label']].values[0]
    type_val = row[config_dict['type']].values[0]
    
    return_select, return_body = generate_return_expressions(config_dict['return'])
    
    class_tree = "/wdt:P279*" if config_dict['subclass_flag'] else ""
    sparq_instanceof = f"?item wdt:P31{class_tree} wd:{type_val}."
    
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?altLabel {return_select}
        WHERE
        {{
            SERVICE wikibase:mwapi {{
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "{label_val}" .
                bd:serviceParam mwapi:language "en" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
              }}
            {sparq_instanceof}
            {return_body}
            
            OPTIONAL {{
                ?item skos:altLabel ?altLabel .
                FILTER (lang(?altLabel) = "en")
                }}

            SERVICE wikibase:label {{ 
            bd:serviceParam wikibase:language "en" .
            }}
        }}
            """
    
    res = get_sparql_results(endpoint_url, query)['results']['bindings']
    res_df = pd.json_normalize(res)
    res_df = res_df[[col for col in res_df.columns if "value" in col]]
    return res_df

In [129]:
row = people_df[people_df['LINK_ID'] == 116957]
row

Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,NOTE,BIRTH_DATE,BIRTH_PLACE,DEATH_DATE,DEATH_PLACE,CAUSE_OF_DEATH,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE,FREETEXT,GENDER_QID,JOINED_NAME
10013,116957,"Draper, John",,John,Draper,,,M,Y,[http://encarta.msn.com/encyclopedia_762511344/Draper_John_William.html]; Wikipedia,researched in photochemistry,1811-05-05,"St. Helens, St. Helens, Merseyside, England, United Kingdom",1882-01-04,"Hastings, New York, United States",,English; American,chemist; historian,,,,Q5,,N,19-FEB-07,20-AUG-14,[http://encarta.msn.com/encyclopedia_762511344/Draper_John_William.html]; Wikipedia researched in photochemistry,Q6581097,John Draper


In [130]:
run_query(row, config_dict)

Unnamed: 0,item.value,altLabel.value,FIRSTMID_NAME.value,LASTSUFF_NAME.value,BIRTH_DATE.value,BIRTH_PLACE.value,GENDER.value,NATIONALITY.value,OCCUPATION.value,itemLabel.value,itemDescription.value,DEATH_DATE.value
0,http://www.wikidata.org/entity/Q712546,Crunch,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q82594,John Draper,American computer programmer and former phone phreak,
1,http://www.wikidata.org/entity/Q712546,Captain Crunch,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q82594,John Draper,American computer programmer and former phone phreak,
2,http://www.wikidata.org/entity/Q712546,Crunchman,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q82594,John Draper,American computer programmer and former phone phreak,
3,http://www.wikidata.org/entity/Q712546,John Thomas Draper,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q82594,John Draper,American computer programmer and former phone phreak,
4,http://www.wikidata.org/entity/Q712546,Crunch,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q1487,John Draper,American computer programmer and former phone phreak,
5,http://www.wikidata.org/entity/Q712546,Captain Crunch,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q1487,John Draper,American computer programmer and former phone phreak,
6,http://www.wikidata.org/entity/Q712546,Crunchman,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q1487,John Draper,American computer programmer and former phone phreak,
7,http://www.wikidata.org/entity/Q712546,John Thomas Draper,http://www.wikidata.org/entity/Q4925477,http://www.wikidata.org/entity/Q16869665,1943-03-11T00:00:00Z,http://www.wikidata.org/entity/Q13147795,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q30,http://www.wikidata.org/entity/Q1487,John Draper,American computer programmer and former phone phreak,
8,http://www.wikidata.org/entity/Q6230091,,http://www.wikidata.org/entity/Q4925477,,1890-08-26T00:00:00Z,,http://www.wikidata.org/entity/Q6581097,,http://www.wikidata.org/entity/Q43845,John Draper Perrin,Canadian business executive,1967-09-19T00:00:00Z
9,http://www.wikidata.org/entity/Q18819516,,http://www.wikidata.org/entity/Q4925477,,,,http://www.wikidata.org/entity/Q6581097,http://www.wikidata.org/entity/Q179876,http://www.wikidata.org/entity/Q2259532,John Draper,Augustinian canon and bishop-suffragan of Winchester,1552-01-01T00:00:00Z
