In [None]:
import sys

# Put the parent directory on the import path; presumably this is what lets the
# project-local packages imported in later cells (nmvwdatadump, utils, naive,
# result) resolve -- confirm against the repository layout.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry to sys.path each time.
if ".." not in sys.path:
    sys.path.append("..")

## How did we construct the ground truth? 
![Ground truth](../resources/ground_truth_data_setup.png)
[edit image](https://app.diagrams.net/#G1_CWj_Ux45Q6IAAnGwVurQxSnTkOvlVG_)

# Build Ground Truth

### Step-1: NMVW Data Dump

In [None]:
from nmvwdatadump.data_dump import (
    count_all_constituent_with_wikidata,
    count_all_constituents,
    run as dump,
)

# The same dataset identifier is handed to the dump and to both counters.
# NOTE(review): the meaning of 58000 and range=20 is defined inside
# nmvwdatadump.data_dump.run and is not visible from this notebook -- confirm
# before changing either value.
dataset_name = "ccrdfconst"

dump(dataset_name, 58000, range=20)
count_all_constituents(dataset_name)
count_all_constituent_with_wikidata(dataset_name)

### Step-2: Filter wikidata constituents

In [None]:
from nmvwdatadump.filter_wiki_human import (
    count_total_wikidata,
    run as filter_wikidata,
)

# filter_wikidata takes the FOLDER that contains the ttl file, not the file itself.
dump_directory = "nmvw_data/ccrdfconst"
# CSV log handed to the counter; presumably written by the filter step -- confirm.
filter_log_csv = "nmvw_data/ccrdfconst/const_wiki_filter_log.csv"

filter_wikidata(directory=dump_directory)
count_total_wikidata(filter_log_csv)

### Step-3: Build ground truth

We have 6165 instances of human entities for which we know the corresponding Wikidata URI; we therefore consider those human entities as our ground truth. Using a federated query to the Wikidata SPARQL endpoint, we also retrieved all the naming variations (different languages, spellings and formats) corresponding to the 6178 humans listed on Wikidata.
```SPARQL
select * where {
    SERVICE <https://query.wikidata.org/sparql> {  
         <"""+var+"""> rdfs:label ?name .  
    }}
```

In [None]:
from utils.ttl_to_dataframe import run as ttl2dataframe

# First argument: the turtle file with the Wikidata names.
# Second argument: a pickle path, presumably where the resulting dataframe is
# written -- confirm against utils.ttl_to_dataframe.
wikidata_ttl = "data/wikidata_ccrdfconstQ5_full.ttl"
human_names_pkl = "data/wikidata_human_name.pkl"

ttl2dataframe(wikidata_ttl, human_names_pkl)

## Naive String Matching

In [2]:
import pandas

from naive.naive_string_matching import run as naive_string_matching
from result import result

# Pickle of name variants used as the query set, and the pickle path that is
# passed to both the matcher and the evaluation step.
human_names_pkl = "data/wikidata_human_name.pkl"
naive_results_pkl = "results/naive_string_matching.pkl"

# Slow step: the recorded run below took about 1h15m for 6178 queries.
naive_string_matching(human_names_pkl, naive_results_pkl)

# The recorded output shows recall, precision and F-measure being printed,
# followed by a summary tuple as the cell value.
result(naive_results_pkl)

100%|█████████████████████████████████████| 6178/6178 [1:15:05<00:00,  1.37it/s]


Total query: 6178 
Correct correspondence count: 3226
Recall: 0.5221754613143412


Total query: 6178
Total retrieved: 4590
Correct correspondence count: 3226 
Precision: 0.7028322440087146


F-measure: 0.599182763744428



(6178, 4590, 3226, 0.5221754613143412, 0.7028322440087146, 0.599182763744428)

In [3]:
# Show only the rows for which at least one URI was retrieved.
df = pandas.read_pickle('results/naive_string_matching.pkl')
has_retrieved_uri = df['retrieved_uri'].apply(len) > 0
print(df[has_retrieved_uri])

               name_label                                      retrieved_uri  \
0     Gabriel Clark-Brown         [http://www.wikidata.org/entity/Q82570739]   
11          P.L. Dronkers         [http://www.wikidata.org/entity/Q82571047]   
14         H.J. van Erpen         [http://www.wikidata.org/entity/Q86735056]   
23            W. Galjaard         [http://www.wikidata.org/entity/Q86735059]   
25         H. Groeneveldt  [http://www.wikidata.org/entity/Q36573311, htt...   
...                   ...                                                ...   
6167        Wilhelm Ivens          [http://www.wikidata.org/entity/Q2097087]   
6170        Sam Schellink         [http://www.wikidata.org/entity/Q27526154]   
6172        Jean Gauchard         [http://www.wikidata.org/entity/Q19336668]   
6174  Charles Ralph Boxer           [http://www.wikidata.org/entity/Q954203]   
6175       Dolf Breetvelt         [http://www.wikidata.org/entity/Q13573764]   

                                      w