# Analysis of matching and linking errors

We restrict the analysis to humans and to entries matched in all four editions. We compute:
 * the precision for the matches and 
 * the precision and recall for the QID. 

Pierre Nugues

In [None]:
import pandas as pd
import json
from tqdm import tqdm
from collections import Counter

In [None]:
#pd.set_option('display.max_colwidth', None)

In [None]:
"""df = pd.read_json(
    "hf://datasets/albinandersson/nf-headword-linked/data.jsonl", lines=True)
df.to_json('~/nf.json')
    """

In [None]:
# Load the entries into the DataFrame used by every later cell.
# The commented-out line reads an alternative file, 'nf.json'.
# df = pd.read_json('nf.json')
df = pd.read_json('nf_sym.json')

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# One dict per row, so rows can be walked with plain Python loops.
nf_dicts = df.to_dict(orient='records')

Matches in the four editions

In [None]:
# E1 rows of type 2 whose E2, E3 and E4 match columns are all set.
e1_quad_rows = (
    df['edition'].eq('E1')
    & df['type'].eq(2)
    & df['E2_match']
    & df['E3_match']
    & df['E4_match']
)
df.loc[e1_quad_rows]

In [None]:
# E2 rows of type 2 whose E1, E3 and E4 match columns are all set.
e2_quad_rows = (
    df['edition'].eq('E2')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E3_match']
    & df['E4_match']
)
df.loc[e2_quad_rows]

Why is the size different?

In [None]:
# entry_id of every E1 row of type 2 matched in E2, E3 and E4.
e1_quad_rows = (
    df['edition'].eq('E1')
    & df['type'].eq(2)
    & df['E2_match']
    & df['E3_match']
    & df['E4_match']
)
l1_1 = df.loc[e1_quad_rows, 'entry_id'].tolist()

In [None]:
# E1 ids as seen from the E2 side: the E1_match value of every E2 row of
# type 2 matched in E1, E3 and E4.
e2_quad_rows = (
    df['edition'].eq('E2')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E3_match']
    & df['E4_match']
)
l1_2 = df.loc[e2_quad_rows, 'E1_match'].tolist()

In [None]:
# Ids present in exactly one of the two lists.
set(l1_1).symmetric_difference(l1_2)

We see that `E1_101368` has no match from E1 to E4 

In [None]:
# The row whose own id is E1_101368.
is_e1_101368 = df['entry_id'].eq('E1_101368')
df.loc[is_e1_101368]

But `E1_101368` has matches from E2 and E3 to E4 

In [None]:
# Rows of any edition that point to E1_101368 as their E1 match.
points_to_e1_101368 = df['E1_match'].eq('E1_101368')
df.loc[points_to_e1_101368]

which is correct

In [None]:
# The row whose own id is E4_32593.
is_e4_32593 = df['entry_id'].eq('E4_32593')
df.loc[is_e4_32593]

## Groups of Four Matches

We extract all the matches of 4 editions

In [None]:
# Select, from each edition in turn, the rows of type 2 whose match columns
# for the three other editions are all set; the union over the four editions
# is the set of rows taking part in a four-edition match.
quad_mask = None
for edition in ('E1', 'E2', 'E3', 'E4'):
    branch = (df['edition'] == edition) & (df['type'] == 2)
    for other in ('E1', 'E2', 'E3', 'E4'):
        if other != edition:
            # Boolean mask first, match column second: the match column is
            # only ever used as a truthiness test.
            branch = branch & df[other + '_match']
    quad_mask = branch if quad_mask is None else quad_mask | branch

eq_classes_dicts = df.loc[
    quad_mask,
    ['entry_id', 'E1_match', 'E2_match', 'E3_match', 'E4_match']
].to_dict(orient='records')

In [None]:
# Inspect the selected records (one dict per selected row).
eq_classes_dicts

In [None]:
# Turn each record into a canonical key: its non-empty values, sorted, as a
# tuple, so that rows naming the same ids produce the same key.
eq_classes_tuples = [
    tuple(sorted(value for value in record.values() if value))
    for record in eq_classes_dicts
]

In [None]:
# Total number of rows behind the classes (the per-class counts add up to
# the number of tuples).
len(eq_classes_tuples)

In [None]:
# Number of distinct tuples, followed by how many rows produced each one.
quad_counts = Counter(eq_classes_tuples)
len(quad_counts), quad_counts

In [None]:
# Rows of one example group, one id per edition.
example_ids = ['E1_34861', 'E2_59948', 'E3_19689', 'E4_41588']
df.loc[df['entry_id'].isin(example_ids)]

The QID we extract here is the one of `entry_id`. See example above for _Janssen_, where we have three QIDs and one missing.

In [None]:
# For every distinct quad, list the Wikidata URL and definition of each member
# entry and collect them as rows [class index, count, entry_id, URL, definition].
wikidata_prefix = 'https://www.wikidata.org/wiki/'

# Index the records by entry_id once, remembering their position in nf_dicts,
# so that each quad is resolved by lookup instead of a full scan of nf_dicts.
records_by_id = {}
for position, nf_dict in enumerate(nf_dicts):
    records_by_id.setdefault(nf_dict['entry_id'], []).append((position, nf_dict))

m_qid = []
for i, (quad, cnt) in enumerate(Counter(eq_classes_tuples).items()):
    print(i, cnt)
    # Sorting by position reproduces the row order of a sequential scan.
    members = sorted(
        (hit for entry_id in set(quad)
         for hit in records_by_id.get(entry_id, [])),
        key=lambda hit: hit[0])
    for _, nf_dict in members:
        # The QID here is the member's own and may be missing; if the missing
        # value is not a string, fall back to the bare prefix instead of
        # failing on the concatenation.
        qid = nf_dict['QID'] if isinstance(nf_dict['QID'], str) else ''
        url = wikidata_prefix + qid
        print(url, nf_dict['definition'])
        m_qid.append([i, cnt, nf_dict['entry_id'], url, nf_dict['definition']])

In [None]:
# Save the collected rows; columns keep pandas' default integer labels.
quad_frame = pd.DataFrame(m_qid)
quad_frame.to_csv('quad.csv')

## QID
We now check the matches in the four editions with a QID. This does not define equivalence classes

Matches in the four editions with a QID

In [None]:
# Same selection as for the plain quads, with one more condition: the row
# itself must carry a QID. Each edition contributes the rows of type 2 whose
# three other-edition match columns are set.
qid_quad_mask = None
for edition in ('E1', 'E2', 'E3', 'E4'):
    branch = (df['edition'] == edition) & (df['type'] == 2)
    for other in ('E1', 'E2', 'E3', 'E4'):
        if other != edition:
            branch = branch & df[other + '_match']
    branch = branch & df['QID']
    qid_quad_mask = branch if qid_quad_mask is None else qid_quad_mask | branch

eq_classes_qid_dicts = df.loc[
    qid_quad_mask,
    ['entry_id', 'E1_match', 'E2_match', 'E3_match', 'E4_match', 'QID']
].to_dict(orient='records')

In [None]:
# Inspect the selected records (one dict per selected row, QID included).
eq_classes_qid_dicts

In [None]:
# Canonical key per record: its non-empty values (ids and QID), sorted, as a
# tuple.
eq_classes_qid_tuples = [
    tuple(sorted(value for value in record.values() if value))
    for record in eq_classes_qid_dicts
]

In [None]:
# Number of distinct tuples, followed by how many rows produced each one.
qid_quad_counts = Counter(eq_classes_qid_tuples)
len(qid_quad_counts), qid_quad_counts

Two examples, where the first is an equivalence class and the second has symmetrical matches that define relations outside an equivalence class

In [None]:
# The two E1 example rows.
example_e1_ids = ['E1_39', 'E1_31890']
df.loc[df['entry_id'].isin(example_e1_ids)]

In [None]:
# Three example rows, one each from E2, E3 and E4.
example_later_ids = ['E2_55081', 'E3_16858', 'E4_38660']
df.loc[df['entry_id'].isin(example_later_ids)]

In [None]:
# Look at the first tuple.
eq_classes_qid_tuples[0]

In [None]:
#pd.set_option('display.max_colwidth', 40)

In [None]:
# For each distinct tuple, print its index and row count, then the id,
# definition and QID of the rows it names.
qid_quad_counts = Counter(eq_classes_qid_tuples)
for rank, (ids_and_qid, n_rows) in enumerate(qid_quad_counts.items()):
    print(rank, n_rows)
    # Everything but the last element; the last one is used as the QID.
    member_ids = ids_and_qid[:-1]
    member_rows = df.loc[df['entry_id'].isin(member_ids),
                         ['entry_id', 'definition', 'QID']]
    print(member_rows)

What does a count of one mean? The answer is: only one row had four matches

In [None]:
#pd.set_option('display.max_colwidth', 100)

In [None]:
# Definition text of entry E4_26858.
df.loc[df['entry_id'] == 'E4_26858', 'definition']

In [None]:
# Rows of the example group, one id per edition.
example_ids = ['E1_34861', 'E2_59948', 'E3_19689', 'E4_41588']
df.loc[df['entry_id'].isin(example_ids)]

In [None]:
# Total number of rows behind the tuples (the per-tuple counts add up to the
# number of tuples in the list).
len(eq_classes_qid_tuples)

Here the QIDs are the ones from the completed quad matches. Ideally there are four; for _Jansen_, there is only one of them. In this experiment, all the completed quad matches have the same `QID`s.

In [None]:
# Consistency check: print every ordered pair of tuples that name the same
# ids (all elements but the last) yet end in a different QID.
tuples_by_ids = {}
for candidate in eq_classes_qid_tuples:
    tuples_by_ids.setdefault(candidate[:-1], []).append(candidate)

for tup1 in eq_classes_qid_tuples:
    # Only tuples sharing tup1's id part can conflict with it; each group
    # keeps the list order, so pairs are printed in the same order as a
    # full pairwise comparison would print them.
    for tup2 in tuples_by_ids[tup1[:-1]]:
        if tup1[-1] != tup2[-1]:
            print(tup1, tup2)

Note that for _Jansen_, there is only one quad match. The other _Jansen_ matches do not all have a `QID`. See example a few cells above

In [None]:
# For every distinct tuple, pair the tuple's QID (its last element) with each
# member entry and collect rows [class index, count, entry_id, URL, definition].
wikidata_prefix = 'https://www.wikidata.org/wiki/'

# Index the records by entry_id once, remembering their position in nf_dicts,
# so that each tuple is resolved by lookup instead of a full scan of nf_dicts.
records_by_id = {}
for position, nf_dict in enumerate(nf_dicts):
    records_by_id.setdefault(nf_dict['entry_id'], []).append((position, nf_dict))

m_qid = []
for i, (quad, cnt) in enumerate(Counter(eq_classes_qid_tuples).items()):
    print(i, cnt)
    matches = quad[:-1]
    url = wikidata_prefix + quad[-1]
    # Sorting by position reproduces the row order of a sequential scan.
    members = sorted(
        (hit for entry_id in set(matches)
         for hit in records_by_id.get(entry_id, [])),
        key=lambda hit: hit[0])
    for _, nf_dict in members:
        print(url, nf_dict['definition'])
        m_qid.append([i, cnt, nf_dict['entry_id'], url, nf_dict['definition']])

In [None]:
# First four collected rows: [class index, count, entry_id, URL, definition].
m_qid[:4]

In [None]:
# Rows of one example group, including two neighbouring E3 ids.
example_ids = ['E1_29423', 'E2_50755', 'E3_14226', 'E4_36170', 'E3_14227']
df.loc[df['entry_id'].isin(example_ids)]

In [None]:
# Save the collected rows; columns keep pandas' default integer labels.
quad_qid_frame = pd.DataFrame(m_qid)
quad_qid_frame.to_csv('quad_qid.csv')

## Eval

Recall

In [8]:
# Recall computed twice: 80 over 514 and 80 over 486.
# NOTE(review): the counts are hard-coded and their origin is not shown in
# this notebook — confirm what distinguishes the two denominators.
80/514, 80/486

(0.1556420233463035, 0.1646090534979424)

Precision

In [9]:
# Precision computed twice: 80 over 101 and 80 over 94.
# NOTE(review): the counts are hard-coded and their origin is not shown in
# this notebook — confirm what distinguishes the two denominators.
80/101, 80/94

(0.7920792079207921, 0.851063829787234)

F1

In [10]:
# F1 as the harmonic mean 2 / (1/recall + 1/precision), written out with the
# same hard-coded counts as the recall and precision expressions.
2/(514/80 + 101/80), 2/(486/80 + 94/80)

(0.2601626016260163, 0.27586206896551724)

## Old

In [None]:
# E1 rows of type 2 matched in E2, E3 and E4 that also carry a QID.
e1_qid_rows = (
    df['edition'].eq('E1')
    & df['type'].eq(2)
    & df['E2_match']
    & df['E3_match']
    & df['E4_match']
    & df['QID']
)
df[e1_qid_rows]

In [None]:
# E2 rows of type 2 matched in E1, E3 and E4 that also carry a QID.
e2_qid_rows = (
    df['edition'].eq('E2')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E3_match']
    & df['E4_match']
    & df['QID']
)
df[e2_qid_rows]

In [None]:
# E3 rows of type 2 matched in E1, E2 and E4 that also carry a QID.
e3_qid_rows = (
    df['edition'].eq('E3')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E2_match']
    & df['E4_match']
    & df['QID']
)
df[e3_qid_rows]

In [None]:
# E4 rows of type 2 matched in E1, E2 and E3 that also carry a QID.
e4_qid_rows = (
    df['edition'].eq('E4')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E2_match']
    & df['E3_match']
    & df['QID']
)
df[e4_qid_rows]

In [None]:
# Headwords of the E1 rows of type 2 matched in the three other editions and
# carrying a QID.
e1_qid_rows = (
    df['edition'].eq('E1')
    & df['type'].eq(2)
    & df['E4_match']
    & df['E2_match']
    & df['E3_match']
    & df['QID']
)
l1 = df.loc[e1_qid_rows, 'headword'].tolist()
l1

In [None]:
# Headwords of the E4 rows of type 2 matched in the three other editions and
# carrying a QID.
e4_qid_rows = (
    df['edition'].eq('E4')
    & df['type'].eq(2)
    & df['E1_match']
    & df['E2_match']
    & df['E3_match']
    & df['QID']
)
l4 = df.loc[e4_qid_rows, 'headword'].tolist()
l4

In [None]:
# Headwords present in exactly one of the two lists.
set(l1).symmetric_difference(l4)

In [None]:
# Rows for the headword Falguière. The literal previously held the mis-decoded
# form of the accented letter (UTF-8 bytes read as Mac Roman), which cannot
# equal a correctly encoded headword.
df[df['headword'] == 'Falguière']

## Match analysis

In [None]:
# E1 rows of type 2 matched in E2, E3 and E4 that also carry a QID.
e1_qid_rows = (
    df['edition'].eq('E1')
    & df['type'].eq(2)
    & df['E2_match']
    & df['E3_match']
    & df['E4_match']
    & df['QID']
)
df_inter = df.loc[e1_qid_rows]

In [None]:
# Show the selection.
df_inter

In [None]:
# Renumber the rows 0..n-1 so they can be addressed by position with .loc[i];
# the previous index is kept as a column.
# NOTE(review): this rebinds df_inter from itself, so running the cell again
# resets the index a second time and adds another index column.
df_inter = df_inter.reset_index()

In [None]:
# Show the frame after the index reset.
df_inter

In [None]:
# One dict per row of df, for the plain Python loop in the next cell.
nf_dicts = df.to_dict(orient='records')

In [None]:
# For each selected E1 row, print its own QID and definition, then the
# Wikidata URL and definition of the entries it is matched to in E2, E3, E4.
wikidata_prefix = 'https://www.wikidata.org/wiki/'

# Index the records by entry_id once, remembering their position in nf_dicts,
# so that each row is resolved by lookup instead of a full scan of nf_dicts.
records_by_id = {}
for position, nf_dict in enumerate(nf_dicts):
    records_by_id.setdefault(nf_dict['entry_id'], []).append((position, nf_dict))

for i in range(len(df_inter)):
    print(i)
    # Relies on df_inter having a 0..n-1 index.
    matches = df_inter.loc[i, ['E2_match', 'E3_match', 'E4_match']].to_list()
    print(df_inter.loc[i, ['QID', 'definition']].to_list())
    # Sorting by position reproduces the row order of a sequential scan.
    hits = sorted(
        (hit for entry_id in set(matches)
         for hit in records_by_id.get(entry_id, [])),
        key=lambda hit: hit[0])
    for _, nf_dict in hits:
        # Only the E1 row was required to have a QID; a matched entry may
        # lack one. If the missing value is not a string, print the bare
        # prefix instead of failing on the concatenation.
        qid = nf_dict['QID'] if isinstance(nf_dict['QID'], str) else ''
        print(wikidata_prefix + qid, nf_dict['definition'])