# Matching Symmetries

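The matches between editions are not always symmetrical. Writing (a, b) for "entry a lists entry b as its match", we distinguish three cases:
 * (a, b) and (b, a): correct, the match is symmetrical and we keep it
 * (a, b) and (b, c): incorrect, we remove the relations as we do not know which one is correct
 * (a, b) and no (b, a): the match is one-way, we remove (a, b)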

Pierre Nugues

In [1]:
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# pd.set_option('display.max_colwidth', None)

In [3]:
"""df = pd.read_json(
    "hf://datasets/albinandersson/Nordisk-Familjebok-Headword-Classified-Matched-Linked/data.jsonl", lines=True)
df.to_json('~/nf.json')
    """

'df = pd.read_json(\n    "hf://datasets/albinandersson/Nordisk-Familjebok-Headword-Classified-Matched-Linked/data.jsonl", lines=True)\ndf.to_json(\'~/nf.json\')\n    '

In [4]:
# Load the entry table from a local JSON file. As the preview below shows, each row
# carries an entry_id, its edition, and one <edition>_match column per edition.
# NOTE(review): the disabled download cell above writes to '~/nf.json', whereas this
# cell reads 'nf.json' from the working directory -- confirm that the two paths agree.
df = pd.read_json('nf.json')

In [5]:
df.head(10)

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
0,E1_1,A,A är den första bokstafven i alla indoeuropeis...,0,E1,,,,,
1,E1_2,A,"A, Lat. prepos. Se Ab.",0,E1,,,,,
2,E1_3,Aa,"Aa ( utt. a ; Ach l. Aach, af Fornt. aha, vatt...",0,E1,,,,,
3,E1_4,Aabenraa,"Aabenraa ( Åbenrå ; på tyska Apenrade ), stad ...",1,E1,,E2_7,E3_9,,
4,E1_5,Aachen,"Aachen ( utt. ak - ; Lat. Aquisgranum, Fr. [ r...",1,E1,,E2_10,E3_13,,
5,E1_6,Aafjord,"Aafjord ( Åfjord ), stad i Norge, Trondhjems [...",1,E1,,,,,
6,E1_7,Aagesön,"Aagesön [ ågesön ], Svend ( Lat. Sveno Agonis ...",2,E1,,E2_16,,,
7,E1_8,Aahausen,"Aahausen [ a - ] l. Anhausen, by i Würtemberg....",1,E1,,,,,
8,E1_9,Aak,"Aak [ ak ], Holl., ett slags lastpråm eller fl...",0,E1,,,,,
9,E1_10,Aakirkeby,"Aakirkeby ( Åkirkeby ), köping på Bornholm. 74...",1,E1,,E2_18,E3_18,,


We establish the symmetries for the links from E1 to E2.

In [6]:
df.iloc[1]['E2_match']

''

In [7]:
df[df['entry_id'].isin(['E1_10500', 'E2_19801'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
10499,E1_10500,Böszörmény,"Böszörmény [ bössörmenj ], stad i mellersta Un...",1,E1,,E2_19801,,,
137273,E2_19801,Böszörmény,"Böszörmény [ bösörmenj ], Hajdu - Böször - mén...",1,E2,,,,,


In [8]:
df.iloc[10499]['E2_match']

'E2_19801'

In [9]:
df[df['entry_id'] == 'E2_19801'].index

Index([137273], dtype='int64')

In [10]:
# Manually add the missing back-link E2_19801 -> E1_10500 so that this pair becomes symmetrical.
# NOTE(review): this edits df in place, so the hand-made link is still present when
# set_symmetry runs and when df is saved -- confirm that this is intended.
df.loc[df[df['entry_id'] == 'E2_19801'].index, 'E1_match'] = 'E1_10500'

In [11]:
df[df['entry_id'].isin(['E1_10500', 'E2_19801'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
10499,E1_10500,Böszörmény,"Böszörmény [ bössörmenj ], stad i mellersta Un...",1,E1,,E2_19801,,,
137273,E2_19801,Böszörmény,"Böszörmény [ bösörmenj ], Hajdu - Böször - mén...",1,E2,E1_10500,,,,


In [12]:
# Example of a conflict (see the output below): E1_1268 and E1_1273 both point to
# E2_2269, but E2_2269 only points back to E1_1273.
df[df['entry_id'].isin(['E2_2269', 'E1_1273', 'E1_1268'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
1267,E1_1268,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E1,,E2_2269,,,
1272,E1_1273,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E1,,E2_2269,,,
119741,E2_2269,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E2,E1_1273,,,,


In [13]:
# Example of a many-to-one conflict (see the output below): three E4 entries point to
# E3_8788, which only points back to E4_30596.
df[df['entry_id'].isin(['E4_30597', 'E4_30596', 'E4_30598', 'E3_8788'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
311323,E3_8788,Fällforsen,"Fällforsen, vattenfall i nedre Ume älv, 12 km ...",1,E3,,,,E4_30596,
359595,E4_30596,Fällforsen,"Fällforsen, fall i Umeälv 22 km ovanför Vindel...",1,E4,,,E3_8788,,
359596,E4_30597,Fällforsen,"Fällforsen, fall i Piteälv 14 km n. v. om Älvs...",1,E4,,,E3_8788,,
359597,E4_30598,Fällforsen,"Fällforsen, fall i Byskeälv, mitt emellan n. s...",1,E4,,,E3_8788,,


In [14]:
# Example of a one-way link (see the output below): E1_2714 points to E2_4841,
# which has no E1 link back. The numbers are positional row offsets (iloc), not entry ids.
df.iloc[[2713, 122313]]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
2713,E1_2714,Angola,"Angola, portugisisk besittning på Afrikas vest...",1,E1,,E2_4841,,,
122313,E2_4841,Angola,"Angola [ - gå - ], portugisiskt generalguverne...",1,E2,,,,,


In [None]:
def set_symmetry(df, ed1, ed2):
    concord_m, no_m = 0, 0
    matching_ed1 = ed1 + '_match'
    matching_ed2 = ed2 + '_match'
    for i in tqdm(range(len(df))):
        if df.iloc[i]['edition'] != ed1:
            continue
        entry = df.iloc[i]['entry_id']
        matching_entry = df.iloc[i][matching_ed2]
        if matching_entry:  # We have a matching entry
            # print(i, df.iloc[i]['entry_id'], matching_entry)
            if df[(df['edition'] == ed2) &
                  (df['entry_id'] == matching_entry) &
                    (df[matching_ed1] == entry)].any().any(): # Case 1: We have a symmetry
                concord_m += 1
                # print(df[df['entry_id'] == matching_entry])
            else:  # Case 2: The relation is not symmetrical. We remove it
                no_m += 1
                df.loc[df[df['entry_id'] == entry].index,
                       matching_ed2] = ''
                # print(i, df.iloc[i]['entry_id'], matching_entry)
                # print(df[df['entry_id'] == matching_entry].index)
                # break
    return concord_m, no_m


In [16]:
set_symmetry(df, 'E1', 'E2')

100%|██████████| 418221/418221 [13:38<00:00, 510.75it/s]  


(25915, 184)

In [17]:
df[df['entry_id'].isin(['E2_2269', 'E1_1273', 'E1_1268'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
1267,E1_1268,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E1,,,,,
1272,E1_1273,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E1,,E2_2269,,,
119741,E2_2269,Akyab,"Akyab, stad i Indien. Se Akjab.",1,E2,E1_1273,,,,


In [18]:
df.iloc[[2713, 122313]]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
2713,E1_2714,Angola,"Angola, portugisisk besittning på Afrikas vest...",1,E1,,,,,
122313,E2_4841,Angola,"Angola [ - gå - ], portugisiskt generalguverne...",1,E2,,,,,


In [19]:
# Checkpoint: at this point only the E1 -> E2 links have been made symmetrical.
# The same file is written again at the end of the notebook.
df.to_json('nf_sym.json')

In [20]:
# Enforce symmetry for every remaining ordered pair of editions and print the
# (kept, removed) counts of each pass. The pair ('E1', 'E2') is skipped because
# it has already been processed in its own cell above.
EDITIONS = ['E1', 'E2', 'E3', 'E4']
for ed1 in EDITIONS:
    for ed2 in EDITIONS:
        if ed1 != ed2 and (ed1, ed2) != ('E1', 'E2'):
            print(set_symmetry(df, ed1, ed2))

100%|██████████| 418221/418221 [01:09<00:00, 5975.86it/s] 


(1998, 41)


100%|██████████| 418221/418221 [03:33<00:00, 1954.31it/s] 


(6555, 112)


100%|██████████| 418221/418221 [13:30<00:00, 516.11it/s]  


(25915, 208)


100%|██████████| 418221/418221 [01:47<00:00, 3873.43it/s] 


(3203, 57)


100%|██████████| 418221/418221 [04:56<00:00, 1411.10it/s] 


(9245, 144)


100%|██████████| 418221/418221 [01:07<00:00, 6188.56it/s] 


(1998, 62)


100%|██████████| 418221/418221 [01:46<00:00, 3927.12it/s] 


(3203, 83)


100%|██████████| 418221/418221 [02:10<00:00, 3210.68it/s] 


(3952, 75)


100%|██████████| 418221/418221 [03:33<00:00, 1959.01it/s] 


(6555, 157)


100%|██████████| 418221/418221 [04:54<00:00, 1419.04it/s] 


(9245, 155)


100%|██████████| 418221/418221 [02:09<00:00, 3223.68it/s] 


(3952, 43)

In [22]:
df[df['entry_id'].isin(['E4_30597', 'E4_30596', 'E4_30598', 'E3_8788'])]

Unnamed: 0,entry_id,headword,definition,type,edition,E1_match,E2_match,E3_match,E4_match,QID
311323,E3_8788,Fällforsen,"Fällforsen, vattenfall i nedre Ume älv, 12 km ...",1,E3,,,,E4_30596,
359595,E4_30596,Fällforsen,"Fällforsen, fall i Umeälv 22 km ovanför Vindel...",1,E4,,,E3_8788,,
359596,E4_30597,Fällforsen,"Fällforsen, fall i Piteälv 14 km n. v. om Älvs...",1,E4,,,,,
359597,E4_30598,Fällforsen,"Fällforsen, fall i Byskeälv, mitt emellan n. s...",1,E4,,,,,


In [21]:
# Final save, once all twelve ordered edition pairs have been processed.
# This overwrites the checkpoint written after the E1 -> E2 pass.
df.to_json('nf_sym.json')

First, failed attempt at writing the symmetry program. It is kept below as a string for reference only and is not executed.

In [None]:
"""
def set_symmetry_old(df, ed1, ed2):
    concord_m, discord_m, no_m = 0, 0, 0
    matching_ed1 = ed1 + '_match'
    matching_ed2 = ed2 + '_match'
    for i in tqdm(range(len(df))):
        if df.iloc[i]['edition'] != ed1:
            continue
        entry = df.iloc[i]['entry_id']
        matching_entry = df.iloc[i][matching_ed2]
        if matching_entry:  # We have a matching entry
            # print(i, df.iloc[i]['entry_id'], matching_entry)
            if df[(df['edition'] == ed2) &
                  (df['entry_id'] == matching_entry) &
                    (df[matching_ed1] == entry)].any().any():  # Case 1: We have a symmetry
                concord_m += 1
                # print(df[df['entry_id'] == matching_entry])
            elif df[(df['edition'] == ed2) &
                    (df['entry_id'] == matching_entry) &
                    (df[matching_ed1] == '')].any().any():  # Case 2: We have a unique relation
                no_m += 1
                df.loc[df[df['entry_id'] == matching_entry].index,
                       matching_ed1] = entry
                # print(i, df.iloc[i]['entry_id'], matching_entry)
                # print(df[df['entry_id'] == matching_entry].index)
                # break
            else:  # Case 3: We have two nonsymmetrical relations. We remove them.
                # This will create one-way relations.
                # We need to run again the loop (twp-pass procedure)
                discord_m += 1
                df.loc[df[df['entry_id'] == entry].index,
                       matching_ed2] = ''
                df.loc[df[df['entry_id'] == matching_entry].index,
                       matching_ed1] = ''
                # print(i, df.iloc[i]['entry_id'], matching_entry)
                # print(df[df['entry_id'] == matching_entry].index)
                # break
    return concord_m, discord_m, no_m
"""