# Matching Symmetries

The matches are not symmetrical. If we have:
 * (a, b) and (b, a), correct
 * (a, b) and (b, c), incorrect, we remove the relations as we do not know which one is correct
 * (a, b) and no (b, a), we remove (a, b)

Pierre Nugues

In [None]:
import pandas as pd
import json
from tqdm import tqdm

In [None]:
# pd.set_option('display.max_colwidth', None)

In [None]:
# One-off download of the dataset from the Hugging Face hub, saved as JSON.
# Disabled: the statements are wrapped in a string literal and do not run.
# NOTE(review): this writes to '~/nf.json' while the loading cell reads
# 'nf.json' from the working directory -- confirm the intended path.
"""df = pd.read_json(
    "hf://datasets/albinandersson/nf-headword-linked/data.jsonl", lines=True)
df.to_json('~/nf.json')
    """

In [None]:
# Load the local JSON copy of the dataset into a DataFrame.
df = pd.read_json('nf.json')

In [None]:
# Preview the first ten rows.
df.head(10)

We establish symmetries in E1 --> E2:

In [None]:
# E2 match stored on the row at position 1.
df.iloc[1]['E2_match']

In [None]:
# Show the rows of the two entries E1_10500 and E2_19801.
df[df['entry_id'].isin(['E1_10500', 'E2_19801'])]

In [None]:
# E2 match stored on the row at position 10499
# (presumably the row of E1_10500 -- TODO confirm).
df.iloc[10499]['E2_match']

In [None]:
# Index label(s) of the row(s) whose entry id is E2_19801.
df[df['entry_id'] == 'E2_19801'].index

In [None]:
# Manual edit: set the E1_match cell of entry E2_19801 to 'E1_10500'.
df.loc[df[df['entry_id'] == 'E2_19801'].index, 'E1_match'] = 'E1_10500'

In [None]:
# Show the rows of E1_10500 and E2_19801 again to check their match cells.
df[df['entry_id'].isin(['E1_10500', 'E2_19801'])]

In [None]:
# Show the rows of entries E2_2269, E1_1273 and E1_1268.
df[df['entry_id'].isin(['E2_2269', 'E1_1273', 'E1_1268'])]

In [None]:
# Show the rows of entries E4_30597, E4_30596, E4_30598 and E3_8788.
df[df['entry_id'].isin(['E4_30597', 'E4_30596', 'E4_30598', 'E3_8788'])]

In [None]:
# Show the rows at positions 2713 and 122313.
df.iloc[[2713, 122313]]

In [None]:
def set_symmetry(df, ed1, ed2):
    """Remove the one-way ``ed1`` -> ``ed2`` matches from *df*, in place.

    For every row of edition *ed1* whose ``<ed2>_match`` cell is non-empty,
    the match is kept only if some row of edition *ed2* carrying the matched
    entry id points back to the entry through its ``<ed1>_match`` cell.
    Otherwise the ``<ed2>_match`` cell is reset to ``''`` on every row that
    carries the entry's id.

    Only the ``ed1`` -> ``ed2`` direction is cleaned; call the function again
    with the editions swapped to clean the opposite direction.

    The table is scanned once: an ``entry_id`` -> row positions index replaces
    the full-frame boolean masks that were previously evaluated for each row.

    Args:
        df: Table with the columns ``'edition'``, ``'entry_id'``,
            ``ed1 + '_match'`` and ``ed2 + '_match'``. Entry ids and match
            values must be hashable (they are strings in this notebook).
        ed1: Edition label of the rows whose outgoing matches are checked.
        ed2: Edition label of the rows the matches point to.

    Returns:
        A pair ``(concord_m, no_m)``: the number of symmetrical matches that
        were kept and the number of one-way matches that were removed.
    """
    matching_ed1 = ed1 + '_match'
    matching_ed2 = ed2 + '_match'

    # Work on plain Python lists: per-row DataFrame access is very slow.
    editions = df['edition'].tolist()
    entry_ids = df['entry_id'].tolist()
    forward = df[matching_ed2].tolist()
    # If both names designate the same column, share the list so that cells
    # cleared during the pass are seen by the later back-pointer checks.
    if matching_ed1 == matching_ed2:
        backward = forward
    else:
        backward = df[matching_ed1].tolist()

    # Index built once: entry id -> positions of the rows carrying it.
    rows_of = {}
    for pos, entry_id in enumerate(entry_ids):
        rows_of.setdefault(entry_id, []).append(pos)

    concord_m, no_m = 0, 0
    cleared = []  # Positions whose ed2 match must be blanked in df
    for pos, edition in enumerate(tqdm(editions)):
        if edition != ed1:
            continue
        entry = entry_ids[pos]
        # Read the live value: an earlier row with the same id may have
        # cleared it already.
        matching_entry = forward[pos]
        if not matching_entry:  # No matching entry
            continue
        if any(editions[j] == ed2 and backward[j] == entry
               for j in rows_of.get(matching_entry, ())):
            # Case 1: We have a symmetry
            concord_m += 1
        else:
            # Case 2: The relation is not symmetrical. We remove it
            no_m += 1
            for j in rows_of.get(entry, ()):
                forward[j] = ''
                cleared.append(j)

    if cleared:
        # Single positional write-back of all the removed relations.
        df.iloc[cleared, df.columns.get_loc(matching_ed2)] = ''
    return concord_m, no_m


In [None]:
# First pass: clean the E1 -> E2 matches; the returned counts are the cell output.
set_symmetry(df, 'E1', 'E2')

In [None]:
# Show the rows of E2_2269, E1_1273 and E1_1268 after the E1 -> E2 pass.
df[df['entry_id'].isin(['E2_2269', 'E1_1273', 'E1_1268'])]

In [None]:
# Show the rows at positions 2713 and 122313 after the E1 -> E2 pass.
df.iloc[[2713, 122313]]

In [None]:
# Save the table as it stands after the E1 -> E2 pass.
df.to_json('nf_sym.json')

In [None]:
# Run the remaining ordered pairs of editions (E1 -> E2 is handled by a
# separate call). Each call modifies df in place and returns the counts
# (symmetrical matches kept, one-way matches removed).
print(set_symmetry(df, 'E1', 'E3'))
print(set_symmetry(df, 'E1', 'E4'))
print(set_symmetry(df, 'E2', 'E1'))
print(set_symmetry(df, 'E2', 'E3'))
print(set_symmetry(df, 'E2', 'E4'))
print(set_symmetry(df, 'E3', 'E1'))
print(set_symmetry(df, 'E3', 'E2'))
print(set_symmetry(df, 'E3', 'E4'))
print(set_symmetry(df, 'E4', 'E1'))
print(set_symmetry(df, 'E4', 'E2'))
# Not printed: left as the last expression of the cell.
set_symmetry(df, 'E4', 'E3')

In [None]:
# Show the rows of E4_30597, E4_30596, E4_30598 and E3_8788 after the passes.
df[df['entry_id'].isin(['E4_30597', 'E4_30596', 'E4_30598', 'E3_8788'])]

In [None]:
# Save the table again under the same file name, now with all passes applied.
df.to_json('nf_sym.json')

First failed attempt to write the symmetry program

In [None]:
# Earlier version kept for reference only: the whole definition is wrapped in
# a string literal, so set_symmetry_old is never defined.
"""
def set_symmetry_old(df, ed1, ed2):
    concord_m, discord_m, no_m = 0, 0, 0
    matching_ed1 = ed1 + '_match'
    matching_ed2 = ed2 + '_match'
    for i in tqdm(range(len(df))):
        if df.iloc[i]['edition'] != ed1:
            continue
        entry = df.iloc[i]['entry_id']
        matching_entry = df.iloc[i][matching_ed2]
        if matching_entry:  # We have a matching entry
            # print(i, df.iloc[i]['entry_id'], matching_entry)
            if df[(df['edition'] == ed2) &
                  (df['entry_id'] == matching_entry) &
                    (df[matching_ed1] == entry)].any().any():  # Case 1: We have a symmetry
                concord_m += 1
                # print(df[df['entry_id'] == matching_entry])
            elif df[(df['edition'] == ed2) &
                    (df['entry_id'] == matching_entry) &
                    (df[matching_ed1] == '')].any().any():  # Case 2: We have a unique relation
                no_m += 1
                df.loc[df[df['entry_id'] == matching_entry].index,
                       matching_ed1] = entry
                # print(i, df.iloc[i]['entry_id'], matching_entry)
                # print(df[df['entry_id'] == matching_entry].index)
                # break
            else:  # Case 3: We have two nonsymmetrical relations. We remove them.
                # This will create one-way relations.
                # We need to run again the loop (twp-pass procedure)
                discord_m += 1
                df.loc[df[df['entry_id'] == entry].index,
                       matching_ed2] = ''
                df.loc[df[df['entry_id'] == matching_entry].index,
                       matching_ed1] = ''
                # print(i, df.iloc[i]['entry_id'], matching_entry)
                # print(df[df['entry_id'] == matching_entry].index)
                # break
    return concord_m, discord_m, no_m
"""