# Match and analyse both annotated datasets

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize


In [2]:
# Read both annotators' references
from re import split


# One CSV export per annotator, each loaded into its own frame
df_a = pd.read_csv(filepath_or_buffer='annotator-a.csv')
df_b = pd.read_csv(filepath_or_buffer='annotator-b.csv')

# Clean the text, sentence, and identifier columns
def clean(df):
    """
    Normalise the free-text and identifier columns of an annotation frame.

    The frame is modified in place and the same object is returned.

    - ``text`` and ``sentence``: leading/trailing whitespace is dropped and
      every run of whitespace (including newlines and carriage returns) is
      collapsed into a single space.
    - ``external_identifier``: surrounding whitespace, the
      officielebekendmakingen.nl URL prefix and every ``kst-`` marker are
      removed.

    Cells that are not strings (e.g. NaN) are left untouched.
    """
    url_prefix = 'https://zoek.officielebekendmakingen.nl/'

    def collapse_whitespace(value):
        if not isinstance(value, str):
            return value
        # split() without a separator trims the ends and treats \n, \r and
        # repeated blanks alike, so one split/join pass does all the work.
        return ' '.join(value.split())

    def shorten_identifier(value):
        if not isinstance(value, str):
            return value
        return value.strip().replace(url_prefix, '').replace('kst-', '')

    for column in ('text', 'sentence'):
        df[column] = df[column].apply(collapse_whitespace)
    df['external_identifier'] = df['external_identifier'].apply(shorten_identifier)

    return df

# Normalise both frames, then show their (rows, columns) as a quick sanity check
df_a, df_b = clean(df_a), clean(df_b)

print(df_a.shape, df_b.shape)

(250, 15) (256, 15)


In [3]:
def panoptic_overlap_match(text1, text2, threshold=0.5):
    """
    Decide whether two texts share most of their distinct tokens.

    Both texts are tokenised with ``word_tokenize`` and reduced to the sets of
    their distinct tokens, A and B (no case folding is applied here, and
    repeated tokens count once). The texts match when the overlap exceeds the
    given fraction of BOTH sets:

        |A ∩ B| > threshold * |A|  and  |A ∩ B| > threshold * |B|

    Parameters
    ----------
    text1, text2 : str
        The two texts to compare.
    threshold : float, default 0.5
        Fraction of each token set that the intersection must strictly
        exceed. The default reproduces the original fixed 0.5 criterion.

    Returns
    -------
    bool
        True if the criterion above holds; False otherwise, including when
        either text yields no tokens.
    """
    tokens1 = set(word_tokenize(text1))
    tokens2 = set(word_tokenize(text2))
    # An empty side can never satisfy a strict "more than a fraction" overlap.
    if not tokens1 or not tokens2:
        return False
    shared = len(tokens1 & tokens2)
    return shared > threshold * len(tokens1) and shared > threshold * len(tokens2)


In [4]:
# Pair each annotation of annotator B with the FIRST annotation of annotator A
# from the same minute whose sentence and reference text both pass
# panoptic_overlap_match.
#
# NOTE(review): the matching is greedy and an annotation of A is not removed
# once it has been paired, so one A row can end up paired with several B rows.
# Confirm whether a strict one-to-one matching is wanted before relying on the
# agreement figures computed from these pairs.
pair_columns = [
    'minute_id',
    'reference_type_a', 'document_type_a', 'text_a', 'sentence_a', 'identifier_a',
    'reference_type_b', 'document_type_b', 'text_b', 'sentence_b', 'identifier_b',
]

df_a_by_minute = df_a.groupby('minute_id')

# Collect plain rows and build the frame once at the end instead of growing a
# DataFrame one row at a time.
pair_records = []
for _, anno_b in df_b.iterrows():
    minute_id = anno_b['minute_id']

    # Annotator B may have annotated a minute that annotator A has no rows for;
    # get_group() would raise KeyError, and there is nothing to match anyway.
    if minute_id not in df_a_by_minute.groups:
        continue

    # Without a sentence and a reference text (e.g. NaN cells) no overlap can
    # be computed for this annotation.
    if not isinstance(anno_b['sentence'], str) or not isinstance(anno_b['text'], str):
        continue

    # Remove any newlines and surrounding spaces before comparing
    sentence_b = anno_b['sentence'].replace('\n', '').strip()
    text_b = anno_b['text'].replace('\n', '').strip()

    for _, anno_a in df_a_by_minute.get_group(minute_id).iterrows():
        if not isinstance(anno_a['sentence'], str) or not isinstance(anno_a['text'], str):
            continue

        sentence_a = anno_a['sentence'].replace('\n', '').strip()
        text_a = anno_a['text'].replace('\n', '').strip()

        # Both the surrounding sentence and the reference text must overlap
        if (panoptic_overlap_match(sentence_a, sentence_b)
                and panoptic_overlap_match(text_a, text_b)):
            pair_records.append([
                minute_id,
                anno_a['reference_type'], anno_a['document_type'], text_a, sentence_a, anno_a['external_identifier'],
                anno_b['reference_type'], anno_b['document_type'], text_b, sentence_b, anno_b['external_identifier'],
            ])
            # Only the first matching A annotation is kept for this B annotation
            break

matched_pairs = pd.DataFrame(pair_records, columns=pair_columns)

# Save the matched pairs to a CSV file
matched_pairs.to_csv('matched_pairs.csv', index=False)
print(f'Matched pairs saved to matched_pairs.csv with {matched_pairs.shape[0]} entries.')

Matched pairs saved to matched_pairs.csv with 191 entries.


In [49]:
# Inter-annotator agreement (Cohen's kappa) on the matched pairs, computed for
# the document type, the reference type and the identifier columns
from sklearn.metrics import cohen_kappa_score

kappa_by_field = {
    field: cohen_kappa_score(
        matched_pairs[f'{field}_a'].tolist(),
        matched_pairs[f'{field}_b'].tolist(),
    )
    for field in ('document_type', 'reference_type', 'identifier')
}

# Expose each score under its own name as well
kappa_document_type = kappa_by_field['document_type']
kappa_reference_type = kappa_by_field['reference_type']
kappa_identifier = kappa_by_field['identifier']

for field, kappa in kappa_by_field.items():
    print(f"Cohen's Kappa for {field}: {kappa:.4f}")

Cohen's Kappa for document_type: 0.8894
Cohen's Kappa for reference_type: 0.8717
Cohen's Kappa for identifier: 0.5863
