# Compare annotation results

### Import required libraries

In [1]:
import pandas as pd

#### Load and standardize data

In [7]:
# Paths of input files (can be modified)
first_path_manual_annotation = "../../data/interim/annotated_articles/manual/2023/2023_manual_annotation_1.csv"
second_path_manual_annotation = "../../data/interim/annotated_articles/manual/2023/2023_manual_annotation_2.csv"
path_automatic_annotation = "../../data/interim/annotated_articles/automated_combined/by_year/2023_articles_automated_annotated.csv"

# Load the data
df_one_manual_annotation = pd.read_csv(first_path_manual_annotation, sep=';')
df_two_manual_annotation = pd.read_csv(second_path_manual_annotation, sep=';')
df_automatic_annotation = pd.read_csv(path_automatic_annotation, sep=';')

# Make sure that nor articles from bbc are included
df_one_manual_annotation = df_one_manual_annotation.loc[df_one_manual_annotation['Source'] != 'bbc']
df_manuel = df_two_manual_annotation.loc[df_two_manual_annotation['Source'] != 'bbc']

# Standardize data
df_one_manual_annotation['Category'] = df_one_manual_annotation['Category'].str.replace('&', 'and').str.lower()
df_two_manual_annotation['Category'] = df_two_manual_annotation['Category'].str.replace('&', 'and').str.lower()
df_automatic_annotation['Category'] = df_automatic_annotation['Category'].str.replace('&', 'and').str.lower()

### Get accuracy between two manual annotators

In [8]:
# Merge the dataframes on index
df_merged = pd.merge(df_one_manual_annotation, df_two_manual_annotation, left_index=True, right_index=True, suffixes=('_one', '_two'))
# Compare the 'Category' columns
matches = df_merged['Category_one'] == df_merged['Category_two']

# Calculate the accuracy
accuracy = matches.mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 80.10%


### Get accuracy between manual and automatic annotation (for the ones the manual annotators agreed on)

In [9]:
# Merge df_one_manual_annotation and df_two_manual_annotation on 'Url' and filter rows where 'Category' is the same
df_manuals_merged = pd.merge(df_one_manual_annotation, df_two_manual_annotation, on='Url', suffixes=('_one', '_two'))
df_manuals_same_category = df_manuals_merged[df_manuals_merged['Category_one'] == df_manuals_merged['Category_two']]

# Merge df_manuals_same_category with df_automatic_annotation on 'Url'
df_merged = pd.merge(df_manuals_same_category, df_automatic_annotation, on='Url')

# Compare the 'Category' columns
matches = df_merged['Category_one'] == df_merged['Category']

# Calculate the accuracy
accuracy = matches.mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 81.23%


### Get accuracy between manual and automatic annotation (correct if at least one manual annotator agrees)

In [10]:
# Merge df_one_manual_annotation and df_two_manual_annotation on 'Url'
df_manuals_merged = pd.merge(df_one_manual_annotation, df_two_manual_annotation, on='Url', suffixes=('_one', '_two'))

# Merge df_manuals_merged with df_automatic_annotation on 'Url'
df_merged = pd.merge(df_manuals_merged, df_automatic_annotation, on='Url')

# Create a mask where 'Category' of df_automatic_annotation matches either 'Category' of df_one_manual_annotation or df_two_manual_annotation
matches = (df_merged['Category'] == df_merged['Category_one']) | (df_merged['Category'] == df_merged['Category_two'])

# Calculate the accuracy
accuracy = matches.mean()

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 80.03%
