In [101]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score
from statsmodels.stats.inter_rater import fleiss_kappa
import pandas as pd
import numpy as np
import warnings
import re
import sys
import os
sys.path.append(os.path.abspath("../.."))
from tools import fleiss_pivot

from IPython.display import display_markdown

In [102]:
def to_bool(string:str) -> bool:
    """Coerce a raw CSV cell into a boolean label.

    The value is stringified, stripped and case-folded, then interpreted as:

    * missing markers (``nan``, ``none``, ``<na>``) and the empty string -> False
    * ``true`` / ``yes`` / ``1`` -> True, ``false`` / ``no`` / ``0`` -> False
    * anything that parses as a number -> True unless it is zero (or NaN)
    * any other non-empty text (e.g. a company name) -> True

    Args:
        string: The cell value. Despite the annotation, any object is accepted
            (floats, bools, None, ...) because it is passed through ``str()``.

    Returns:
        The boolean interpretation of the cell.
    """
    text = str(string).strip().casefold()

    # Missing values: float NaN stringifies to 'nan', None to 'none' and
    # pandas' NA scalar to '<NA>'. A missing rating counts as "not labelled".
    if text in ('', 'nan', 'none', '<na>'):
        return False
    if text in ('true', 'yes', '1'):
        return True
    if text in ('false', 'no', '0'):
        return False

    # A 0/1 column that contains blanks is loaded as float, so its cells
    # stringify to '0.0' / '1.0' and miss the token lists above. Without this
    # step '0.0' would fall through to the "non-empty text" rule and become True.
    try:
        number = float(text)
    except ValueError:
        # Not numeric: free text, where any non-empty value counts as True.
        return bool(text)
    # `number != number` is only true for NaN spellings such as '-nan'.
    return not (number == 0.0 or number != number)

In [103]:
#os.chdir('./classify/irr') # sometimes necessary to get the right path

FileNotFoundError: [Errno 2] No such file or directory: './classify/irr'

In [114]:
# Rater file pairs that can be compared. Exactly one pair is active at a time;
# swap the commented-out lines to compare a different pair.
# IRR_FILES = ['./train_set_human/leon.csv', './train_set_human/lilli.csv']
# IRR_FILES = ['./test_set_human/alex.csv', './test_set_human/caitlyn.csv']
# IRR_FILES = ['./test_set_gpt/leon.csv', './test_set_gpt/decilm-instruct.csv']
IRR_FILES = ['./test_set_gpt/caitlyn.csv', './test_set_gpt/decilm.csv']

# Columns that together identify one article; they become the row index.
KEY_COLUMNS = ['Date','Publication','Headline',]

# Create empty key dataframe
# NOTE(review): not read by any cell visible here; kept in case other code relies on it.
key = pd.DataFrame(columns=KEY_COLUMNS)

# Maps rater name (file name up to the first '.') -> that rater's label frame.
raters = dict()
for path in IRR_FILES:
    df = pd.read_csv(path)
    rater = os.path.basename(path).split('.')[0]

    # Drop all unnamed columns
    df = df.drop(columns=[col for col in df.columns if str(col).startswith('Unnamed: ')])

    # TODO: depending on our format, we will need to cast certain columns to bool
    # For now, convert all columns to bool and assume we match companies correctly
    for col in df.columns:
        if col in KEY_COLUMNS:
            continue
        df[col] = df[col].apply(to_bool)

    # Index by article key (returns a new frame instead of mutating in place).
    df = df.set_index(KEY_COLUMNS)

    # A key that occurs twice would make `.loc[all_keys]` below return extra
    # rows for this rater only, so the raters' frames would no longer line up
    # row-for-row. Keep the first occurrence and report how many were dropped.
    duplicate_rows = df.index.duplicated(keep='first')
    if duplicate_rows.any():
        warnings.warn(f'{rater}: dropped {int(duplicate_rows.sum())} row(s) with a duplicate {KEY_COLUMNS} key')
        df = df[~duplicate_rows]

    # Save
    raters[rater] = df

# Check what columns each DF has in common
LABELS = set(raters[list(raters.keys())[0]].columns)
for rater in raters:
    LABELS = LABELS.intersection(raters[rater].columns)

# Truncate all dataframes to the keys every rater has, in one shared row order
all_keys = raters[list(raters.keys())[0]].index
for rater in raters:
    all_keys = all_keys.intersection(raters[rater].index)
for rater in raters:
    raters[rater] = raters[rater].loc[all_keys]

print(f'Found {len(all_keys)} mutual rows and {len(LABELS)} labels {LABELS}')

Found 200 mutual rows and 2 labels {'CompanyMentioned', 'BreachMentioned'}


In [115]:
# Two-rater comparison on a single column. The column may take two OR MORE
# distinct values, ex: "Category": 'A' or 'B' or 'C'
REPORT_COLUMN = 'CompanyMentioned'

# The first two loaded raters are compared; the first one is passed to
# classification_report in the "true labels" position.
rater_names = list(raters)
rater1 = rater_names[0]
rater2 = rater_names[1]

first_ratings = raters[rater1][REPORT_COLUMN]
second_ratings = raters[rater2][REPORT_COLUMN]

# Human-readable report for the cell output ...
print(classification_report(first_ratings, second_ratings))

# ... and the same report as a dict, which a later cell reads.
category_report = classification_report(first_ratings, second_ratings, output_dict=True)

# Agreement between the two raters on this column.
pivot_table = fleiss_pivot([raters[rater1], raters[rater2]], REPORT_COLUMN)
category_irr = fleiss_kappa(pivot_table)

print("Fleiss:", category_irr)

              precision    recall  f1-score   support

       False       0.86      0.83      0.85       143
        True       0.61      0.67      0.64        57

    accuracy                           0.79       200
   macro avg       0.74      0.75      0.74       200
weighted avg       0.79      0.79      0.79       200

Fleiss: 0.4856305511528455


In [116]:
# Create a markdown table for every label
rater1 = list(raters.keys())[0]
rater2 = list(raters.keys())[1]

# Per-label scores, collected so a later cell can average them.
f1_scores = list()
irr_scores = list()
md = '| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |\n|---|---|---|---|---|---|---|---|\n'

# LABELS is a set, so plain iteration order can differ between kernel restarts.
# Sorting keeps the table rows (and the order of the score lists) reproducible.
for label in sorted(LABELS):
    if label=='Category':
        continue

    first_ratings = raters[rater1][label]
    second_ratings = raters[rater2][label]

    # Warnings raised while scoring (e.g. for degenerate labels) are silenced on
    # purpose so they do not flood the cell output.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        irr = fleiss_kappa(fleiss_pivot([
            raters[rater1],
            raters[rater2],
        ], label))
        # The support returned here is discarded; see the replacement below.
        p, r, f, _ = precision_recall_fscore_support(first_ratings.fillna(False), second_ratings.fillna(False), average='binary')

    # "Support" column: the larger of the two raters' positive counts.
    s = int(max(first_ratings.sum(), second_ratings.sum()))

    agree    = (first_ratings == second_ratings).sum()
    disagree = (first_ratings != second_ratings).sum()

    p, r, f = float(p), float(r), float(f)
    md += f'| {label} | {irr:.2f} | {p:.2f} | {r:.2f} | {f:.2f} | {s} | {agree} | {disagree} |\n'
    f1_scores.append(f)
    irr_scores.append(irr)

display_markdown(md, raw=True)
print('Average F1:', np.nanmean(f1_scores), sep=' ')
print('Average IRR:', np.nanmean(irr_scores), sep=' ')

| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |
|---|---|---|---|---|---|---|---|
| CompanyMentioned | 0.49 | 0.61 | 0.67 | 0.64 | 62 | 157 | 43 |
| BreachMentioned | 0.32 | 0.28 | 0.94 | 0.43 | 53 | 161 | 39 |


Average F1: 0.536719035440263
Average IRR: 0.40129419321730014


In [117]:
# Calculate overall average F1 and average IRR
category_f1 = category_report['macro avg']['f1-score']
labels_f1 = np.nanmean(f1_scores)
labels_irr = np.nanmean(irr_scores)

# Weight of the category score: the number of distinct values seen in REPORT_COLUMN.
ncat = max(raters[rater1][REPORT_COLUMN].nunique(), raters[rater2][REPORT_COLUMN].nunique())

# Weight of the label averages: the number of labels that were actually scored.
# len(LABELS) would over-count whenever 'Category' is among the shared columns,
# because the per-label table skips it.
nlabels = len(f1_scores)

print('Category F1:', category_f1, sep=' ')
print('Category IRR:', category_irr, sep=' ')
print('Labels F1:', labels_f1, sep=' ')
print('Labels IRR:', labels_irr, sep=' ')
print('Overall F1:', (ncat * category_f1 + nlabels * labels_f1) / (ncat + nlabels), sep=' ')
print('Overall IRR:', (ncat * category_irr + nlabels * labels_irr) / (ncat + nlabels), sep=' ')

Category F1: 0.7428152755764228
Category IRR: 0.4856305511528455
Labels F1: 0.536719035440263
Labels IRR: 0.40129419321730014
Overall F1: 0.6397671555083428
Overall IRR: 0.4434623721850728


In [118]:
# Where do we disagree? Print them all out!
from IPython.display import HTML

# Side-by-side frame: every shared column appears once per rater, suffixed with
# the rater's name.
joined_df = raters[rater1].join(raters[rater2], lsuffix=f'_{rater1}', rsuffix=f'_{rater2}')

# Positional mask of rows where the raters differ on at least one label.
# A mask (rather than a set of index keys) keeps the rows in their original
# order, so the listing is the same on every run.
disagree_mask = np.zeros(len(joined_df), dtype=bool)
for label in LABELS:
    if label=='Category':
        continue

    disagree_mask |= (joined_df[f'{label}_{rater1}'] != joined_df[f'{label}_{rater2}']).to_numpy()

# Show all the disagreements. Only the headline is kept as context; a URL column
# may be absent, present for one rater ('URL') or for both (suffixed), so every
# variant is dropped if it exists instead of raising KeyError.
hidden_columns = ['Date', 'Publication', 'URL', f'URL_{rater1}', f'URL_{rater2}']
disagree_df = joined_df[disagree_mask].reset_index().drop(columns=hidden_columns, errors='ignore')

HTML(disagree_df.to_html(index=False))

Headline,BreachMentioned_caitlyn,CompanyMentioned_caitlyn,BreachMentioned_decilm,CompanyMentioned_decilm,Unnamed: 5
US officials 'confident' Russians hacked DNC :0,True,True,False,False,
Officials to Probe Leak of Bin Laden Video,False,False,True,False,
Russia's Novaya Gazeta Web site hacked,False,True,True,True,
Beehive raises record-breaking $10 million,False,False,False,True,
What? DEA kept secret phone records for years,False,True,False,False,
Twitter blocks government 'spy centers' from accessing user data,False,True,True,True,
"TSA intercepted a record number of guns, plus these bizarre items, in 2014",False,True,False,False,
"Your phone number is all a hacker needs to read texts, listen to calls and track you",False,False,True,False,
"Personal data of a billion Indians sold online for £6, report claims",True,False,True,True,
The Data-Driven Life,False,False,False,True,
