In [2]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score
from statsmodels.stats.inter_rater import fleiss_kappa
import pandas as pd
import numpy as np
import warnings
import re
import sys
import os
sys.path.append(os.path.abspath("../.."))
from tools import fleiss_pivot

from IPython.display import display_markdown

In [3]:
def to_bool(string:str) -> bool:
    string = str(string).strip().casefold()
    if string == 'nan':
        return False
    if string in ('true', 'yes', '1'):
        return True
    if string in ('false', 'no', '0'):
        return False
    
    # some random stackoverflow said not not is faster than bool()
    return not not string 

In [4]:
os.chdir('./classify/irr') # sometimes necessary to get the right path

FileNotFoundError: [Errno 2] No such file or directory: './classify/irr'

In [5]:
IRR_FILES = ['./train_set_human/leon.csv', './train_set_human/lilli.csv']
IRR_FILES = ['./test_set_human/alex.csv', './test_set_human/caitlyn.csv']
IRR_FILES = ['./test_set_gpt/caitlyn.csv', './test_set_gpt/decilm.csv']
KEY_COLUMNS = ['Date','Publication','Headline','URL']
LABELS = set()

# Create empty key dataframe
key = pd.DataFrame(columns=KEY_COLUMNS)
raters = dict()
for path in IRR_FILES:
    df = pd.read_csv(path)
    rater = os.path.basename(path).split('.')[0]
    
     # Drop all unnamed columns
    df = df.drop(columns=[col for col in df.columns if col.startswith('Unnamed: ')])
    
    # TODO: depending on our format, we will need to cast certain columns to bool
    # For now, convert all columns to bool and assume we match companies correctly
    for col in df.columns:
        if col in KEY_COLUMNS:
            continue
        df[col] = df[col].apply(to_bool)
        LABELS.add(col)
    
    # Save
    df.set_index(KEY_COLUMNS, inplace=True)
    raters[rater] = df

# Truncate all dataframes to have the same keys (and drop unnamed: columns)
all_keys = raters[list(raters.keys())[0]].index
for rater in raters:
    all_keys = all_keys.intersection(raters[rater].index)
for rater in raters:
    raters[rater] = raters[rater].loc[all_keys]

In [6]:
raters[rater]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,BreachMentioned,CompanyMentioned
Date,Publication,Headline,URL,Unnamed: 4_level_1,Unnamed: 5_level_1
20131115,Washington Post,Facebook makes changes to its data use policies,http://www.washingtonpost.com/business/technology/facebook-makes-changes-to-its-data-use-policies/2013/11/15/0107eab4-4e2f-11e3-be6b-d3d28122e6d4_story.html?tid=hpModule_1728cf4a-8a79-11e2-98d9-3012c1cd8d1e,False,True
20150504,Daily Mail,The 'alien sounds' captured 22 MILES above Earth: Nasa recording from the edge of space has unexplained hisses and whistles,http://www.dailymail.co.uk/sciencetech/article-3067819/The-alien-sounds-captured-22-MILES-Earth-Nasa-recording-edge-space-unexplained-hisses-whistles.html,False,False
20150219,The Guardian,Oscars television broadcast set to score record ad revenues,http://www.theguardian.com/film/2015/feb/19/oscars-television-broadcast-set-to-score-record-ad-revenues,False,False
20170104,FOX,FAKE CYBER WAR'? 'Guccifer' casts doubt on White House's Russian hacking claims,http://www.foxnews.com/politics/2017/01/04/guccifer-casts-doubt-on-obama-administrations-russia-hacking-claims.html,False,True
20210827,New York Times,Zeynep TufekciShow Me the Data!,https://www.nytimes.com/2021/08/27/opinion/covid-data-vaccines.html,False,False
...,...,...,...,...,...
20130821,New York Times,Manning Sentenced to 35 Years for Leaking Government Secrets,http://www.nytimes.com/2013/08/22/us/manning-sentenced-for-leaking-government-secrets.html?hpw,False,False
20150101,New York Times,Murders Drop to a Record Low in New York City,http://www.nytimes.com/2015/01/01/nyregion/new-york-city-murders-fall-but-the-police-arent-celebrating.html,False,False
20221116,FOX,"Several people found dead in AZ home, gas leak likely cause of death",//web.archive.orghttps://www.foxnews.com/us/several-people-found-dead-az-home-gas-leak-likely-cause-death,False,False
20210705,CNBC,China's tech crackdown has a new battleground — data,https://www.cnbc.com/2021/07/05/china-tech-crackdown-focuses-on-data-after-didi-probe-.html,False,False


In [7]:
# Classification report for a row that takes on two OR MORE values
# ex: "Category": 'A' or 'B' or 'C'
REPORT_COLUMN = 'CompanyMentioned'
rater1 = list(raters.keys())[0]
rater2 = list(raters.keys())[1]

print(
    classification_report(
        raters[rater1][REPORT_COLUMN],
        raters[rater2][REPORT_COLUMN],
    )
)
category_report = classification_report(
    raters[rater1][REPORT_COLUMN],
    raters[rater2][REPORT_COLUMN],
    output_dict=True
)

category_irr = fleiss_kappa(
    fleiss_pivot([
        raters[rater1],
        raters[rater2],
    ],
    REPORT_COLUMN)
)

print("Fleiss:", category_irr, sep=' ')

              precision    recall  f1-score   support

       False       0.78      0.63      0.69       143
        True       0.37      0.54      0.44        57

    accuracy                           0.60       200
   macro avg       0.57      0.59      0.57       200
weighted avg       0.66      0.60      0.62       200

Fleiss: 0.13469700703743262


In [8]:
# Create a markdown table for every label
rater1 = list(raters.keys())[0]
rater2 = list(raters.keys())[1]

f1_scores = list()
irr_scores = list()
md = '| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |\n|---|---|---|---|---|---|---|---|\n'
for label in LABELS:
    if label=='Category':
        continue
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        irr = fleiss_kappa(fleiss_pivot([
            raters[rater1],
            raters[rater2],
        ], label))
        p, r, f, s = precision_recall_fscore_support(raters[rater1][label].fillna(False), raters[rater2][label].fillna(False), average='binary')
    
    # Overwrite with support (max of both)
    s = max(raters[rater1][label].sum(), raters[rater2][label].sum())
    
    agree    = (raters[rater1][label] == raters[rater2][label]).sum()
    disagree = (raters[rater1][label] != raters[rater2][label]).sum()
    
    p, r, f, s = float(p), float(r), float(f), int(s) if s is not None else 0
    md += f'| {label} | {irr:.2f} | {p:.2f} | {r:.2f} | {f:.2f} | {s} | {agree} | {disagree} |\n'
    f1_scores.append(f)
    irr_scores.append(irr)
    
display_markdown(md, raw=True)
print('Average F1:', np.nanmean(f1_scores), sep=' ')
print('Average IRR:', np.nanmean(irr_scores), sep=' ')

| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |
|---|---|---|---|---|---|---|---|
| BreachMentioned | 0.59 | 0.69 | 0.56 | 0.62 | 16 | 189 | 11 |
| CompanyMentioned | 0.13 | 0.37 | 0.54 | 0.44 | 84 | 121 | 79 |


Average F1: 0.5302029836145757
Average IRR: 0.3628685332612574


In [9]:
# Calculate overall Avergage F1 and Avergage IRR
category_f1 = category_report['macro avg']['f1-score']
category_irr = category_irr
labels_f1 = np.nanmean(f1_scores)
labels_irr = np.nanmean(irr_scores)
 
ncat = max(raters[rater1][REPORT_COLUMN].nunique(), raters[rater2][REPORT_COLUMN].nunique())

print('Category F1:', category_f1, sep=' ')
print('Category IRR:', category_irr, sep=' ')
print('Labels F1:', labels_f1, sep=' ')
print('Labels IRR:', labels_irr, sep=' ')
print('Overall F1:', (ncat * category_f1 + len(LABELS) * labels_f1) / (ncat + len(LABELS)), sep=' ')
print('Overall IRR:', (ncat * category_irr + len(LABELS) * labels_irr) / (ncat + len(LABELS)), sep=' ')

Category F1: 0.5673485035187164
Category IRR: 0.13469700703743262
Labels F1: 0.5302029836145757
Labels IRR: 0.3628685332612574
Overall F1: 0.548775743566646
Overall IRR: 0.24878277014934502


In [10]:
# Where do we disagree? Print them all out!

joined_df = raters[rater1].join(raters[rater2], lsuffix=f'_{rater1}', rsuffix=f'_{rater2}')
idxs = set()
for label in LABELS:
    if label=='Category':
        continue
    
    idxs.update(joined_df[joined_df[f'{label}_{rater1}'] != joined_df[f'{label}_{rater2}']].index)
    
joined_df.loc[list(idxs)].reset_index().drop(columns=['Date', 'Publication', 'URL'])

Unnamed: 0,Headline,BreachMentioned_caitlyn,CompanyMentioned_caitlyn,BreachMentioned_decilm,CompanyMentioned_decilm
0,Futures edge higher ahead of economic data,False,False,False,True
1,On Politics ...,False,False,False,True
2,Omarosa Manigault Newman releases secret recor...,False,False,False,True
3,Company posts record $710m net loss as it stru...,False,False,False,True
4,"US reports more than 4,000 COVID-19 deaths in ...",False,False,False,True
...,...,...,...,...,...
78,The Edward Snowden guide to encryption: Fugiti...,False,True,False,False
79,Twitter blocks government 'spy centers' from a...,False,True,False,False
80,Flight attendants can't 'think straight' after...,False,False,False,True
81,Obama leak 'scandal' is overblown,False,False,False,True
