In [1]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score
from statsmodels.stats.inter_rater import fleiss_kappa
import pandas as pd
import numpy as np
import warnings
import re
import os
from tools import fleiss_pivot

from IPython.display import display_markdown

In [2]:
def to_bool(string:str) -> bool:
    """Coerce a CSV cell value to a boolean.

    The value is stringified, stripped and case-folded before matching, so
    non-string inputs (floats, bools, NaN) are accepted as well.

    Args:
        string: The raw cell value.

    Returns:
        False for empty text, 'nan', 'false', 'no' and any numeric text equal
        to zero (e.g. '0', '0.0'); True for 'true', 'yes', non-zero numeric
        text, and any other non-empty text.
    """
    text = str(string).strip().casefold()
    if text in ('', 'nan'):
        return False
    if text in ('true', 'yes', '1'):
        return True
    if text in ('false', 'no', '0'):
        return False

    # A column that mixes 0/1 with blanks is read as float, so cells arrive
    # as 0.0 / 1.0. Compare numerically so that '0.0' is not treated as a
    # truthy non-empty string.
    try:
        number = float(text)
    except ValueError:
        # Free text (anything non-empty that is not a number) counts as True.
        return True
    return number != 0

In [3]:
# Move into the IRR data directory so the relative CSV paths used below
# resolve. Guarded so the cell is safe to re-run: once the working directory
# has already been changed (or the kernel was started there), the relative
# target no longer exists and an unconditional chdir would raise.
if os.path.isdir('./classify/irr'):
    os.chdir('./classify/irr')

In [10]:
# Choose exactly ONE set of rater files; the alternatives stay commented out
# so it is unambiguous which data set the outputs below were produced from.
# IRR_FILES = ['./train_set_human/leon.csv', './train_set_human/lilli.csv']
IRR_FILES = ['./test_set_human/alex.csv', './test_set_human/caitlyn.csv']
# IRR_FILES = ['./test_set_gpt/caitlyn.csv', './test_set_gpt/gpt_classified.csv']
KEY_COLUMNS = ['Date','Publication','Headline','URL']
LABELS = set()

# Create empty key dataframe
# NOTE(review): `key` is not read by any cell visible here; kept in case
# another cell relies on it.
key = pd.DataFrame(columns=KEY_COLUMNS)
raters = dict()
label_sets = []  # one set of label column names per rater file
for path in IRR_FILES:
    df = pd.read_csv(path)
    rater = os.path.basename(path).split('.')[0]
    if rater in raters:
        # Two files with the same basename would silently overwrite each other.
        raise ValueError(f'Duplicate rater name {rater!r} derived from {path!r}')

    # Drop all unnamed columns
    df = df.drop(columns=[col for col in df.columns if str(col).startswith('Unnamed: ')])

    # TODO: depending on our format, we will need to cast certain columns to bool
    # For now, convert all columns to bool and assume we match companies correctly
    label_cols = [col for col in df.columns if col not in KEY_COLUMNS]
    for col in label_cols:
        df[col] = df[col].apply(to_bool)
    label_sets.append(set(label_cols))

    # Save, indexed by the article key so raters can be aligned row-by-row
    raters[rater] = df.set_index(KEY_COLUMNS)

# Only labels present for EVERY rater can be compared; a column that exists in
# a single file would otherwise raise a KeyError in the cells below.
if label_sets:
    LABELS = set.intersection(*label_sets)

# Truncate all dataframes to have the same keys, in the same order
all_keys = raters[list(raters.keys())[0]].index
for rater in raters:
    all_keys = all_keys.intersection(raters[rater].index)
for rater in raters:
    raters[rater] = raters[rater].loc[all_keys]

In [11]:
# Preview the frame of the last rater that was loaded. The rater is picked
# explicitly from the dict rather than through a loop variable left over from
# an earlier cell, so this cell does not depend on that hidden state.
raters[list(raters)[-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,BreachMentioned,CompanyMentioned
Date,Publication,Headline,URL,Unnamed: 4_level_1,Unnamed: 5_level_1
20160329,Washington Post,You can soon get unlimited data on AT&T U-verse — but it comes with a big catch,https://www.washingtonpost.com/news/the-switch/wp/2016/03/29/you-can-soon-get-unlimited-data-on-att-u-verse-but-it-comes-with-a-big-catch/,False,True
20180212,Washington Post,"Lending by big banks to small businesses hits a record high, study finds",https://www.washingtonpost.com/news/on-small-business/wp/2018/02/12/lending-by-big-banks-to-small-businesses-hits-a-record-high-study-finds/,False,False
20220614,The Guardian,Yellowstone National park closed after record rain and major flooding,https://www.theguardian.com/us-news/2022/jun/14/yellowstone-national-park-flooding-rain,False,False
20210908,Washington Post,"U.S. is likely to breach debt ceiling soon unless Congress acts, Yellen says",https://www.washingtonpost.com/us-policy/2021/09/08/democrats-september-debt-ceiling-reconciliation/,False,True
20210825,New York Post,"Jan 6. House committee releases sweeping records request, including call logs of Trump",https://nypost.com/2021/08/25/jan-6-house-committee-releases-sweeping-records-request/,False,True
...,...,...,...,...,...
20120112,USA Today,Weak Dec. retail sales push 2011 to a record gain,http://www.usatoday.com/money/economy/story/2012-01-12/december-retail-sales/52513014/1,False,False
20210513,New York Times,Biden Says Colonial Pipeline Is Nearing Full Capacity After Hack,https://www.nytimes.com/live/2021/05/13/us/joe-biden-news/,False,True
20200609,Washington Post,"From 100-degree heat to record cold: Howling winds bring sudden seasonal reversal in West, High Plains",https://www.washingtonpost.com/weather/2020/06/09/100-degree-heat-record-cold-howling-winds-bring-sudden-seasonal-reversal-west-high-plains/,False,False
20221201,FOX,LAPD seeks search warrant for Reddit to identify who leaked racist council discussion,//web.archive.orghttps://www.foxnews.com/politics/lapd-seeks-search-warrant-reddit-identify-who-leaked-racist-council-discussion,False,False


In [12]:
# Pairwise agreement report for one column that may hold two OR MORE distinct
# values, e.g. "Category": 'A' or 'B' or 'C'.
REPORT_COLUMN = 'CompanyMentioned'
rater_names = list(raters)
rater1 = rater_names[0]
rater2 = rater_names[1]

# The first rater's values are passed as the first argument and the second
# rater's as the second argument of classification_report.
reference = raters[rater1][REPORT_COLUMN]
candidate = raters[rater2][REPORT_COLUMN]

# Human-readable report first, then the same numbers as a dict for later cells.
print(classification_report(reference, candidate))
category_report = classification_report(reference, candidate, output_dict=True)

# Fleiss' kappa over both raters for the same column.
category_irr = fleiss_kappa(
    fleiss_pivot([raters[rater1], raters[rater2]], REPORT_COLUMN)
)

print("Fleiss:", category_irr, sep=' ')

              precision    recall  f1-score   support

       False       0.83      0.81      0.82       135
        True       0.62      0.65      0.63        65

    accuracy                           0.76       200
   macro avg       0.72      0.73      0.72       200
weighted avg       0.76      0.76      0.76       200



Fleiss: 0.44805834811748474


In [13]:
# Create a markdown table with one row per label, comparing the two raters
rater1 = list(raters.keys())[0]
rater2 = list(raters.keys())[1]

f1_scores = list()
irr_scores = list()
md = '| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |\n|---|---|---|---|---|---|---|---|\n'
# LABELS is a set, whose iteration order for strings can change between
# kernel restarts; sort it so the table rows come out in a stable order.
for label in sorted(LABELS):
    if label=='Category':
        continue

    ratings1 = raters[rater1][label]
    ratings2 = raters[rater2][label]

    # Warnings raised by the metric functions are suppressed for this block only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        irr = fleiss_kappa(fleiss_pivot([
            raters[rater1],
            raters[rater2],
        ], label))
        # The support returned here is discarded; it is recomputed just below.
        p, r, f, _ = precision_recall_fscore_support(ratings1.fillna(False), ratings2.fillna(False), average='binary')

    # Support: the larger of the two raters' positive counts
    s = max(ratings1.sum(), ratings2.sum())

    agree    = (ratings1 == ratings2).sum()
    disagree = (ratings1 != ratings2).sum()

    p, r, f, s = float(p), float(r), float(f), int(s)
    md += f'| {label} | {irr:.2f} | {p:.2f} | {r:.2f} | {f:.2f} | {s} | {agree} | {disagree} |\n'
    f1_scores.append(f)
    irr_scores.append(irr)

display_markdown(md, raw=True)
# nanmean skips NaN entries when averaging
print('Average F1:', np.nanmean(f1_scores), sep=' ')
print('Average IRR:', np.nanmean(irr_scores), sep=' ')

| Label | IRR | Precision | Recall | F1 | Support | In Agreement | Disagreement |
|---|---|---|---|---|---|---|---|
| BreachMentioned | 0.58 | 0.50 | 0.75 | 0.60 | 12 | 192 | 8 |
| CompanyMentioned | 0.45 | 0.62 | 0.65 | 0.63 | 68 | 151 | 49 |


Average F1: 0.6157894736842104
Average IRR: 0.5135028582692687


In [14]:
# Calculate overall average F1 and average IRR as a weighted mean of the
# category-level score and the mean per-label score.
category_f1 = category_report['macro avg']['f1-score']
labels_f1 = np.nanmean(f1_scores)
labels_irr = np.nanmean(irr_scores)

# Weight of the category score: number of distinct values seen in REPORT_COLUMN
ncat = max(raters[rater1][REPORT_COLUMN].nunique(), raters[rater2][REPORT_COLUMN].nunique())
# Weight of the label score: number of labels that were actually scored.
# len(LABELS) would also count 'Category', which the scoring loop skips.
nlabels = len(f1_scores)
total_weight = ncat + nlabels

print('Category F1:', category_f1, sep=' ')
print('Category IRR:', category_irr, sep=' ')
print('Labels F1:', labels_f1, sep=' ')
print('Labels IRR:', labels_irr, sep=' ')
print('Overall F1:', (ncat * category_f1 + nlabels * labels_f1) / total_weight, sep=' ')
print('Overall IRR:', (ncat * category_irr + nlabels * labels_irr) / total_weight, sep=' ')

Category F1: 0.7240291740587423
Category IRR: 0.44805834811748474
Labels F1: 0.6157894736842104
Labels IRR: 0.5135028582692687
Overall F1: 0.6699093238714764
Overall IRR: 0.4807806031933767


In [15]:
# Where do we disagree? Print them all out!

# Columns shared by both raters get a `_<rater>` suffix in the joined frame.
joined_df = raters[rater1].join(raters[rater2], lsuffix=f'_{rater1}', rsuffix=f'_{rater2}')

# Positional boolean mask: True where the raters differ on at least one label.
# A mask (rather than a set of index keys) keeps the rows in data order, so
# the listing is the same on every run.
disagrees = np.zeros(len(joined_df), dtype=bool)
for label in sorted(LABELS):
    if label=='Category':
        continue

    disagrees |= (joined_df[f'{label}_{rater1}'] != joined_df[f'{label}_{rater2}']).to_numpy()

# Kept for backward compatibility: the set of disagreeing index keys.
idxs = set(joined_df.index[disagrees])

joined_df.loc[disagrees].reset_index().drop(columns=['Date', 'Publication', 'URL'])

Unnamed: 0,Headline,BreachMentioned_leon,CompanyMentioned_leon,BreachMentioned_lilli,CompanyMentioned_lilli
0,"Colorado sees record breaking voter turnout, 2...",False,False,False,True
1,Iran cover-up of Covid-19 deaths revealed by d...,False,False,False,True
2,2 Here are the juiciest Colin Powell comment...,False,False,False,True
3,Murdoch's former top lieutenant Rebekah Brooks...,False,True,False,False
4,We were leaked the Panama Papers. Here’s how t...,False,False,False,True
5,Steve Bannon on Cambridge Analytica 'Facebook...,True,True,False,True
6,Saudi Arabia picks on Norway’s human rights re...,False,True,False,False
7,Lia Thomas broke no records at the NCAA champi...,False,False,False,True
8,Immigration Tech workers protest data mining ...,False,True,False,False
9,LAPD cops use controversial Palantir data-mini...,False,True,False,False
