# Kappa Values

In this notebook, the Kappa values are calculated in order to find the most suitable model for tweet annotation, based on the given annotations.

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import math
from scipy import stats
from scipy.special import log_ndtr

In [2]:
df_human_ground_trth = pd.read_csv("../Dataset/human_truth.csv",delimiter=";")
df_annotator_one = pd.read_csv("../Dataset/annotator_one.csv")
df_annotator_two = pd.read_csv("../Dataset/annotator_two.csv")
df_gpt_3_5 = pd.read_csv("../Dataset/GPT-3.5.csv")
df_gpt_4o_mini = pd.read_csv("../Dataset/GPT4omini.csv")

In [36]:
df_annotator_one = df_annotator_one.dropna(subset=['valence','arousal'])
df_annotator_two = df_annotator_two.dropna(subset=['valence','arousal'])

In [5]:
def preprocess_df(df):
    valence_map = {'positive': 0, 'neutral': 1, 'negative': 2}
    arousal_map = {'high': 0, 'medium': 1, 'low': 2}
    
    df['Valence'] = df['valence'].map(valence_map)
    df['Arousal'] = df['arousal'].map(arousal_map)
    df = df.dropna(subset=['Valence', 'Arousal'])

    df['Valence'] = df['Valence'].astype(int)
    df['Arousal'] = df['Arousal'].astype(int)

    
    df['Combined'] = df['Valence'].astype(str) + '_' + df['Arousal'].astype(str)
    df = df.dropna(subset=['Valence', 'Arousal', 'Combined'])
    
    return df

In [6]:
def jackknife_kappa(y1, y2):
    n = len(y1)
    kappas = np.zeros(n)
    
    for i in range(n):
        mask = np.ones(n, dtype=bool)
        mask[i] = False
        kappas[i] = cohen_kappa_score(y1[mask], y2[mask])
    
    kappa = cohen_kappa_score(y1, y2)
    se = np.sqrt(((n-1)/n) * np.sum((kappas - np.mean(kappas))**2))
    
    return kappa, se


In [7]:
def calculate_kappa(df_label_one, df_label_two):
    df1_processed = preprocess_df(df_label_one)
    df2_processed = preprocess_df(df_label_two)

    common_index = df1_processed.index.intersection(df2_processed.index)
    df1_processed = df1_processed.loc[common_index]
    df2_processed = df2_processed.loc[common_index]

    df1_processed['Combined'] = df1_processed['Combined'].astype(str)
    df2_processed['Combined'] = df2_processed['Combined'].astype(str)

    kappa_valence, se_valence = jackknife_kappa(df1_processed['Valence'].values, df2_processed['Valence'].values)
    kappa_arousal, se_arousal = jackknife_kappa(df1_processed['Arousal'].values, df2_processed['Arousal'].values)
    kappa_combined, se_combined = jackknife_kappa(df1_processed['Combined'].values, df2_processed['Combined'].values)

    return (kappa_valence, se_valence), (kappa_arousal, se_arousal), (kappa_combined, se_combined)

In [8]:
(ann_ann_valence, se_ann_ann_valence), (ann_ann_arousal, se_ann_ann_arousal), (ann_ann_combined, se_ann_ann_combined) = calculate_kappa(df_annotator_one, df_annotator_two)
(ground_gpt3_valence, se_ground_gpt3_valence), (ground_gpt3_arousal, se_ground_gpt3_arousal), (ground_gpt3_combined, se_ann_ground_gpt3_combined)= calculate_kappa(df_human_ground_trth, df_gpt_3_5)
(ground_gpt4o_valence, se_ground_gpt4o_valence), (ground_gpt4o_arousal, se_ground_gpt4o_arousal), (ground_gpt4o_combined, se_ground_gpt4o_combined)= calculate_kappa(df_human_ground_trth, df_gpt_4o_mini)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Valence'] = df['Valence'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Arousal'] = df['Arousal'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Combined'] = df['Valence'].astype(str) + '_' + df['Arousal'].astype(str)
A value is trying to be set on a copy of a s

In [9]:
print(10*'#'+'Annotator_one | Anotator_two'+ 10*'#')
print(f"Cohen's Kappa for Valence: {ann_ann_valence:.4f}")
print(f"Cohen's Kappa for Valence Fehler: {se_ann_ann_valence:.4f}")
print(f"Cohen's Kappa for Arousal: {ann_ann_arousal:.4f}")
print(f"Cohen's Kappa for Arousal Fehler: {se_ann_ann_arousal:.4f}")
print(f"Cohen's Kappa for Combined: {ann_ann_combined:.4f}")
print(f"Cohen's Kappa for Combined Fehler: {se_ann_ann_combined:.4f}")
print(10*'#'+'Human Ground Truth | GPT 3.5 Turbo'+ 10*'#')
print(f"Cohen's Kappa for Valence: {ground_gpt3_valence:.4f}")
print(f"Cohen's Kappa for Valence Fehler: {se_ground_gpt3_valence:.4f}")
print(f"Cohen's Kappa for Arousal: {ground_gpt3_arousal:.4f}")
print(f"Cohen's Kappa for Arousal Fehler: {se_ground_gpt3_arousal:.4f}")
print(f"Cohen's Kappa for Combined: {ground_gpt3_combined:.4f}")
print(f"Cohen's Kappa for Combined Fehler: {se_ann_ground_gpt3_combined:.4f}")
print(10*'#'+'Human Ground Truth | GPT 4o mini'+ 10*'#')
print(f"Cohen's Kappa for Valence: {ground_gpt4o_valence:.4f}")
print(f"Cohen's Kappa for Valence Fehler: {se_ground_gpt4o_valence:.4f}")
print(f"Cohen's Kappa for Arousal: {ground_gpt4o_arousal:.4f}")
print(f"Cohen's Kappa for Arousal Fehler: {se_ground_gpt4o_arousal:.4f}")
print(f"Cohen's Kappa for Combined: {ground_gpt4o_combined:.4f}")
print(f"Cohen's Kappa for Combined Fehler: {se_ground_gpt4o_combined:.4f}")

##########Annotator_one | Anotator_two##########
Cohen's Kappa for Valence: 0.6039
Cohen's Kappa for Valence Fehler: 0.0272
Cohen's Kappa for Arousal: 0.4340
Cohen's Kappa for Arousal Fehler: 0.0234
Cohen's Kappa for Combined: 0.3883
Cohen's Kappa for Combined Fehler: 0.0200
##########Human Ground Truth | GPT 3.5 Turbo##########
Cohen's Kappa for Valence: 0.8274
Cohen's Kappa for Valence Fehler: 0.0189
Cohen's Kappa for Arousal: 0.7385
Cohen's Kappa for Arousal Fehler: 0.0176
Cohen's Kappa for Combined: 0.7193
Cohen's Kappa for Combined Fehler: 0.0165
##########Human Ground Truth | GPT 4o mini##########
Cohen's Kappa for Valence: 0.6691
Cohen's Kappa for Valence Fehler: 0.0241
Cohen's Kappa for Arousal: 0.4580
Cohen's Kappa for Arousal Fehler: 0.0228
Cohen's Kappa for Combined: 0.4264
Cohen's Kappa for Combined Fehler: 0.0192


In [20]:
def calculate_z_score(kappa1, se1, kappa2, se2):
    z = (kappa1 - kappa2) / math.sqrt(se1**2 + se2**2)
    log_p_value = log_ndtr(-abs(z)) + np.log(2)  # Log des zweiseitigen p-Werts
    return z, log_p_value

z_score, log_p_value = calculate_z_score(ground_gpt3_combined, se_ann_ground_gpt3_combined, ground_gpt4o_combined, se_ground_gpt4o_combined)
print(f"Z-Wert: {z_score:.4f}")
print(f"p-Wert: {np.exp(log_p_value):.4e}")

Z-Wert: 11.5732
p-Wert: 5.6355e-31
