# Evaluation Model Roberta vs. unsere

In [1]:
import pandas as pd
import ast
import numpy as np
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from tqdm import tqdm
import time
import math
from scipy import stats

In [82]:
df = pd.read_csv("../Dataset/label_annotation.csv")

In [83]:
df['output'] = None

In [84]:
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [85]:
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows", unit="row"):
        output = classifier(row['tweet'])
    
        df.at[index, 'output'] = output[0]

Processing Rows: 100%|██████████| 1041/1041 [00:58<00:00, 17.66row/s]


In [86]:
data = df[['id','tweet', 'output']]

In [87]:
data['tweet'].iloc[1]

'@realDonaldTrump Sen. Jeff Merkley released a document showing a transfer of nearly $10 million from FEMA to ICE and accuses the Trump administration of diverting funds from hurricane relief just as hurricane season was beginning.'

In [89]:
data['output'] = data['output'].astype(str)
data['output'] = data['output'].apply(ast.literal_eval)

def transform_row(row):
    return {item['label']: item['score'] for item in row}

new_data = data['output'].apply(transform_row).apply(pd.Series)

result = pd.concat([data, new_data], axis=1)


In [91]:
ls = result.columns

In [92]:
ls

Index(['id', 'tweet', 'output', 'fear', 'neutral', 'approval', 'realization',
       'nervousness', 'annoyance', 'optimism', 'disgust', 'excitement',
       'sadness', 'disapproval', 'admiration', 'disappointment', 'surprise',
       'anger', 'caring', 'joy', 'confusion', 'amusement', 'desire',
       'embarrassment', 'relief', 'grief', 'love', 'pride', 'curiosity',
       'gratitude', 'remorse'],
      dtype='object')

### SEMO MAPPING FROM CATEGORIES

In [93]:
df_arousal = {
    'anger':'high',
    'fear':'high',
    'disgust':'high',
    'annoyance':'medium',
    'disapproval':'medium',
    'disappointment':'medium',
    'confusion':'medium',
    'nervousness':'medium',
    'remorse':'medium',
    'sadness':'medium',
    'disgust':'low',
    'grief':'low',
    'curiosity':'high',
    'realization':'high',
    'neutral':'low',
    'gratitude':'low',
    'relief':'low',
    'caring':'low',
    'approval':'medium',
    'optimism':'medium',
    'admiration':'medium',
    'pride':'medium',
    'amusement':'medium',
    'joy':'high',
    'love':'high',
    'excitement':'high'
}

df_valence = {
    'anger':'negative',
    'fear':'negative',
    'disgust':'negative',
    'annoyance':'negative',
    'disapproval':'negative',
    'disappointment':'negative',
    'confusion':'negative',
    'nervousness':'negative',
    'remorse':'negative',
    'sadness':'negative',
    'disgust':'negative',
    'grief':'negative',
    'curiosity':'neutral',
    'realization':'neutral',
    'neutral':'neutral',
    'gratitude':'positive',
    'relief':'positive',
    'caring':'positive',
    'approval':'positive',
    'optimism':'positive',
    'admiration':'positive',
    'pride':'positive',
    'amusement':'positive',
    'joy':'positive',
    'love':'positive',
    'excitement':'positive'
}

In [71]:
result.columns

Index(['id', 'tweet', 'output', 'fear', 'neutral', 'approval', 'realization',
       'nervousness', 'annoyance', 'optimism', 'disgust', 'excitement',
       'sadness', 'disapproval', 'admiration', 'disappointment', 'surprise',
       'anger', 'caring', 'joy', 'confusion', 'amusement', 'desire',
       'embarrassment', 'relief', 'grief', 'love', 'pride', 'curiosity',
       'gratitude', 'remorse', 'result'],
      dtype='object')

In [95]:
result['tweet'].iloc[0]

'@tl_trevaskis @Bpage5 @CheriJacobus Which is exactly what is happening. Because trump has labeled antifa a terrorist organization he can now get away with authorizing federal officer to act under that law. This is what I was afraid of.'

In [96]:
emotion_columns = result.columns.drop(['id','tweet', 'output'])
result_new = result
max_values = result[emotion_columns].max(axis=1)
max_columns = result[emotion_columns].eq(max_values, axis=0)
result_new['result'] = max_columns.apply(lambda x: ', '.join(x.index[x]), axis=1)

In [103]:
result['tweet'].iloc[0]

'@tl_trevaskis @Bpage5 @CheriJacobus Which is exactly what is happening. Because trump has labeled antifa a terrorist organization he can now get away with authorizing federal officer to act under that law. This is what I was afraid of.'

In [98]:
test = result[['id','tweet', 'output']]

In [102]:
result = pd.concat([test, result_new], axis=1).loc[:, ~result.columns.duplicated()]

In [106]:
result['valence_extern'] = result['result'].map(df_valence)
result['arousal_extern'] = result['result'].map(df_arousal)

### Andere Modelle

In [107]:
tokenizer_Aurosal = AutoTokenizer.from_pretrained('Arousal_Final_Model/')
tokenizer_Valence = AutoTokenizer.from_pretrained('Valence_Final_Model/')

In [108]:
model_arousal = AutoModelForSequenceClassification.from_pretrained('Arousal_Final_Model/')
model_valence = AutoModelForSequenceClassification.from_pretrained('Valence_Final_Model/')

In [109]:
model_valence.eval()
model_arousal.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [110]:
def predict(text, model,tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    
    return  predicted_class

def evaluate(data):
    total = len(data)
    required_columns = [ 'valence_pred','arousal_pred']
    for col in required_columns:
        if col not in df.columns:
            data[col] = None  
    for index, row in tqdm(data.iterrows(), total=total, desc="Processing Rows", unit="row"):
        predict_arousal = predict(row['tweet'],model_arousal, tokenizer_Aurosal)
        predict_valence = predict(row['tweet'],model_valence, tokenizer_Valence)
        
        data.at[index, 'valence_pred'] = predict_arousal
        data.at[index, 'arousal_pred'] = predict_valence
        
    return data

In [111]:
result = evaluate(result)

Processing Rows: 100%|██████████| 1041/1041 [03:00<00:00,  5.76row/s]


In [112]:
result.columns

Index(['id', 'tweet', 'output', 'fear', 'neutral', 'approval', 'realization',
       'nervousness', 'annoyance', 'optimism', 'disgust', 'excitement',
       'sadness', 'disapproval', 'admiration', 'disappointment', 'surprise',
       'anger', 'caring', 'joy', 'confusion', 'amusement', 'desire',
       'embarrassment', 'relief', 'grief', 'love', 'pride', 'curiosity',
       'gratitude', 'remorse', 'result', 'valence_extern', 'arousal_extern',
       'valence_pred', 'arousal_pred'],
      dtype='object')

In [3]:
df_annotator_one = result[['id','valence_extern', 'arousal_extern']]
df_annotator_two = result[['id','valence_pred','arousal_pred']]
df_human_ground_trth = pd.read_csv("../Dataset/human_truth.csv",delimiter=";")

In [5]:
df_annotator_one = df_annotator_one.dropna(subset=['valence_extern','arousal_extern'])
df_annotator_two = df_annotator_two.dropna(subset=['valence_pred','arousal_pred'])

In [6]:
df_annotator_one['valence'] = df_annotator_one['valence_extern']
df_annotator_one['arousal'] = df_annotator_one['arousal_extern']
df_annotator_two['arousal'] = df_annotator_two['valence_pred'] 
df_annotator_two['valence'] = df_annotator_two['arousal_pred'] 

In [7]:
inverted_valence_map = {0: 'positive', 1: 'neutral', 2: 'negative'}
inverted_arousal_map = {0: 'high', 1: 'medium', 2: 'low'}

df_annotator_two['arousal'] = df_annotator_two['arousal'].map(inverted_arousal_map)
df_annotator_two['valence'] = df_annotator_two['valence'].map(inverted_valence_map)

In [8]:
def preprocess_df(df):

    valence_map = {'positive': 0, 'neutral': 1, 'negative': 2}
    arousal_map = {'high': 0, 'medium': 1, 'low': 2}
    
    df['Valence'] = df['valence'].map(valence_map)
    df['Arousal'] = df['arousal'].map(arousal_map)
    df = df.dropna(subset=['Valence', 'Arousal'])

    df['Valence'] = df['Valence'].astype(int)
    df['Arousal'] = df['Arousal'].astype(int)

    
    df['Combined'] = df['Valence'].astype(str) + '_' + df['Arousal'].astype(str)
    df = df.dropna(subset=['Valence', 'Arousal', 'Combined'])
    
    return df

In [9]:
def jackknife_kappa(y1, y2):
    n = len(y1)
    kappas = np.zeros(n)
    
    for i in range(n):
        mask = np.ones(n, dtype=bool)
        mask[i] = False
        kappas[i] = cohen_kappa_score(y1[mask], y2[mask])
    
    kappa = cohen_kappa_score(y1, y2)
    se = np.sqrt(((n-1)/n) * np.sum((kappas - np.mean(kappas))**2))
    
    return kappa, se


In [10]:
def calculate_kappa(df_label_one, df_label_two):
    df1_processed = preprocess_df(df_label_one)
    df2_processed = preprocess_df(df_label_two)

    common_index = df1_processed.index.intersection(df2_processed.index)
    df1_processed = df1_processed.loc[common_index]
    df2_processed = df2_processed.loc[common_index]

    df1_processed['Combined'] = df1_processed['Combined'].astype(str)
    df2_processed['Combined'] = df2_processed['Combined'].astype(str)

    kappa_valence, se_valence = jackknife_kappa(df1_processed['Valence'].values, df2_processed['Valence'].values)
    kappa_arousal, se_arousal = jackknife_kappa(df1_processed['Arousal'].values, df2_processed['Arousal'].values)
    kappa_combined, se_combined = jackknife_kappa(df1_processed['Combined'].values, df2_processed['Combined'].values)

    return (kappa_valence, se_valence), (kappa_arousal, se_arousal), (kappa_combined, se_combined)


In [13]:
(valence_extern, se_valence_extern), (arousal_extern, se_arousal_extern), (combined_extern, se_combined_extern) = calculate_kappa(df_annotator_one, df_human_ground_trth)
(valence_own, se_valence_own), (arousal_own, se_arousal_own), (combined_own, se_combined_own) = calculate_kappa(df_annotator_two, df_human_ground_trth)

In [14]:
print(10*'#'+'Groudn_truth | Extern'+ 10*'#')
print(f"Cohen's Kappa for Valence: {valence_extern:.4f}")
print(f"Cohen's Kappa for Valence Fehler: {se_valence_extern:.4f}")
print(f"Cohen's Kappa for Arousal: {arousal_extern:.4f}")
print(f"Cohen's Kappa for Arousal Fehler: {se_arousal_extern:.4f}")
print(f"Cohen's Kappa for Combined: {combined_extern:.4f}")
print(f"Cohen's Kappa for Combined Fehler: {se_combined_extern:.4f}")

##########Groudn_truth | Extern##########
Cohen's Kappa for Valence: 0.0898
Cohen's Kappa for Valence Fehler: 0.0128
Cohen's Kappa for Arousal: 0.0063
Cohen's Kappa for Arousal Fehler: 0.0152
Cohen's Kappa for Combined: 0.0316
Cohen's Kappa for Combined Fehler: 0.0095


In [15]:
print(10*'#'+'Groudn_truth | Unser'+ 10*'#')
print(f"Cohen's Kappa for Valence: {valence_own:.4f}")
print(f"Cohen's Kappa for Valence Fehler: {se_valence_own:.4f}")

print(f"Cohen's Kappa for Arousal: {arousal_own:.4f}")
print(f"Cohen's Kappa for Arousal Fehler: {se_arousal_own:.4f}")

print(f"Cohen's Kappa for Combined: {combined_own:.4f}")
print(f"Cohen's Kappa for Combined Fehler: {se_combined_own:.4f}")

##########Groudn_truth | Unser##########
Cohen's Kappa for Valence: 0.5689
Cohen's Kappa for Valence Fehler: 0.0268
Cohen's Kappa for Arousal: 0.4998
Cohen's Kappa for Arousal Fehler: 0.0218
Cohen's Kappa for Combined: 0.4279
Cohen's Kappa for Combined Fehler: 0.0192


In [20]:
def calculate_z_score(kappa1, se1, kappa2, se2):
    """
    Berechnet den Z-Wert für den Vergleich zweier Kappa-Werte und gibt den p-Wert als Logarithmus zurück.
    
    :param kappa1: Erster Kappa-Wert
    :param se1: Standardfehler des ersten Kappa-Werts
    :param kappa2: Zweiter Kappa-Wert
    :param se2: Standardfehler des zweiten Kappa-Werts
    :return: Z-Wert und logarithmierter p-Wert
    """
    z = (kappa1 - kappa2) / math.sqrt(se1**2 + se2**2)
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))  # Zweiseitiger p-Wert
    log_p_value = math.log10(p_value) if p_value > 0 else float('-inf')
    return z, log_p_value


In [21]:
def format_p_value(log_p_value):
    """
    Formatiert den logarithmierten p-Wert in eine lesbare wissenschaftliche Notation.
    """
    if log_p_value == float('-inf'):
        return "p < 1e-324"  # Kleinster darstellbarer Wert in Python
    elif log_p_value > -4:  # Für p-Werte größer als 0.0001
        return f"{10**log_p_value:.4f}"
    else:
        return f"{10**log_p_value:.2e}"


In [23]:
z_score, p_value = calculate_z_score(combined_own, se_combined_own, combined_extern, se_combined_extern)
print(f"Z-Wert: {z_score:.4f}")
print(f"p-Wert: {format_p_value(p_value)}")

alpha = 0.05 
if p_value < alpha:
    print("Der Unterschied zwischen den Kappa-Werten ist statistisch signifikant.")
else:
    print("Der Unterschied zwischen den Kappa-Werten ist nicht statistisch signifikant.")



Z-Wert: 18.5122
p-Wert: p < 1e-324
Der Unterschied zwischen den Kappa-Werten ist statistisch signifikant.


In [26]:
z_score_v, p_value_v = calculate_z_score( valence_own, se_valence_own,valence_extern, se_valence_extern)
z_score_a, p_value_a = calculate_z_score(arousal_own, se_arousal_own, arousal_extern, se_arousal_extern)

In [27]:
print(f"Z-Wert_Valence: {z_score_v:.4f}")
print(f"p-Wert_Arousal: {format_p_value(p_value_v)}")

Z-Wert_Valence: 16.1408
p-Wert_Arousal: p < 1e-324


In [28]:
print(f"Z-Wert_Valence: {z_score_a:.4f}")
print(f"p-Wert_Arousal: {format_p_value(p_value_a)}")

Z-Wert_Valence: 18.5542
p-Wert_Arousal: p < 1e-324
