In [14]:
import torch.backends.mps
# autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# imports
import pandas as pd
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
import logging
import matplotlib.pyplot as plt
import numpy as np

# Prepare Test Dataset

In [16]:
# helper function to convert a df into a format that the cross encoder can understand
def get_dataset_from_df(col_text_a, col_text_b, label_col, df):
    """Turn a DataFrame into a list of (text, text, label) tuples.

    Every row contributes two consecutive tuples: one in the order
    (col_text_a, col_text_b) and one with the two texts swapped, both
    carrying the same label.

    Args:
        col_text_a: Name of the column holding the first text.
        col_text_b: Name of the column holding the second text.
        label_col: Name of the column holding the label.
        df: DataFrame to read the rows from.

    Returns:
        A list with two tuples per row of ``df``, in row order.
    """
    pairs = []
    for _, record in df.iterrows():
        first, second, target = record[col_text_a], record[col_text_b], record[label_col]
        # Same label for both orderings of the pair
        pairs.extend(((first, second, target), (second, first, target)))
    return pairs

In [23]:
# Read the labelled test set (semicolon-separated CSV)
test_df = pd.read_csv('../data/labeled_data/test_data.csv', sep=';')
# Join title and description into a single text column, separated by one space
test_df['title_description'] = test_df['title'] + ' ' + test_df['description']
# Build the (text_a, text_b, label) tuples used for evaluation
test_dataset_list = get_dataset_from_df(
    col_text_a='goal',
    col_text_b='title_description',
    label_col='label_manual',
    df=test_df,
)

# Separate the tuples into model inputs (sentence pairs) and true labels
val_sentence_pairs = [[first, second] for first, second, _ in test_dataset_list]
labels = [target for _, _, target in test_dataset_list]

In [56]:
# Add a column to test_df with random labels (0 or 1) as a chance-level baseline
import random

# A seeded, private generator keeps the baseline reproducible across kernel
# restarts and leaves the global `random` state untouched.
random_label_rng = random.Random(42)
test_df['label_random'] = [random_label_rng.choice([0, 1]) for _ in range(len(test_df))]

# Check accuracy of the random labels against the manual labels
test_df['check_random'] = test_df['label_manual'] == test_df['label_random']

# Share of rows where the random label matches (True) / misses (False)
test_df['check_random'].value_counts(normalize=True)


check_random
False    0.508929
True     0.491071
Name: proportion, dtype: float64

In [25]:
# Frame that collects the predictions: one row per tuple from get_dataset_from_df.
# That helper emits every pair in both text orders, so in every second row the
# 'goal' and 'title_description' columns hold the swapped texts.
prediction_rows = get_dataset_from_df('goal', 'title_description', 'label_manual', test_df)
prediction_columns = ['goal', 'title_description', 'label_manual']
predictions_df = pd.DataFrame(prediction_rows, columns=prediction_columns)
predictions_df

Unnamed: 0,goal,title_description,label_manual
0,I would like to have firm values.,ESC in Danimarca: volontariato in una scuola s...,1
1,ESC in Danimarca: volontariato in una scuola s...,I would like to have firm values.,1
2,I would like to have other people trust me.,Volunteering in Tolnai Szent István Catholic H...,1
3,Volunteering in Tolnai Szent István Catholic H...,I would like to have other people trust me.,1
4,I would like to be loyal.,VOLUNTEERING AT TIRANT LO BLANC PRIMARY SCHOOL...,1
...,...,...,...
219,Get closer to nature while volunteering in Kau...,I would like to feel safe and secure.,0
220,Decrease amount of unnecessary and tedious tas...,trail steward - friends of okanagan rail trail...,1
221,trail steward - friends of okanagan rail trail...,Decrease amount of unnecessary and tedious tas...,1
222,I would like to strengthen my social competency,Grants Administrator\n \n \n...,0


In [33]:
# Attach the sdt-cluster, gpt_sector and text length of each test_df row to predictions_df.
# predictions_df was built by get_dataset_from_df, which emits two consecutive rows
# per test_df row (both text orders), so each metadata value is repeated twice in order.
if len(predictions_df) != 2 * len(test_df):
    raise ValueError(
        f"predictions_df has {len(predictions_df)} rows, expected {2 * len(test_df)} "
        "(two per test_df row)"
    )

# .to_numpy() makes the assignment positional, independent of either frame's index
predictions_df['sdt_cluster'] = np.repeat(test_df['sdt-cluster'].to_numpy(), 2)
predictions_df['gpt_sector'] = np.repeat(test_df['gpt_sector'].to_numpy(), 2)
# Character count of the combined title + description text
predictions_df['text_len'] = np.repeat(test_df['title_description'].map(len).to_numpy(), 2)
predictions_df

Unnamed: 0,goal,title_description,label_manual,sdt_cluster,gpt_sector,text_len
0,I would like to have firm values.,ESC in Danimarca: volontariato in una scuola s...,1,autonomy,Bildung,2447
1,ESC in Danimarca: volontariato in una scuola s...,I would like to have firm values.,1,autonomy,Bildung,2447
2,I would like to have other people trust me.,Volunteering in Tolnai Szent István Catholic H...,1,autonomy,Bildung,2244
3,Volunteering in Tolnai Szent István Catholic H...,I would like to have other people trust me.,1,autonomy,Bildung,2244
4,I would like to be loyal.,VOLUNTEERING AT TIRANT LO BLANC PRIMARY SCHOOL...,1,autonomy,Bildung,1958
...,...,...,...,...,...,...
219,Get closer to nature while volunteering in Kau...,I would like to feel safe and secure.,0,relatedness,"Umwelt, Natur, Tierschutz",1868
220,Decrease amount of unnecessary and tedious tas...,trail steward - friends of okanagan rail trail...,1,relatedness,"Umwelt, Natur, Tierschutz",683
221,trail steward - friends of okanagan rail trail...,Decrease amount of unnecessary and tedious tas...,1,relatedness,"Umwelt, Natur, Tierschutz",683
222,I would like to strengthen my social competency,Grants Administrator\n \n \n...,0,relatedness,"Umwelt, Natur, Tierschutz",699


In [37]:
# Number of prediction rows per sdt_cluster (gpt_sector is not counted here)
sdt_cluster_counts = predictions_df.value_counts('sdt_cluster')
print(sdt_cluster_counts)

sdt_cluster
relatedness    90
autonomy       72
competence     62
Name: count, dtype: int64


In [None]:
# Choose the compute device: MPS if available, otherwise CUDA, otherwise the CPU
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [38]:
# Configure models and best params
model_paths = ['cross-encoder/stsb-roberta-base','../data/models/best_qwen', '../data/models/best_gpt', '../data/models/best_own']
names = ['basemodel-roberta','qwen2', 'gpt', 'own_we']
# NOTE(review): batch_sizes is never used below and has 5 entries for 4 models;
# confirm whether it was meant to be passed to model.predict(batch_size=...).
batch_sizes = [32, 15, 32, 5, 32]

if len(names) != len(model_paths):
    raise ValueError("names and model_paths must have the same length")

# Scores at or above this value are turned into the positive label (1)
threshold = 0.5

# One dict of metrics per model; results_df is built once after the loop
# instead of being grown with pd.concat on every iteration.
result_records = []

# configure logging
logging.basicConfig(level=logging.INFO, force=True)

# for all models
for name, model_path in zip(names, model_paths):
    # initialize model
    model = CrossEncoder(model_path, num_labels=1, device=device)

    # get predictions (one score per sentence pair)
    predictions = model.predict(val_sentence_pairs)
    scores = np.asarray(predictions)
    if not np.issubdtype(scores.dtype, np.floating):
        scores = scores.astype(float)

    # Per-model evaluation frame. It deliberately does NOT reuse the name
    # `test_df`, so the loaded test data stays intact for other cells.
    eval_df = pd.DataFrame({
        'label_manual': labels,
        'label_pred': (scores >= threshold).astype(int),
        'float_pred': scores,
    })

    # store the thresholded predictions next to the input pairs
    predictions_df[f'pred_{name}'] = eval_df['label_pred'].to_numpy()

    # print amount of predicted labels % and absolute
    print(f"Counts of Model {name}")
    print(eval_df.value_counts('label_pred', normalize=True))
    print(eval_df.value_counts('label_pred', normalize=False))
    print()

    # Calculate TP, FP, TN, FN
    eval_df['tp'] = (eval_df['label_manual'] == 1) & (eval_df['label_pred'] == 1)
    eval_df['fp'] = (eval_df['label_manual'] == 0) & (eval_df['label_pred'] == 1)
    eval_df['tn'] = (eval_df['label_manual'] == 0) & (eval_df['label_pred'] == 0)
    eval_df['fn'] = (eval_df['label_manual'] == 1) & (eval_df['label_pred'] == 0)

    tp = eval_df['tp'].sum()
    fp = eval_df['fp'].sum()
    tn = eval_df['tn'].sum()
    fn = eval_df['fn'].sum()

    # calculate metrics (ratios with an empty denominator are reported as 0)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    accuracy = (tp + tn) / eval_df.shape[0]
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Binary cross-entropy. A score of exactly 0 or 1 would give log(0) = -inf
    # (and 0 * -inf = NaN), so such scores are nudged to the nearest value
    # strictly inside (0, 1); every other score is left unchanged.
    lower = np.finfo(scores.dtype).tiny
    upper = np.nextafter(scores.dtype.type(1), scores.dtype.type(0))
    clipped = eval_df['float_pred'].clip(lower=lower, upper=upper)
    cross_entropy = - (eval_df['label_manual'] * np.log(clipped) + (1 - eval_df['label_manual']) * np.log(1 - clipped)).mean()

    # print results
    print(f"Results of Model {name}")
    print(f"count Matches: {tp + tn}, count Misses: {fp + fn}")
    print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Specificity: {specificity}, Cross-Entropy: {cross_entropy}")
    print()

    # save to results
    result_records.append({
        'Model': name,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Specificity': specificity,
        'Cross-Entropy': cross_entropy
    })

    # drop the reference to the model before the next one is loaded
    del model

# one row per model
results_df = pd.DataFrame(result_records)

# show results df
results_df



Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Counts of Model basemodel-roberta
label_pred
0    0.602679
1    0.397321
Name: proportion, dtype: float64
label_pred
0    135
1     89
Name: count, dtype: int64

Results of Model basemodel-roberta
count Matches: 115, count Misses: 109
TP: 44, FP: 45, TN: 71, FN: 64
Accuracy: 0.5133928571428571, Precision: 0.4943820224719101, Recall: 0.4074074074074074, F1-Score: 0.4467005076142132, Specificity: 0.6120689655172413, Cross-Entropy: 0.6728086973557116


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Counts of Model qwen2
label_pred
1    0.625
0    0.375
Name: proportion, dtype: float64
label_pred
1    140
0     84
Name: count, dtype: int64

Results of Model qwen2
count Matches: 130, count Misses: 94
TP: 77, FP: 63, TN: 53, FN: 31
Accuracy: 0.5803571428571429, Precision: 0.55, Recall: 0.7129629629629629, F1-Score: 0.6209677419354838, Specificity: 0.45689655172413796, Cross-Entropy: 1.074370248106009


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Counts of Model gpt
label_pred
1    0.852679
0    0.147321
Name: proportion, dtype: float64
label_pred
1    191
0     33
Name: count, dtype: int64

Results of Model gpt
count Matches: 129, count Misses: 95
TP: 102, FP: 89, TN: 27, FN: 6
Accuracy: 0.5758928571428571, Precision: 0.5340314136125655, Recall: 0.9444444444444444, F1-Score: 0.6822742474916388, Specificity: 0.23275862068965517, Cross-Entropy: 3.2469167348845076


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Counts of Model own_we
label_pred
0    0.700893
1    0.299107
Name: proportion, dtype: float64
label_pred
0    157
1     67
Name: count, dtype: int64

Results of Model own_we
count Matches: 143, count Misses: 81
TP: 47, FP: 20, TN: 96, FN: 61
Accuracy: 0.6383928571428571, Precision: 0.7014925373134329, Recall: 0.4351851851851852, F1-Score: 0.5371428571428573, Specificity: 0.8275862068965517, Cross-Entropy: 2.7472129242457544


Unnamed: 0,Model,TP,FP,TN,FN,Accuracy,Precision,Recall,F1-Score,Specificity,Cross-Entropy
0,basemodel-roberta,44,45,71,64,0.513393,0.494382,0.407407,0.446701,0.612069,0.672809
1,qwen2,77,63,53,31,0.580357,0.55,0.712963,0.620968,0.456897,1.07437
2,gpt,102,89,27,6,0.575893,0.534031,0.944444,0.682274,0.232759,3.246917
3,own_we,47,20,96,61,0.638393,0.701493,0.435185,0.537143,0.827586,2.747213


In [39]:
# Write the collected predictions to a semicolon-separated CSV without the index column
predictions_csv_path = '../data/evaluation_results/predictions_final.csv'
predictions_df.to_csv(predictions_csv_path, sep=';', index=False)

# McNemar Test

In [84]:
from itertools import combinations
from statsmodels.stats.contingency_tables import mcnemar

# Compare only the per-model prediction columns. Selecting by the 'pred_' prefix
# (instead of position) keeps metadata columns such as sdt_cluster, gpt_sector
# and text_len out of the test regardless of the order cells were run in.
pred_columns = [col for col in predictions_df.columns if str(col).startswith('pred_')]

# all pairs of two models (kept under its own name so the imported
# `combinations` function is not shadowed)
model_pairs = list(combinations(pred_columns, 2))

# One dict per pair; mcnemar_df is built once after the loop, which avoids
# concatenating onto an empty DataFrame.
mcnemar_records = []

# Both label values are forced into the table so it is always 2x2, even when
# a model predicts only one class.
binary_labels = [0, 1]

# NOTE(review): the table below cross-tabulates the two models' predictions, so
# the test compares how often each model predicts 1. If the intent is to compare
# model accuracy, the table should cross-tabulate per-model correctness
# (prediction == label_manual) instead; confirm the intended hypothesis.

# for all pairs of models
for comb in model_pairs:
    # make a cross table
    cross_table = pd.crosstab(predictions_df[comb[0]], predictions_df[comb[1]])
    cross_table = cross_table.reindex(index=binary_labels, columns=binary_labels, fill_value=0)

    # print models
    print(f"Combination: {comb}")

    # calculate mcnemar (exact binomial test)
    result = mcnemar(cross_table, exact=True)

    # collect result
    mcnemar_records.append({
        'model_1': comb[0],
        'model_2': comb[1],
        'statistic': result.statistic,
        'p-value': result.pvalue,
    })

    # print results
    print(f"Statistic: {result.statistic}, p-value: {result.pvalue}")
    print()

# create the df that stores the results
mcnemar_df = pd.DataFrame(mcnemar_records, columns=['model_1', 'model_2', 'statistic', 'p-value'])

Combination: ('pred_basemodel-roberta', 'pred_qwen2')
Statistic: 34.0, p-value: 3.2837572373979797e-06

Combination: ('pred_basemodel-roberta', 'pred_gpt')
Statistic: 9.0, p-value: 1.7092920119428793e-23

Combination: ('pred_basemodel-roberta', 'pred_own_we')
Statistic: 44.0, p-value: 0.044762094156973624

Combination: ('pred_qwen2', 'pred_gpt')
Statistic: 7.0, p-value: 4.2705216207923607e-11

Combination: ('pred_qwen2', 'pred_own_we')
Statistic: 16.0, p-value: 1.87131274564586e-13

Combination: ('pred_gpt', 'pred_own_we')
Statistic: 5.0, p-value: 3.1876229928981844e-32


  mcnemar_df = pd.concat([mcnemar_df, pd.DataFrame({'model_1': [comb[0]], 'model_2': [comb[1]], 'statistic': [result.statistic], 'p-value': [result.pvalue]})], ignore_index=True)


In [85]:
# show results
mcnemar_df

Unnamed: 0,model_1,model_2,statistic,p-value
0,pred_basemodel-roberta,pred_qwen2,34.0,3.283757e-06
1,pred_basemodel-roberta,pred_gpt,9.0,1.7092920000000002e-23
2,pred_basemodel-roberta,pred_own_we,44.0,0.04476209
3,pred_qwen2,pred_gpt,7.0,4.270522e-11
4,pred_qwen2,pred_own_we,16.0,1.871313e-13
5,pred_gpt,pred_own_we,5.0,3.187623e-32
