# Setup

The prerequisites to running the code are as follows:

1. Having all dependencies installed. This should have been achieved once you ran `pip install -r requirements.txt` and it finished successfully

2. Having the `datasets` folder with its contents stored one level above the directory of this notebook (the code below reads from `../datasets`). This should have been achieved when you cloned the repository to your local system.

Import dependencies

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Tuple
import random
import os

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

Set random seed values for reproducibility

In [None]:
# Seed shared by the global RNGs below and by the optional down-sampling step.
# Values noted by the author: 42, 17, 6.
seed_val = 17

# Seed NumPy's and Python's global random generators with the same value.
np.random.seed(seed_val)
random.seed(seed_val)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder that contains `datasets/`; every data path below is built from it.
# Alternative location on Google Drive (presumably for Colab runs):
# mount = '/content/drive/MyDrive/Uni Stuttgart/Plausibility'
mount = '..'

# Read the data

In [None]:
# Load each split and tag its rows with the split name so they can be
# separated again after the joint clean-up. The dev split is stored as val.json.
train = pd.read_json(f'{mount}/datasets/adept/train-dev-test-split/train.json').assign(set='train')
dev = pd.read_json(f'{mount}/datasets/adept/train-dev-test-split/val.json').assign(set='dev')
test = pd.read_json(f'{mount}/datasets/adept/train-dev-test-split/test.json').assign(set='test')

# Stack in train, dev, test order; the per-split row indices are kept as they are.
df = pd.concat([train, dev, test])

In [None]:
df.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,set
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,train
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,train
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324,train
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610,train
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917,train


In [None]:
df['set'].value_counts()

Unnamed: 0_level_0,count
set,Unnamed: 1_level_1
train,12892
test,1612
dev,1611


Only keep label classes 1, 2 and 3 (comparison labels). Map them to 0, 1 and 2 for training convenience:

* 1 => 0
* 2 => 1
* 3 => 2

In [None]:
# Keep only the rows whose label is one of the three comparison classes.
comparison_labels = [1, 2, 3]
is_comparison = df['label'].isin(comparison_labels)
df = df[is_comparison]
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,10805
1,1868
3,1132


In [None]:
# Shift the labels down by one (1/2/3 -> 0/1/2). `assign` builds a new frame
# instead of writing into the filtered frame in place, which avoids pandas'
# SettingWithCopyWarning.
# NOTE: run this cell once per kernel session; re-running it shifts the labels again.
df = df.assign(label=df['label'] - 1)
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,10805
0,1868
2,1132


Filter out duplicated data points

In [None]:
# Rows that repeat an earlier `sentence2` value are duplicates; the first occurrence is kept.
is_repeat = df.duplicated(subset=['sentence2'], keep='first')
df = df[~is_repeat]
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,10800
0,1866
2,1129


In [None]:
# Recover the three splits from the cleaned frame via the `set` tag.
train, dev, test = (
    df[df['set'] == split_name] for split_name in ('train', 'dev', 'test')
)

# Class balance

In [None]:
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,8631
0,1503
2,926


The training data is heavily skewed towards class 1. As an option, we can balance the training data by randomly sampling 1500 examples from this class instead of using all of its samples

In [None]:
balance = False  # set to True to balance training data

if balance:
  # Classes 0 and 2 are kept whole; class 1 is down-sampled to 1500 rows
  # with a fixed seed so the selection is repeatable.
  untouched_rows = train[train['label'].isin([0, 2])]
  sampled_rows = train[train['label'] == 1].sample(n=1500, random_state=seed_val)
  train = pd.concat([untouched_rows, sampled_rows])
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,8631
0,1503
2,926


In [None]:
train.shape

(11060, 7)

In [None]:
# Rebuild the combined frame from the (possibly re-sampled) splits and give it
# a fresh 0..n-1 index in the same step.
df = pd.concat([train, dev, test], ignore_index=True)
df.shape

(13795, 7)

# Majority prediction

The majority baseline always predicts:
* *equally likely* (original label 2, mapped to 1) for the FULL train set
* *less likely* (original label 1, mapped to 0) for the BALANCED train set

The classes *equally likely* and *less likely* are almost equally frequent in the BALANCED train set (1500 vs. 1503 instances), so the latter technically outnumbers the former by 3 instances.

In [None]:
def majority_pred(df, balanced=None):
  """Predict the training set's majority class for every row of ``df``.

  Labels use the remapped scheme of this notebook:
  0 = less likely, 1 = equally likely, 2 = more likely.

  Args:
    df: frame to predict for; only its number of rows is used.
    balanced: whether the training set was down-sampled. ``None`` (default)
      falls back to the notebook-level ``balance`` flag.

  Returns:
    A list holding the same label once per row of ``df``.
  """
  if balanced is None:
    balanced = balance
  # Full train set: class 1 (equally likely) dominates.
  # Balanced train set: class 1 is cut to 1500 rows, which leaves class 0
  # (less likely, 1503 rows) as the largest class.
  majority_label = 0 if balanced else 1
  return [majority_label] * len(df)

# Cross-balanced evaluation

In [None]:
def f1_one_vs_all(true, pred, class_label):
  """Binary F1 of ``class_label`` against all other classes taken together.

  Both label sequences are binarised (1 where the entry equals
  ``class_label``, 0 otherwise) before being scored with ``f1_score``.
  """
  binary_true = [int(label == class_label) for label in true]
  binary_pred = [int(label == class_label) for label in pred]
  return f1_score(binary_true, binary_pred)

In [None]:
def cross_balanced_eval(df, model_name, dev_or_test='test'):
    """Evaluate the majority baseline on class-balanced windows of one split.

    The selected split is cut into consecutive windows that hold ``step`` rows
    of each label (0, 1, 2). The number of windows is driven by label 1; when a
    label has fewer than ``step`` rows left for a window, the window is topped
    up from the beginning of that label's rows. Macro F1, one-vs-all F1 per
    class and accuracy are averaged over the windows and printed, followed by
    a confusion matrix (normalised per true class) pooled over all windows.

    Args:
        df: frame with a ``set`` column (split name) and a ``label`` column
            holding the remapped labels 0, 1 and 2.
        model_name: only used in the printed header.
        dev_or_test: ``'test'`` selects the test split; any other value
            selects the ``'dev'`` split.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if the selected split has no rows with label 1, so that
            no window can be formed.
    """
    class_labels = (0, 1, 2)

    if dev_or_test == 'test':
        step = 101  # per the original note: size of the smallest class in the split
    else:
        dev_or_test = 'dev'
        step = 102

    # Filter the split and its per-class rows once instead of on every iteration.
    split_df = df[df['set'] == dev_or_test]
    rows_by_label = {label: split_df[split_df['label'] == label] for label in class_labels}

    n_driver_rows = len(rows_by_label[1])
    if n_driver_rows == 0:
        raise ValueError(f"no rows with label 1 in the '{dev_or_test}' split; nothing to evaluate")

    full_test_y_true = []
    full_test_y_pred = []
    macro_f1_sum = 0
    class_f1_sums = {label: 0 for label in class_labels}
    accuracy_sum = 0
    iterations = 0

    # Windows start at position 1 (not 0), exactly as in the original
    # implementation; row 0 of label 1 is only picked up by the last window's top-up.
    for start in tqdm(range(1, n_driver_rows + 1, step)):
        # One positional slice of up to `step` rows per label ...
        windows = [rows_by_label[label].iloc[start:start + step] for label in class_labels]
        # ... and, for every short slice, the missing rows taken from the start of that label.
        top_ups = [
            rows_by_label[label].iloc[:step - len(window)]
            for label, window in zip(class_labels, windows)
            if len(window) < step
        ]
        new_df = pd.concat(windows + top_ups)

        # evaluation
        window_pred = majority_pred(new_df)
        window_true = new_df['label'].to_list()

        full_test_y_true.extend(window_true)
        full_test_y_pred.extend(window_pred)

        macro_f1_sum += f1_score(window_true, window_pred, average='macro')
        for label in class_labels:
            class_f1_sums[label] += f1_one_vs_all(window_true, window_pred, class_label=label)
        accuracy_sum += accuracy_score(window_true, window_pred)

        iterations += 1

    print("")
    print(f"{model_name} - {dev_or_test} set: average stats")
    avr_MacroF1 = macro_f1_sum / iterations
    print(f"macro F1: {avr_MacroF1:.3f}")
    avr_accuracy = accuracy_sum / iterations
    print(f'Accuracy: {avr_accuracy:.3f}')

    # Printed under the original label ids 1, 2, 3 (remapped 0, 1, 2).
    avr_label1_F1 = class_f1_sums[0] / iterations
    avr_label2_F1 = class_f1_sums[1] / iterations
    avr_label3_F1 = class_f1_sums[2] / iterations
    print(f'\nclass-wise F1 scores')
    print(f'1: {avr_label1_F1:.3f}\n2: {avr_label2_F1:.3f}\n3: {avr_label3_F1:.3f}')

    # create confusion matrix; `labels` pins it to 3x3 even if a class never occurs
    norm_setting = 'true'
    test_conf_matr = confusion_matrix(full_test_y_true, full_test_y_pred,
                                      labels=list(class_labels), normalize=norm_setting)
    test_conf_matr = pd.DataFrame(test_conf_matr, columns=['Less likely', 'Equally likely', 'More likely'],
                                index=['Less likely', 'Equally likely', 'More likely'])
    print("")
    print("True\\Predicted:")
    print(test_conf_matr)

In [None]:
# Show which training-set variant the baseline assumes, then run the
# cross-balanced evaluation on the test split.
print('BALANCED:', balance)
cross_balanced_eval(df, 'Majority Prediction', dev_or_test='test')

BALANCED: False


100%|██████████| 11/11 [00:00<00:00, 38.08it/s]


Majority Prediction - test set: average stats
macro F1: 0.167
Accuracy: 0.333

class-wise F1 scores
1: 0.000
2: 0.500
3: 0.000

True\Predicted:
                Less likely  Equally likely  More likely
Less likely             0.0             1.0          0.0
Equally likely          0.0             1.0          0.0
More likely             0.0             1.0          0.0





# Standard evaluation

In [118]:
# Select the test split once and derive both predictions and gold labels from it.
test_split = df[df['set'] == 'test']
test_set_predictions = majority_pred(test_split)
test_labels = test_split['label'].to_list()

In [119]:
# Macro-averaged F1 of the baseline predictions over the whole test split.
macro_f1_test = f1_score(test_labels, test_set_predictions, average='macro')
print(f'Test set: macro F1 = {macro_f1_test:.3f}')

Test set: macro F1 = 0.296


In [120]:
# One-vs-all F1 for the remapped classes 0, 1, 2; printed under the original ids 1, 2, 3.
f1_less, f1_eq, f1_more = (
    f1_one_vs_all(test_labels, test_set_predictions, class_label=label)
    for label in (0, 1, 2)
)
print(f"""Test set: one-vs-all F1
1: {f1_less:.3f}
2: {f1_eq:.3f}
3: {f1_more:.3f}""")

Test set: one-vs-all F1
1: 0.000
2: 0.888
3: 0.000


In [121]:
# Plain accuracy of the baseline predictions over the whole test split.
acc_test = accuracy_score(test_labels, test_set_predictions)
print(f'Test set: accuracy = {acc_test:.3f}')

Test set: accuracy = 0.798
