# Imports

In [1]:
# !pip install PyMuPDF 
# !pip install transformers torch
# !pip install --upgrade ipywidgets

In [2]:
import fitz  # PyMuPDF
import torch
import logging
import spacy
import os
import string
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, LeaveOneOut, StratifiedKFold
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from scipy.sparse import hstack
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, TFAutoModel
from collections import Counter

# Loading Data

In [3]:
df = pd.read_csv('pre-processed data.csv').set_index(['article_id' , 'paragraph_id'], inplace=False)
df['pos_tags'] = df['pos_tags'].apply(ast.literal_eval)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,original_text,text,group,text before lemmatization,pos_tags
article_id,paragraph_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,Provincies willen aan de slag met versoepeling...,provincie willen slag versoepeling stikstofreg...,Bouw & Vastgoed,provincies willen slag versoepeling stikstofre...,"[(provincie, NOUN), (willen, VERB), (slag, NOU..."
1,2,Het draait allemaal om de drempelwaarde voor e...,draaien allemaal drempelwaran stikstofvergunni...,Bouw & Vastgoed,draait allemaal drempelwaarde stikstofvergunni...,"[(draaien, VERB), (allemaal, ADV), (drempelwar..."
1,3,Met een hogere drempelwaarde zouden minder ver...,hoog drempelwaard vergunning [NEWLINE] aangevo...,Bouw & Vastgoed,hogere drempelwaarde vergunningen [NEWLINE] aa...,"[(hoog, ADJ), (drempelwaard, NOUN), (vergunnin..."
1,4,In het hoofdlijnenakkoord hebben de vier coali...,hoofdlijnenakkoord vier coalitiepartij afsprek...,Bouw & Vastgoed,hoofdlijnenakkoord vier coalitiepartijen afges...,"[(hoofdlijnenakkoord, PROPN), (vier, NUM), (co..."
1,5,De ondergrens is al langer onderwerp van discu...,ondergren lang onderwerp discussie huidig Nede...,Bouw & Vastgoed,ondergrens langer onderwerp discussie huidige ...,"[(ondergren, VERB), (lang, ADJ), (onderwerp, N..."
...,...,...,...,...,...,...
118,2,Telgenkamp vestigt haar hoop voor de korte ter...,telgenkamp vestigen hoop kort termijn twee cru...,Zorg,telgenkamp vestigt hoop korte termijn twee cru...,"[(telgenkamp, NOUN), (vestigen, VERB), (hoop, ..."
119,1,Waarom verzekeraars inkomsten uit zwart werk w...,verzekeraar inkomst zwart werk vergoeden [NEWL...,Zorg,verzekeraars inkomsten zwart werk vergoeden [N...,"[(verzekeraar, ADJ), (inkomst, NOUN), (zwart, ..."
120,1,Verzekeraar wil klant helpen met zorgbemiddeli...,verzekeraar klant helpen zorgbemiddeling [NEWL...,Zorg,verzekeraar klant helpen zorgbemiddeling [NEWL...,"[(verzekeraar, ADJ), (klant, NOUN), (helpen, V..."
120,2,Verzekeraar wil wachtende patiënt aan snelle z...,verzekeraar wachten patiënt snel zorg helpen [...,Zorg,verzekeraar wachtende patiënt snelle zorg help...,"[(verzekeraar, NOUN), (wachten, VERB), (patiën..."


# POS-tags one-hot encoding & ngrams

In [4]:
def pos_to_features(pos_tags):
    """Convert list of (word, POS) tuples into a dictionary of POS tag counts."""
    pos_counts = Counter(tag for _, tag in pos_tags)
    return dict(pos_counts)

def pos_to_ngrams(pos_tags, n=2):
    """Convert a list of POS-tag tuples into n-gram strings."""
    pos_sequence = [tag for _, tag in pos_tags]  # Extract only POS tags
    ngrams = ['_'.join(pos_sequence[i:i+n]) for i in range(len(pos_sequence)-n+1)]
    return ' '.join(ngrams)  # Convert to space-separated string for TF-IDF

In [5]:
X = df[['text', 'pos_tags']].copy() # Testing on df

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["group"])

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Split the data into text and POS-tags
X_train_text, X_train_pos = X_train['text'], X_train['pos_tags']
X_val_text, X_val_pos = X_val['text'], X_val['pos_tags']
X_test_text, X_test_pos = X_test['text'], X_test['pos_tags']

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_text = vectorizer.fit_transform(X_train_text)
X_val_text = vectorizer.transform(X_val_text)
X_test_text = vectorizer.transform(X_test_text)

# Convert POS tags to feature dictionaries
X_train_pos_features = X_train_pos.apply(pos_to_features)
X_val_pos_features = X_val_pos.apply(pos_to_features)
X_test_pos_features = X_test_pos.apply(pos_to_features)

# Vectorize POS features (Fit on train, Transform on val/test)
pos_vectorizer = DictVectorizer(sparse=True)
X_train_pos = pos_vectorizer.fit_transform(X_train_pos_features)
X_val_pos = pos_vectorizer.transform(X_val_pos_features)
X_test_pos = pos_vectorizer.transform(X_test_pos_features)

# Scale POS features (Fit on train, Transform on val/test)
scaler = MaxAbsScaler()
X_train_pos = scaler.fit_transform(X_train_pos)
X_val_pos = scaler.transform(X_val_pos)
X_test_pos = scaler.transform(X_test_pos)

# Combine TF-IDF and POS tag features
X_train_combined = hstack([X_train_text, X_train_pos])
X_val_combined = hstack([X_val_text, X_val_pos])
X_test_combined = hstack([X_test_text, X_test_pos])

# Final shape check
print(f"Train shape: {X_train_combined.shape}, Validation shape: {X_val_combined.shape}, Test shape: {X_test_combined.shape}")

Train shape: (261, 6376), Validation shape: (56, 6376), Test shape: (57, 6376)


**Now let's test them on the data**

In [6]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(X_train_combined, y_train)
    val_score = classifier.score(X_val_combined, y_val)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(X_train_combined, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_combined)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Depth: 5, Validation Score: 0.5714285714285714
Depth: 10, Validation Score: 0.6607142857142857
Depth: 15, Validation Score: 0.6607142857142857
Depth: 20, Validation Score: 0.7142857142857143
Depth: 25, Validation Score: 0.6964285714285714
Depth: None, Validation Score: 0.6607142857142857

Best Depth: 20, Best Validation Score: 0.7142857142857143

Test Accuracy: 0.5614035087719298

Test Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.71      0.75        21
           1       0.45      0.59      0.51        17
           2       0.36      0.40      0.38        10
           3       0.60      0.33      0.43         9

    accuracy                           0.56        57
   macro avg       0.55      0.51      0.52        57
weighted avg       0.58      0.56      0.56        57



In [7]:
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_combined, y_train)
        val_score = classifier.score(X_val_combined, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_combined, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_combined)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Kernel: linear, C: 0.1, Validation Score: 0.4642857142857143
Kernel: linear, C: 1, Validation Score: 0.7321428571428571
Kernel: linear, C: 10, Validation Score: 0.75
Kernel: rbf, C: 0.1, Validation Score: 0.42857142857142855
Kernel: rbf, C: 1, Validation Score: 0.5178571428571429
Kernel: rbf, C: 10, Validation Score: 0.6964285714285714
Kernel: poly, C: 0.1, Validation Score: 0.48214285714285715
Kernel: poly, C: 1, Validation Score: 0.5714285714285714
Kernel: poly, C: 10, Validation Score: 0.5892857142857143

Best Kernel: linear, Best C: 10, Best Validation Score: 0.75

Test Accuracy: 0.6666666666666666

Test Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.67      0.72        21
           1       0.55      0.71      0.62        17
           2       0.70      0.70      0.70        10
           3       0.71      0.56      0.63         9

    accuracy                           0.67        57
   macro avg       0.68      0.66 

In [8]:
#Testing only text

best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_text, y_train)
        val_score = classifier.score(X_val_text, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_text, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_text)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Kernel: linear, C: 0.1, Validation Score: 0.375
Kernel: linear, C: 1, Validation Score: 0.8035714285714286
Kernel: linear, C: 10, Validation Score: 0.7678571428571429
Kernel: rbf, C: 0.1, Validation Score: 0.375
Kernel: rbf, C: 1, Validation Score: 0.6785714285714286
Kernel: rbf, C: 10, Validation Score: 0.7142857142857143
Kernel: poly, C: 0.1, Validation Score: 0.375
Kernel: poly, C: 1, Validation Score: 0.6071428571428571
Kernel: poly, C: 10, Validation Score: 0.6428571428571429

Best Kernel: linear, Best C: 1, Best Validation Score: 0.8035714285714286

Test Accuracy: 0.7719298245614035

Test Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.76      0.82        21
           1       0.74      0.82      0.78        17
           2       0.62      0.80      0.70        10
           3       0.86      0.67      0.75         9

    accuracy                           0.77        57
   macro avg       0.77      0.76      0.76     

**Now we test for pos_tag ngrams rather than one-hot encodings:**

In [9]:
# Testing for ngrams

X = df[['text', 'pos_tags']].copy() # Testing on df

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["group"])

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Split the data into text and POS-tags
X_train_text, X_train_pos = X_train['text'], X_train['pos_tags']
X_val_text, X_val_pos = X_val['text'], X_val['pos_tags']
X_test_text, X_test_pos = X_test['text'], X_test['pos_tags']


# Split the data into text and POS-tags and construct ngrams
X_train_text, X_train_pos = X_train['text'], X_train['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))
X_val_text, X_val_pos = X_val['text'], X_val['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))
X_test_text, X_test_pos = X_test['text'], X_test['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))



# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_text = vectorizer.fit_transform(X_train_text)
X_val_text = vectorizer.transform(X_val_text)
X_test_text = vectorizer.transform(X_test_text)

# Convert POS n-grams to TF-IDF
pos_vectorizer = TfidfVectorizer()
X_train_pos = pos_vectorizer.fit_transform(X_train_pos)
X_val_pos = pos_vectorizer.transform(X_val_pos)
X_test_pos = pos_vectorizer.transform(X_test_pos)

# Scale POS features (Fit on train, Transform on val/test)
scaler = MaxAbsScaler()
X_train_pos = scaler.fit_transform(X_train_pos)
X_val_pos = scaler.transform(X_val_pos)
X_test_pos = scaler.transform(X_test_pos)

# Combine TF-IDF and POS tag features
X_train_combined = hstack([X_train_text, X_train_pos])
X_val_combined = hstack([X_val_text, X_val_pos])
X_test_combined = hstack([X_test_text, X_test_pos])

# Final shape check
print(f"Train shape: {X_train_combined.shape}, Validation shape: {X_val_combined.shape}, Test shape: {X_test_combined.shape}")

Train shape: (261, 6514), Validation shape: (56, 6514), Test shape: (57, 6514)


In [10]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(X_train_combined, y_train)
    val_score = classifier.score(X_val_combined, y_val)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(X_train_combined, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_combined)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Depth: 5, Validation Score: 0.5535714285714286
Depth: 10, Validation Score: 0.6071428571428571
Depth: 15, Validation Score: 0.6071428571428571
Depth: 20, Validation Score: 0.6071428571428571
Depth: 25, Validation Score: 0.625
Depth: None, Validation Score: 0.6428571428571429

Best Depth: None, Best Validation Score: 0.6428571428571429

Test Accuracy: 0.5964912280701754

Test Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.74        21
           1       0.52      0.65      0.58        17
           2       0.45      0.50      0.48        10
           3       0.67      0.22      0.33         9

    accuracy                           0.60        57
   macro avg       0.59      0.53      0.53        57
weighted avg       0.61      0.60      0.58        57



In [11]:
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_combined, y_train)
        val_score = classifier.score(X_val_combined, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_combined, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_combined)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Kernel: linear, C: 0.1, Validation Score: 0.5178571428571429
Kernel: linear, C: 1, Validation Score: 0.5535714285714286
Kernel: linear, C: 10, Validation Score: 0.5714285714285714
Kernel: rbf, C: 0.1, Validation Score: 0.375
Kernel: rbf, C: 1, Validation Score: 0.5178571428571429
Kernel: rbf, C: 10, Validation Score: 0.5178571428571429
Kernel: poly, C: 0.1, Validation Score: 0.375
Kernel: poly, C: 1, Validation Score: 0.5178571428571429
Kernel: poly, C: 10, Validation Score: 0.5714285714285714

Best Kernel: linear, Best C: 10, Best Validation Score: 0.5714285714285714

Test Accuracy: 0.47368421052631576

Test Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.62      0.60        21
           1       0.47      0.47      0.47        17
           2       0.36      0.50      0.42        10
           3       0.25      0.11      0.15         9

    accuracy                           0.47        57
   macro avg       0.42      0.43

In [12]:
#Testing only text

best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_text, y_train)
        val_score = classifier.score(X_val_text, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_text, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_text)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Kernel: linear, C: 0.1, Validation Score: 0.375
Kernel: linear, C: 1, Validation Score: 0.8035714285714286
Kernel: linear, C: 10, Validation Score: 0.7678571428571429
Kernel: rbf, C: 0.1, Validation Score: 0.375
Kernel: rbf, C: 1, Validation Score: 0.6785714285714286
Kernel: rbf, C: 10, Validation Score: 0.7142857142857143
Kernel: poly, C: 0.1, Validation Score: 0.375
Kernel: poly, C: 1, Validation Score: 0.6071428571428571
Kernel: poly, C: 10, Validation Score: 0.6428571428571429

Best Kernel: linear, Best C: 1, Best Validation Score: 0.8035714285714286

Test Accuracy: 0.7719298245614035

Test Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.76      0.82        21
           1       0.74      0.82      0.78        17
           2       0.62      0.80      0.70        10
           3       0.86      0.67      0.75         9

    accuracy                           0.77        57
   macro avg       0.77      0.76      0.76     

# Model Functions

Since the process of tuning a classifier tends to not change much, we create a function for every type of classifier so that we can tune them without needing to re-write the code every time.

In [13]:
def tune_SVM(df: pd.DataFrame,
             testing_ratio: float = 0.15, 
             vectorization_within_folds: bool = False,
             k_values: list = [2, 3, 5, 10, 20], 
             embedding: str = "tf-idf",
             pos: str = "none", 
             n: list = [2,3,4,5]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_kernel = None
    best_C = None
    best_score = 0
    
    kernels = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]    

    if pos in ["one-hot", "ngram"]:
        X = df[['text', 'pos_tags']].copy()
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(df["group"])

        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        if pos == "one-hot":
            # Split the data into text and POS-tags
            X_train_text, X_train_pos = X_train['text'], X_train['pos_tags']
            X_val_text, X_val_pos = X_val['text'], X_val['pos_tags']
            X_test_text, X_test_pos = X_test['text'], X_test['pos_tags']

            # Convert POS tags to feature dictionaries
            X_train_pos_features = X_train_pos.apply(pos_to_features)
            X_val_pos_features = X_val_pos.apply(pos_to_features)
            X_test_pos_features = X_test_pos.apply(pos_to_features)
        else:
            # Split the data into text and POS-tags and construct ngrams
            X_train_text, X_train_pos = X_train['text'], X_train['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))
            X_val_text, X_val_pos = X_val['text'], X_val['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))
            X_test_text, X_test_pos = X_test['text'], X_test['pos_tags'].apply(lambda x: pos_to_ngrams(x, n=2))

            # Convert POS n-grams to TF-IDF
            pos_vectorizer = TfidfVectorizer()
            X_train_pos = pos_vectorizer.fit_transform(X_train_pos)
            X_val_pos = pos_vectorizer.transform(X_val_pos)
            X_test_pos = pos_vectorizer.transform(X_test_pos)

        # Convert text to TF-IDF representation
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_text = vectorizer.fit_transform(X_train_text)
        X_val_text = vectorizer.transform(X_val_text)
        X_test_text = vectorizer.transform(X_test_text)
        
        # Vectorize POS features (Fit on train, Transform on val/test)
        pos_vectorizer = DictVectorizer(sparse=True)
        X_train_pos = pos_vectorizer.fit_transform(X_train_pos_features)
        X_val_pos = pos_vectorizer.transform(X_val_pos_features)
        X_test_pos = pos_vectorizer.transform(X_test_pos_features)
        
        # Scale POS features (Fit on train, Transform on val/test)
        scaler = MaxAbsScaler()
        X_train_pos = scaler.fit_transform(X_train_pos)
        X_val_pos = scaler.transform(X_val_pos)
        X_test_pos = scaler.transform(X_test_pos)
        
        # Combine TF-IDF and POS tag features
        X_train = hstack([X_train_text, X_train_pos])
        X_val = hstack([X_val_text, X_val_pos])
        X_test = hstack([X_test_text, X_test_pos])


    else: # POS-tags are not used in the classification
        X = df['text']  # Feature: text column
        y = df['group']  # Label: group column
    
        # Split the data into training (85%) and test (15%) sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testing_ratio, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for C in C_values:
                for kernel in kernels:
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, C: {C}, Kernel: {kernel}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_C = C
                        best_kernel = kernel
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for C in C_values:
            for kernel in kernels:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, C: {C}, Kernel: {kernel}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_C = C
                    best_kernel = kernel
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best C: {best_C}, Best kernel: {best_kernel}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

In [14]:
def tune_naive_bayes(df: pd.DataFrame,
                     testing_ratio: float = 0.15, 
                     vectorization_within_folds: bool = False,
                     k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_alpha = None
    best_fit_prior = None
    best_score = 0
    
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]  

    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for alpha in alpha_values:
                for fit_prior_value in fit_prior_values:
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_alpha = alpha
                        best_fit_prior = fit_prior_value
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for alpha in alpha_values:
            for fit_prior_value in fit_prior_values:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_alpha = alpha
                    best_fit_prior = fit_prior_value
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

# Overarching Model Tuning Function

In [15]:
def tune_random_forest(df: pd.DataFrame,
                       testing_ratio: float = 0.15, 
                       vectorization_within_folds: bool = False,
                       k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the Random Forest using stratified K-fold cross validation
    best_depth = None
    best_score = 0
    depth_values = [5, 10, 15, 20, 25, None]  # Different depths to test
    # depth_values = [5, 10]  # For faster tests
    
    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for depth in depth_values:
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_tfidf, y_train)
                y_pred = classifier.predict(X_val_tfidf)
                mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                print(f"K: {k}, Depth: {depth}, Validation Accuracy: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_depth = depth
                    best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
        
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for depth in depth_values:
            scores = []

            if vectorization_within_folds:
                for train_index, val_index in skf.split(X_train, y_train):
                    # Split the raw text data for the current fold
                    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                vectorizer = TfidfVectorizer(max_features=10000)
                X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                X_val_fold_tfidf = vectorizer.transform(X_val_fold)

                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold_tfidf, y_train_fold)
                y_pred = classifier.predict(X_val_fold_tfidf)

            if not vectorization_within_folds:
                for train_index, val_index in skf.split(X_train_tfidf, y_train):
                    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold, y_train_fold)
                y_pred = classifier.predict(X_val_fold)
                    
            scores.append(accuracy_score(y_val_fold, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Depth: {depth}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_depth = depth
                best_k = k
    
    print(f"\nBest K: {best_k}, Best depth: {best_depth}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

Expected parameter values: <br>
> - _classifier_: Selects the type of classifier from amongst the following: ["SVM", "NB", "RF"].
> - _embedding_:
> - _pos_:
> - _n_:
> > Note that if _pos_$\neq$"ngram", n is not used and its value is irrelevant.
> - _testing_ratio_: The ratio of the data that is reserved for testing. Any floating point in the inclusive interval [0, 1].
> > Note that if $k=1$, the size of the validation set is assumed to be equal to the size of the testing set, specified by _testing_ratio_.
> - _vectorization_within_folds_: Would you like to vectorize each individual fold rather than vectorizing the entire training set once? [True, False].
> - _show_class_accuracy_: Would you like the accuracy per class to be displayed? [True, False].
> - _show_confusion_matrix_: Would you like the resulting confusion matrix to be displayed? [True, False].
> - _k_values_: All values of k which are tested for stratified k-fold cross validation. Any list containing only positive integers.
> - 

In [16]:
def train_model(df: pd.DataFrame,
                model_type: str = "SVM", 
                embedding: str = "tf-idf",
                pos: str = "none",
                n: list = [2,3,4,5],
                testing_ratio: float = 0.15, 
                vectorization_within_folds: bool = False, 
                show_class_accuracy: bool = True, 
                show_confusion_matrix: bool = True,
                k_values: list = [2, 3, 5, 10, 20],):
    """..."""

    print(f"Tuning {model_type} classifier with a train/test split of {1-testing_ratio}/{testing_ratio} \n")
    
    #Raise appropriate error message in case of a faulty parameter value
    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Invalid input data. Please ensure df is a Pandas DataFrame")
    if model_type not in ["SVM", "NB", "RF"]:
        raise ValueError(f"Invalid model_type. Choose from {'SVM', 'NB', 'RF'}")
    if testing_ratio < 0 or testing_ratio > 1:
        raise ValueError(f"Invalid testing ratio. Choose a value in the inclusive interval [0,1]")
    if type(vectorization_within_folds) != bool:
        raise ValueError(f"Invalid vectorization_within_folds value. Please ensure vectorization_within_folds is boolean")
    if type(show_class_accuracy) != bool:
        raise ValueError(f"Invalid show_class_accuracy value. Please ensure show_class_accuracy is boolean")
    if type(show_confusion_matrix) != bool:
        raise ValueError(f"Invalid show_confusion_matrix value. Please ensure show_confusion_matrix is boolean")
    if not all(isinstance(x, int) and x > 0 for x in k_values):
        raise ValueError(f"Invalid k_values. Please ensure all entries in k_values are positive integers")
    if 1 in k_values and vectorization_within_folds:
        raise ValueError(f"If k_values contains 1, vectorization_within_folds must be False since k=1 implies standard hold-out cross-validation, for which vectorization_within_folds must be False")
    if embedding not in ["tf-idf", "BERTje"]:
        raise ValueError(f'Invalid embedding. Choose from ["tf-idf", "BERTje"]')
    if pos not in ["none", "one-hot", "ngram"]:
        raise ValueError(f'Invalid pos. Choose from ["none", "one-hot", "ngram"]')
    if type(n) != list or any(not isinstance(x, int) or x < 2 for x in n):
        raise ValueError(f"Invalid n value. Please ensure n is a list of integers that are no smaller than 2")

    model_mapping = {"SVM": SVC, "NB": MultinomialNB , "RF": RandomForestClassifier}
    ModelClass = model_mapping[model_type]
    
    #Best classifier performance and number of stratified folds so far
    best_score = 0
    best_k = None

    #RF hyperparameters:
    best_depth = None
    depth_values = [5, 10, 15, 20, 25, None]

    #SVM hyperparameters:
    best_kernel = None
    best_C = None
    kernel_values = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]

    #NB hyperparameters:
    best_alpha = None
    best_fit_prior = None
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]
    
    if model_type == "SVM":
        results = tune_SVM(df=df, testing_ratio=testing_ratio, 
                           vectorization_within_folds=vectorization_within_folds, 
                           k_values=k_values, 
                           embedding=embedding, 
                           pos=pos, 
                           n=n)
        classifier, y_pred = results[0], results[3]

    if model_type == "NB":
        results = tune_naive_bayes(df=df, 
                                   testing_ratio=testing_ratio, 
                                   vectorization_within_folds=vectorization_within_folds, 
                                   k_values=k_values,
                                   embedding=embedding,
                                   pos=pos,
                                   n=n)
        classifier, y_pred = results[0], results[3]

    if model_type == "RF":
        results = tune_random_forest(df=df, 
                                     testing_ratio=testing_ratio, 
                                     vectorization_within_folds=vectorization_within_folds, 
                                     k_values=k_values, 
                                     embedding=embedding,
                                     pos=pos,
                                     n=n)
        classifier, y_pred = results[0], results[3]
    
    if show_class_accuracy:
        cm = confusion_matrix(y_test, y_pred)
        
        # Class names (assuming they are in the same order as in y_train or y_test)
        class_names = np.unique(y_test)  # This will give you the unique class labels
        
        # Calculate per-class accuracy: TP / (TP + FN)
        class_accuracies = cm.diagonal() / cm.sum(axis=1)
        
        # Print the accuracy for each class along with its name
        for i, acc in enumerate(class_accuracies):
            print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

    if show_confusion_matrix:
        cm = confusion_matrix(y_test, y_pred)
        class_names = np.unique(y_test)
        
        # Plotting the confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)
        
        # Label the axes
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix')
        
        # Display the plot
        plt.show()

# Vector Transformation

For now, I only test TF-IDF

In [17]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

NameError: name 'df_clean' is not defined

In [None]:
# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
X_train_tfidf

In [None]:
X_val_tfidf

In [None]:
X_test_tfidf

**Now I test BERTje Embeddings**

In [None]:
df_clean_bertje = df_clean.copy()

# Load BERTje tokenizer and model
MODEL_NAME = "GroNLP/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Encode labels (convert string labels to integers)
label_mapping = {label: idx for idx, label in enumerate(df_clean_bertje["group"].unique())}
df_clean_bertje["label"] = df_clean_bertje["group"].map(label_mapping)

# Split dataset
X_train_bertje, X_temp_bertje, y_train_bertje, y_temp_bertje = train_test_split(
    df_clean_bertje["text"].tolist(), df_clean_bertje["label"].tolist(), test_size=0.3, random_state=42)
X_val_bertje, X_test_bertje, y_val_bertje, y_test_bertje = train_test_split(X_temp_bertje, y_temp_bertje, test_size=0.5, random_state=42)

In [None]:
def get_bert_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Convert text data into embeddings
train_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_train_bertje]).numpy()
val_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_val_bertje]).numpy()
test_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_test_bertje]).numpy()

**Now I test Word2Vec Embeddings**

# Random Forest Classifier

In [None]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(X_train_tfidf, y_train)
    val_score = classifier.score(X_val_tfidf, y_val)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Support Vector Machine

In [None]:
# Tune the SVM hyperparameters using the validation set
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_tfidf, y_train)
        val_score = classifier.score(X_val_tfidf, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Naive Bayes

In [None]:
# Tune the Naive Bayes hyperparameters using the validation set
best_alpha = None
best_fit_prior = None
best_score = float('-inf')

# Test different hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]

for fit_prior_value in fit_prior_values:
    for alpha in alpha_values:
        classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
        classifier.fit(X_train_tfidf, y_train)
        val_score = classifier.score(X_val_tfidf, y_val)
        print(f"Alpha: {alpha}, fit_prior: {fit_prior_value}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_alpha = alpha
            best_fit_prior = fit_prior_value

print(f"\nBest alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best Validation Score: {best_score}")

# Train the final model using the best alpha and fit_prior
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Leave-one-out cross validation

In [None]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize Leave-One-Out Cross-Validation
loo = LeaveOneOut()

# Hyperparameter tuning with LOO-CV
best_alpha = None
best_fit_prior = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]

# Try different hyperparameter combinations
for fit_prior_value in fit_prior_values:
    for alpha in alpha_values:
        scores = []
        
        for train_index, val_index in loo.split(X_train_tfidf):
            X_train_cv, X_val = X_train_tfidf[train_index], X_train_tfidf[val_index]
            y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
            
            classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
            classifier.fit(X_train_cv, y_train_cv)
            
            y_pred = classifier.predict(X_val)
            scores.append(accuracy_score(y_val, y_pred))
        
        mean_score = np.mean(scores)
        print(f"Alpha: {alpha}, fit_prior: {fit_prior_value}, LOO-CV Score: {mean_score}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_alpha = alpha
            best_fit_prior = fit_prior_value

print(f"\nBest alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best LOO-CV Score: {best_score}")

# Train the final model using the best hyperparameters
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Evaluate on the test set
y_pred = final_classifier.predict(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

In [None]:
# This leads to worse performance. Likely since Leave-one-out cross validation tends to create High-Variance Models.
# Instead, I will use stratified K-fold cross validation. K will be treated as a hyperparameter.

In [None]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Hyperparameter tuning with Stratified K-Fold CV
best_alpha = None
best_fit_prior = None
best_k = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]
k_values = [2, 3, 5, 10, 20]  # Different values for K in StratifiedKFold

# Try different hyperparameter combinations
for k in k_values:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    
    for fit_prior_value in fit_prior_values:
        for alpha in alpha_values:
            scores = []
            
            for train_index, val_index in skf.split(X_train_tfidf, y_train):
                X_train_cv, X_val = X_train_tfidf[train_index], X_train_tfidf[val_index]
                y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
                
                classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                classifier.fit(X_train_cv, y_train_cv)
                
                y_pred = classifier.predict(X_val)
                scores.append(accuracy_score(y_val, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_alpha = alpha
                best_fit_prior = fit_prior_value
                best_k = k

print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")

# Train the final model using the best hyperparameters
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Evaluate on the test set
y_pred = final_classifier.predict(X_test_tfidf)
y_pred_prob = final_classifier.predict_proba(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
y_pred_prob

for row in y_pred_prob:
    formatted_row = ["{:.4f}".format(val) for val in row]
    print(formatted_row)

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

In [None]:
#Note, need to ensure TF-IDF vectorization happens within each fold to prevent leakage.

# TF-IDF vectorization within folds to avoid data leakage

Currently, vectorizations occurs over the full training set. <br>
However, we train K models, 1 for each fold. <br>
This means that for each fold, vectorizations should occur for the training data for that specific fold. <br>
This avoids data leakage from our validation set to our training set. <br>
Note that this is not strictly needed (since not separately within each fold is usually acceptable), but it should slightly improve performance at the cost of additional runtime.

In [None]:
# Assume df_clean is already defined with 'text' and 'group' columns
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Hyperparameter tuning with Stratified K-Fold CV
best_alpha = None
best_fit_prior = None
best_k = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]
k_values = [2, 3, 5, 10, 20]  # Different values for K in StratifiedKFold

# Try different hyperparameter combinations
for k in k_values:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    
    for fit_prior_value in fit_prior_values:
        for alpha in alpha_values:
            scores = []
            
            for train_index, val_index in skf.split(X_train, y_train):
                # Split the raw text data for the current fold
                X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                
                # Vectorize within the fold: fit on training fold, transform validation fold
                vectorizer = TfidfVectorizer(max_features=10000)
                X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                X_val_fold_tfidf = vectorizer.transform(X_val_fold)
                
                # Initialize and train the classifier
                classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                classifier.fit(X_train_fold_tfidf, y_train_fold)
                
                # Validate the model
                y_pred = classifier.predict(X_val_fold_tfidf)
                scores.append(accuracy_score(y_val_fold, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_alpha = alpha
                best_fit_prior = fit_prior_value
                best_k = k

print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")

# Final model training on the entire training set using the best hyperparameters
# Here, we fit the vectorizer on the full training set
final_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = final_vectorizer.fit_transform(X_train)
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Transform the test set using the vectorizer fitted on the entire training set
X_test_tfidf = final_vectorizer.transform(X_test)
y_pred = final_classifier.predict(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

# Turning model tuning into function

Since the process of tuning a classifier tends to not change much, we create a function for every type of classifier so that we can tune them without needing to re-write the code every time.

In [None]:
def tune_random_forest(df: pd.DataFrame,
                       testing_ratio: float = 0.15, 
                       vectorization_within_folds: bool = False,
                       k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the Random Forest using stratified K-fold cross validation
    best_depth = None
    best_score = 0
    depth_values = [5, 10, 15, 20, 25, None]  # Different depths to test
    # depth_values = [5, 10]  # For faster tests
    
    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for depth in depth_values:
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_tfidf, y_train)
                y_pred = classifier.predict(X_val_tfidf)
                mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                print(f"K: {k}, Depth: {depth}, Validation Accuracy: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_depth = depth
                    best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
        
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for depth in depth_values:
            scores = []

            if vectorization_within_folds:
                for train_index, val_index in skf.split(X_train, y_train):
                    # Split the raw text data for the current fold
                    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                vectorizer = TfidfVectorizer(max_features=10000)
                X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                X_val_fold_tfidf = vectorizer.transform(X_val_fold)

                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold_tfidf, y_train_fold)
                y_pred = classifier.predict(X_val_fold_tfidf)

            if not vectorization_within_folds:
                for train_index, val_index in skf.split(X_train_tfidf, y_train):
                    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold, y_train_fold)
                y_pred = classifier.predict(X_val_fold)
                    
            scores.append(accuracy_score(y_val_fold, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Depth: {depth}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_depth = depth
                best_k = k
    
    print(f"\nBest K: {best_k}, Best depth: {best_depth}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

In [None]:
def tune_SVM(df: pd.DataFrame,
             testing_ratio: float = 0.15, 
             vectorization_within_folds: bool = False,
             k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_kernel = None
    best_C = None
    best_score = 0
    
    kernels = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]    

    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for C in C_values:
                for kernel in kernels:
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, C: {C}, Kernel: {kernel}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_C = C
                        best_kernel = kernel
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for C in C_values:
            for kernel in kernels:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, C: {C}, Kernel: {kernel}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_C = C
                    best_kernel = kernel
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best C: {best_C}, Best kernel: {best_kernel}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

In [None]:
def tune_naive_bayes(df: pd.DataFrame,
                     testing_ratio: float = 0.15, 
                     vectorization_within_folds: bool = False,
                     k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_alpha = None
    best_fit_prior = None
    best_score = 0
    
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]  

    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for alpha in alpha_values:
                for fit_prior_value in fit_prior_values:
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_alpha = alpha
                        best_fit_prior = fit_prior_value
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for alpha in alpha_values:
            for fit_prior_value in fit_prior_values:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_alpha = alpha
                    best_fit_prior = fit_prior_value
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

# Overarching Model Selection Function

Expected parameter values: <br>
> - _classifier_: Selects the type of classifier from amongst the following: ["SVM", "NB", "RF"].
> - _testing_ratio_: The ratio of the data that is reserved for testing. Any floating point in the inclusive interval [0, 1].
> > Note that if $k=1$, the size of the validation set is assumed to be equal to the size of the testing set, specified by _testing_ratio_.
> - _vectorization_within_folds_: Would you like to vectorize each individual fold rather than vectorizing the entire training set once? [True, False].
> - _show_class_accuracy_: Would you like the accuracy per class to be displayed? [True, False].
> - _show_confusion_matrix_: Would you like the resulting confusion matrix to be displayed? [True, False].
> - _k_values_: All values of k which are tested for stratified k-fold cross validation. Any list containing only positive integers.

In [None]:
def train_model(df: pd.DataFrame,
                model_type: str = "SVM", 
                testing_ratio: float = 0.15, 
                vectorization_within_folds: bool = False, 
                show_class_accuracy: bool = True, 
                show_confusion_matrix: bool = True,
                k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    print(f"Tuning {model_type} classifier with a train/test split of {1-testing_ratio}/{testing_ratio} \n")
    
    #Raise appropriate error message in case of a faulty parameter value
    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Invalid input data. Please ensure df is a Pandas DataFrame")
    if model_type not in ["SVM", "NB", "RF"]:
        raise ValueError(f"Invalid model_type. Choose from {'SVM', 'NB', 'RF'}")
    if testing_ratio < 0 or testing_ratio > 1:
        raise ValueError(f"Invalid testing ratio. Choose a value in the inclusive interval [0,1]")
    if type(vectorization_within_folds) != bool:
        raise ValueError(f"Invalid vectorization_within_folds value. Please ensure vectorization_within_folds is boolean")
    if type(show_class_accuracy) != bool:
        raise ValueError(f"Invalid show_class_accuracy value. Please ensure show_class_accuracy is boolean")
    if type(show_confusion_matrix) != bool:
        raise ValueError(f"Invalid show_confusion_matrix value. Please ensure show_confusion_matrix is boolean")
    if not all(isinstance(x, int) and x > 0 for x in k_values):
        raise ValueError(f"Invalid k_values. Please ensure all entries in k_values are positive integers")
    if 1 in k_values and vectorization_within_folds:
        raise ValueError(f"If k_values contains 1, vectorization_within_folds must be False since k=1 implies standard hold-out cross-validation, for which vectorization_within_folds must be False")

    model_mapping = {"SVM": SVC, "NB": MultinomialNB , "RF": RandomForestClassifier}
    ModelClass = model_mapping[model_type]
    
    #Best classifier performance and number of stratified folds so far
    best_score = 0
    best_k = None

    #RF hyperparameters:
    best_depth = None
    depth_values = [5, 10, 15, 20, 25, None]

    #SVM hyperparameters:
    best_kernel = None
    best_C = None
    kernel_values = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]

    #NB hyperparameters:
    best_alpha = None
    best_fit_prior = None
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]
    
    if model_type == "SVM":
        results = tune_SVM(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]

    if model_type == "NB":
        results = tune_naive_bayes(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]

    if model_type == "RF":
        results = tune_random_forest(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]
    
    if show_class_accuracy:
        cm = confusion_matrix(y_test, y_pred)
        
        # Class names (assuming they are in the same order as in y_train or y_test)
        class_names = np.unique(y_test)  # This will give you the unique class labels
        
        # Calculate per-class accuracy: TP / (TP + FN)
        class_accuracies = cm.diagonal() / cm.sum(axis=1)
        
        # Print the accuracy for each class along with its name
        for i, acc in enumerate(class_accuracies):
            print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

    if show_confusion_matrix:
        cm = confusion_matrix(y_test, y_pred)
        class_names = np.unique(y_test)
        
        # Plotting the confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)
        
        # Label the axes
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix')
        
        # Display the plot
        plt.show()

# Testing Function

The code below serves only to test the _train_model_ function and to detect and remove bugs. <br>
The specific parameter values hold no significance.

In [None]:
train_model(df=df_clean,
            model_type = "RF", 
            testing_ratio = 0.15, 
            vectorization_within_folds = False, 
            show_class_accuracy = True, 
            show_confusion_matrix = True)

In [None]:
train_model(df=df_clean,
            model_type = "SVM", 
            testing_ratio = 0.15, 
            vectorization_within_folds = False, 
            show_class_accuracy = True, 
            show_confusion_matrix = True,
            k_values = [1,2,5,10,20])

In [None]:
train_model(df=df_clean,
            model_type = "NB", 
            testing_ratio = 0.2, 
            vectorization_within_folds = True, 
            show_class_accuracy = True, 
            show_confusion_matrix = True,
           k_values = [2,12,15])

# Showing percentages per class

First for SVM classification

Now for Naive Bayes

In [None]:
#NOTE: i have noticed that test performance tends to be higher for k>1 even though the validation score would suggest that k=1 is best. 
#I think this is because stratified cross validation generalizes better.
#How do I decide on k? I can't run everything over test set, that would turn test set into 2nd validation set.

# Testing BERTje word embeddings

**NOTE**: These are just basic tests to see if the word embeddings hold potential. <br>
The option to use word embeddings should be added to the train_model function above

First we test random forests

In [None]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(train_embeddings, y_train_bertje)
    val_score = classifier.score(val_embeddings, y_val_bertje)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Now we Test SVM:

In [None]:
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(train_embeddings, y_train_bertje)
        val_score = classifier.score(val_embeddings, y_val_bertje)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Now we test Naive Bayes (We test GaussianNB, since MultinomialNB does not work for continuous features)

In [None]:
best_var_smoothing = None
best_score = float('-inf')

# Test different hyperparameter values
var_smoothing_values = [10**-11, 10**-10, 10**-9, 10**-8, 10**-7]

for var_smoothing in var_smoothing_values:
    classifier = GaussianNB(var_smoothing=var_smoothing)
    classifier.fit(train_embeddings, y_train_bertje)
    val_score = classifier.score(val_embeddings, y_val_bertje)
    print(f"Var_smoothing: {var_smoothing}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_var_smoothing = var_smoothing

print(f"\nBest var_smoothing: {best_var_smoothing}, Best Validation Score: {best_score}")

# Train the final model using the best alpha and fit_prior
final_classifier = GaussianNB(var_smoothing=best_var_smoothing)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Surprisingly, these models all perform worse than their tf-idf counterparts. <br>
**TO DO**: Rename train_embeddings, val_embeddings, test_embeddings to X_train_bertje, X_val_bertje, X_test_bertje

**Idea**: Once pos_tags are used for classification, test and compare performance for:
> - Bag of Words <br>
> - TF-IDF <br>
> - BERTje <br>
> - mBERT (Multilingual BERT) <br>
> - RobBERT (Dutch RoBERTa model)
> - Word2Vec