# Imports

In [1]:
# !pip install PyMuPDF 
# !pip install transformers torch
# !pip install --upgrade ipywidgets

In [89]:
import fitz  # PyMuPDF
import torch
import logging
import spacy
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, LeaveOneOut, StratifiedKFold
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from scipy.sparse import hstack
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, TFAutoModel
from collections import Counter

In [3]:
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

# Loading Text From PDF File

In [4]:
pdf_path = "test article, zakelijke dienstverlening\\test_article_manually_extracted.pdf"
text = extract_text_from_pdf(pdf_path)
print(text)

Ook bij kredietverlening aan het mkb zijn bankpraktijken al jaren 
onacceptabel, en andere lezersreacties 
De lezers van het FD reageren deze week op het discutabele handelen van banken, hoe ‘Made in 
Germany’ verdwijnt, de bittere bijsmaak van graan en een goed getimed rapport. 
 
Vermeende nutsfunctie 
De hoogleraren Arnoud Boot en Harald Benink vragen terecht aandacht voor het discutabele 
optreden van banken (FD, 25 september). Ook bij de kredietverlening aan het mkb zijn bankpraktijken 
al jaren maatschappelijk onacceptabel. Zij verlenen geen krediet meer aan bepaalde sectoren, het 
serviceniveau daalde sterk, de adviesfunctie ontbreekt en zij vragen enorme zekerheden. 
Terwijl de banken zelf zo min mogelijk eigen vermogen willen aanhouden, worden bij bestaande 
klanten de duimschroeven snel aangedraaid als de liquiditeit krapper wordt. Niet omdat banken risico 
dragen, want er zijn immers panden, voorraden en andere tegoeden. Het draait louter om het 
winstgedreven verdienmodel v

# Dataset Construction

In [5]:
directory = 'data'
df = pd.DataFrame(columns=['article_id' , 'paragraph_id', 'text', 'group', 'publication_date'])

print('Please ensure that only pdf files of articles are present in the subfolders of the specified directory')
article_nr = 1
for folder in os.listdir(directory):
    folder_size = len(os.listdir(directory + "\\" + folder))
    print(f'{folder_size} article(s) detected in {folder} folder')
    
    for article in os.listdir(directory + '\\' + folder):
        text = extract_text_from_pdf(directory + '\\' + folder + '\\' + article)
        date = article.split(' ')[-1].split('.')[0] #Remove the article number and ".pdf" to obtain the publication date
        
        paragraphs = [para.strip() for para in text.split("\n \n") if para.strip()]
        para_nr = 1
        for para in paragraphs:
            df_temp = pd.DataFrame([[article_nr, para_nr, para, folder, date]], 
                                   columns=['article_id' , 'paragraph_id', 'text', 'group', 'publication_date'])
            df = pd.concat([df, df_temp])
            para_nr += 1
        article_nr += 1
        
df.set_index(['article_id' , 'paragraph_id'], inplace=True)
df['publication_date'] = pd.to_datetime(df['publication_date'], format='%d-%m-%Y')

Please ensure that only pdf files of articles are present in the subfolders of the specified directory
26 article(s) detected in Bouw & Vastgoed folder
37 article(s) detected in Handel & Industrie folder
37 article(s) detected in Zakelijke Dienstverlening folder
20 article(s) detected in Zorg folder


In [6]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,text,group,publication_date
article_id,paragraph_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,Provincies willen aan de slag met versoepeling...,Bouw & Vastgoed,2024-08-28
1,2,Het draait allemaal om de drempelwaarde voor e...,Bouw & Vastgoed,2024-08-28
1,3,Met een hogere drempelwaarde zouden minder ver...,Bouw & Vastgoed,2024-08-28
1,4,In het hoofdlijnenakkoord hebben de vier coali...,Bouw & Vastgoed,2024-08-28
1,5,De ondergrens is al langer onderwerp van discu...,Bouw & Vastgoed,2024-08-28
...,...,...,...,...
118,2,Telgenkamp vestigt haar hoop voor de korte ter...,Zorg,2024-10-08
119,1,Waarom verzekeraars inkomsten uit zwart werk w...,Zorg,2024-10-17
120,1,Verzekeraar wil klant helpen met zorgbemiddeli...,Zorg,2024-10-16
120,2,Verzekeraar wil wachtende patiënt aan snelle z...,Zorg,2024-10-16


In [7]:
for group in df['group'].unique():
    print(f"{len(df[df['group'] == group])} paragraphs in {group}.")

124 paragraphs in Bouw & Vastgoed.
114 paragraphs in Handel & Industrie.
83 paragraphs in Zakelijke Dienstverlening.
53 paragraphs in Zorg.


# Text pre-processing

Firstly, we load the nl_core_news_sm model and specify that [NEWLINE] should be treated as a single token.

In [8]:
# Download the Dutch POS model by uncommenting the line below

# !python -m spacy download nl_core_news_sm

In [9]:
# Load the 'nl_core_news_sm' model
nlp = spacy.load('nl_core_news_sm')

# Add [NEWLINE] as a single token so that it is not split into 3 seperate tokens
special_cases = {"[NEWLINE]": [{"ORTH": "[NEWLINE]"}]}
nlp.tokenizer.add_special_case("[NEWLINE]", [{"ORTH": "[NEWLINE]"}])

A pre-processed dataset df_clean is constructed out of df:

In [10]:
df_clean = df.copy()
df_clean['original_text'] = df_clean['text'].copy()
df_clean = df_clean[['original_text', 'text', 'group']]

**Case Normalization**: <br>
- Lowercasing
- Replacing \n with '[NEWLINE] ' 
- Removing duplicate spaces

In [11]:
def case_normalization(text):
    """Returns string of input containing only lowercase letters apart from [NEWLINE], which replaces \n"""
    text = text.lower()
    text = text.replace('\n', ' [NEWLINE] ')
    while text != text.replace('  ', ' '):
        text = text.replace('  ', ' ')
    return text

df_clean['text'] = df_clean['text'].apply(case_normalization)

**Punctuation Removal**

In [12]:
def remove_punctuation(text):
    """Returns the input text with all punctuation removed"""
    
    text = text.translate(text.maketrans("", "", string.punctuation))
    text = text.replace("NEWLINE", "[NEWLINE]")
    return text

df_clean['text'] = df_clean['text'].apply(remove_punctuation)

**Stop Word Removal**

Remove words that do not add semantic meaning to the text

In [13]:
# Sample text
text = "De snelle bruine vos springt over de luie hond."

# Process the text using spaCy
doc = nlp(text)

# Filter out stopwords
filtered_words = [token.text for token in doc if not token.is_stop]

# Join the filtered words back into a single string
text = " ".join(filtered_words)

# Print the result
print(text)

snelle bruine vos springt luie hond .


In [14]:
def remove_stopwords(text):
    """Returns string of input text with stopwords removed"""
    
    doc = nlp(text)
    filtered_words = [token.text for token in doc if not token.is_stop]
    text = " ".join(filtered_words)
    return text
    
    
# nlp = spacy.load("nl_core_news_sm")
df_clean['text'] = df_clean['text'].apply(remove_stopwords)

**POS Tagging**. <br>
<br>
There are 2 types of POS tagging: <br>
- Rule-based POS tagging
- Statistical POS tagging 

**Benefits** of **rule-based** Part-of-speech (POS) tagging:
- Simple to implement and understand
- It doesn’t require a lot of computational resources or training data
- It can be easily customized to specific domains or languages

**Disadvantages** of **rule-based** Part-of-speech (POS) tagging:
- Less accurate than statistical taggers
- Limited by the quality and coverage of the rules
- It can be difficult to maintain and update

**Benefits** of **Statistical** Part-of-speech (POS) Tagging:
- More accurate than rule-based taggers
- Don’t require a lot of human-written rules
- Can learn from large amounts of training data

**Disadvantages** of **statistical** Part-of-speech (POS) Tagging:
- Requires more computational resources and training data
- It can be difficult to interpret and debug
- Can be sensitive to the quality and diversity of the training data

I select Statistical POS tagging since the accuracy tends to be higher and since pre-trained POS-models are avilable, the requirement for a lot of training data is no problem. Additionally, the required computational power is no problem due to the small size of the used data for this project. <br>
For more information on the used model, see https://github.com/evanmiltenburg/Dutch-tagger

In [15]:
# # Define a sentence
# text = "Ik ben een student"

# # Process the sentence using spaCy's NLP pipeline
# doc = nlp(text)

# # Iterate through the token and print the token text and POS tag
# for token in doc:
#     print(token.text, token.pos_)

In [16]:
# def POS_tagging(text):
#     """Returns a list of (token, POS tag) tuples for the input text"""
#     doc = nlp(text)
#     pos_tags = [(token.text, token.pos_) for token in doc]
#     return pos_tags

# df_clean['pos_tags'] = df_clean['text'].apply(POS_tagging)

**Lemmatization**

In [17]:
# Sample text
text = "De katten liepen in de tuin."

# Process the text
doc = nlp(text)

# Print PoS tagging and Lemmatization for each token
print(f"{'Token':<15}{'PoS':<15}{'Lemma':<15}")
print("-" * 45)
for token in doc:
    print(f"{token.text:<15}{token.pos_:<15}{token.lemma_:<15}")

Token          PoS            Lemma          
---------------------------------------------
De             DET            de             
katten         NOUN           kat            
liepen         VERB           liepen         
in             ADP            in             
de             DET            de             
tuin           NOUN           tuin           
.              PUNCT          .              


In [18]:
doc = nlp(df_clean.loc[(1,1),'text'])

for token in doc:
    print(token.text, token.pos_, token.lemma_)
    print('')

provincies NOUN provincie

willen VERB willen

slag NOUN slag

versoepeling NOUN versoepeling

stikstofregels NOUN stikstofregel

[NEWLINE] SYM [NEWLINE]

kabinet NOUN kabinet

beoogde VERB beoogen

hogere ADJ hoog

drempelwaarde NOUN drempelwaarde

lijkt VERB lijken

rapport NOUN rapport

[NEWLINE] PRON [NEWLINE]

provincies NOUN provincie

willen VERB willen

‘ PUNCT ‘

voortvarend VERB voortvaren

’ NUM ’

slag NOUN slag

versoepeling NOUN versoepeling

stikstofregels NOUN stikstofregel

[NEWLINE] SYM [NEWLINE]

waarmee ADV waarmee

nieuwe ADJ nieuw

kabinet NOUN kabinet

nederland PROPN Nederland

slot NOUN slot

krijgen VERB krijgen

aannemelijk ADJ aannemelijk

[NEWLINE] SYM [NEWLINE]

belangrijke ADJ belangrijk

horde ADJ horde

stikstofcrisis ADJ stikstofcrisis

groot ADJ groot

aangenomen VERB aannemen

oordelen NOUN oordelen

[NEWLINE] SYM [NEWLINE]

wetenschappers NOUN wetenschapper

tno PRON tno

universiteit NOUN universiteit

amsterdam PROPN Amsterdam

onderzoek NOUN onde

In [19]:
def lemmatization(df, text_column="text", output_column="text"):
    """Lemmatizes the text in a specified column of a DataFrame and adds the results to a new column."""
    
    # Ensure the input column exists in the DataFrame
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' does not exist in the DataFrame.")
        
    # Apply SpaCy processing and lemmatization
    df[output_column] = df[text_column].apply(
        lambda text: " ".join([token.lemma_ for token in nlp(text) if not token.is_punct and not token.is_space]))
    
    return df

df_clean['text before lemmatization'] = df_clean['text'].copy()
df_clean = lemmatization(df_clean, text_column="text")
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,original_text,text,group,text before lemmatization
article_id,paragraph_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,Provincies willen aan de slag met versoepeling...,provincie willen slag versoepeling stikstofreg...,Bouw & Vastgoed,provincies willen slag versoepeling stikstofre...
1,2,Het draait allemaal om de drempelwaarde voor e...,draaien allemaal drempelwaran stikstofvergunni...,Bouw & Vastgoed,draait allemaal drempelwaarde stikstofvergunni...
1,3,Met een hogere drempelwaarde zouden minder ver...,hoog drempelwaard vergunning [NEWLINE] aangevo...,Bouw & Vastgoed,hogere drempelwaarde vergunningen [NEWLINE] aa...
1,4,In het hoofdlijnenakkoord hebben de vier coali...,hoofdlijnenakkoord vier coalitiepartij afsprek...,Bouw & Vastgoed,hoofdlijnenakkoord vier coalitiepartijen afges...
1,5,De ondergrens is al langer onderwerp van discu...,ondergren lang onderwerp discussie huidig Nede...,Bouw & Vastgoed,ondergrens langer onderwerp discussie huidige ...
...,...,...,...,...,...
118,2,Telgenkamp vestigt haar hoop voor de korte ter...,telgenkamp vestigen hoop kort termijn twee cru...,Zorg,telgenkamp vestigt hoop korte termijn twee cru...
119,1,Waarom verzekeraars inkomsten uit zwart werk w...,verzekeraar inkomst zwart werk vergoeden [NEWL...,Zorg,verzekeraars inkomsten zwart werk vergoeden [N...
120,1,Verzekeraar wil klant helpen met zorgbemiddeli...,verzekeraar klant helpen zorgbemiddeling [NEWL...,Zorg,verzekeraar klant helpen zorgbemiddeling [NEWL...
120,2,Verzekeraar wil wachtende patiënt aan snelle z...,verzekeraar wachten patiënt snel zorg helpen [...,Zorg,verzekeraar wachtende patiënt snelle zorg help...


**POS Tagging**

In [20]:
def POS_tagging(text):
    """Returns a list of (token, POS tag) tuples for the input text"""
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    return pos_tags

df_clean['pos_tags'] = df_clean['text'].apply(POS_tagging)

# Testing for POS and lemmatization

Test if performance increases when POS tags are explicitely used to reinforce lemmatization

**NOTE:** POS-tagging occurs twice in the pre-processing: Once before lemmatization and once after. The first POS-tagging results are used to reinforce the lemmatization by providing more detailed input. After Lemmatization, POS-tagging are once again obtained to ensure that the final POS-tags match the final text.

In [39]:
df_clean_experimental = df_clean.copy()

In [48]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk

In [55]:
# Uncomment and execute the 2 lines below to install the required nltk files, which only needs to be done once.

# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [49]:
# Initialize NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

In [30]:
# Function to convert spaCy POS to WordNet POS (needed for accurate lemmatization)
def spacy_to_wordnet_pos(spacy_pos):
    if spacy_pos.startswith('N'):  # Noun
        return wordnet.NOUN
    elif spacy_pos.startswith('V'):  # Verb
        return wordnet.VERB
    elif spacy_pos.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif spacy_pos.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

In [46]:
def lemmatize_with_pos(pos_tags):
    return " ".join([lemmatizer.lemmatize(word, spacy_to_wordnet_pos(pos)) for word, pos in pos_tags])

In [33]:
def get_pos_tags_after_lemmatization(lemmatized_text):
    doc = nlp(lemmatized_text)
    return [(token.text, token.pos_) for token in doc]

In [45]:
df_clean_experimental['pos_tags_before_lemmatization'] = df_clean_experimental['text before lemmatization'].apply(POS_tagging)
print(df_clean_experimental['pos_tags_before_lemmatization'])

article_id  paragraph_id
1           1               [(provincies, NOUN), (willen, VERB), (slag, NO...
            2               [(draait, VERB), (allemaal, ADV), (drempelwaar...
            3               [(hogere, ADJ), (drempelwaarde, VERB), (vergun...
            4               [(hoofdlijnenakkoord, INTJ), (vier, NUM), (coa...
            5               [(ondergrens, NOUN), (langer, ADJ), (onderwerp...
                                                  ...                        
118         2               [(telgenkamp, NOUN), (vestigt, VERB), (hoop, N...
119         1               [(verzekeraars, NOUN), (inkomsten, NOUN), (zwa...
120         1               [(verzekeraar, ADJ), (klant, NOUN), (helpen, V...
            2               [(verzekeraar, ADJ), (wachtende, VERB), (patië...
            3               [(zorgbemiddeling, NOUN), (wondermiddel, NOUN)...
Name: pos_tags_before_lemmatization, Length: 374, dtype: object


In [56]:
df_clean_experimental['text'] = df_clean_experimental['pos_tags_before_lemmatization'].apply(lemmatize_with_pos)
print(df_clean_experimental['text'])

article_id  paragraph_id
1           1               provincies willen slag versoepeling stikstofre...
            2               draait allemaal drempelwaarde stikstofvergunni...
            3               hogere drempelwaarde vergunningen [NEWLINE] aa...
            4               hoofdlijnenakkoord vier coalitiepartijen afges...
            5               ondergrens langer onderwerp discussie huidige ...
                                                  ...                        
118         2               telgenkamp vestigt hoop korte termijn twee cru...
119         1               verzekeraars inkomsten zwart werk vergoeden [N...
120         1               verzekeraar klant helpen zorgbemiddeling [NEWL...
            2               verzekeraar wachtende patiënt snelle zorg help...
            3               zorgbemiddeling wondermiddel helpen zegt haarl...
Name: text, Length: 374, dtype: object


In [59]:
df_clean_experimental['pos_tags_after_lemmatization'] = df_clean_experimental['text'].apply(get_pos_tags_after_lemmatization)
print(df_clean_experimental['pos_tags_after_lemmatization'])

article_id  paragraph_id
1           1               [(provincies, NOUN), (willen, VERB), (slag, NO...
            2               [(draait, VERB), (allemaal, ADV), (drempelwaar...
            3               [(hogere, ADJ), (drempelwaarde, VERB), (vergun...
            4               [(hoofdlijnenakkoord, INTJ), (vier, NUM), (coa...
            5               [(ondergrens, NOUN), (langer, ADJ), (onderwerp...
                                                  ...                        
118         2               [(telgenkamp, NOUN), (vestigt, VERB), (hoop, N...
119         1               [(verzekeraars, NOUN), (inkomsten, NOUN), (zwa...
120         1               [(verzekeraar, ADJ), (klant, NOUN), (helpen, V...
            2               [(verzekeraar, ADJ), (wachtende, VERB), (patië...
            3               [(zorgbemiddeling, NOUN), (wondermiddel, NOUN)...
Name: pos_tags_after_lemmatization, Length: 374, dtype: object


**POS Normalization**

In [None]:
#For now, we do not implement this since it clashes with NER. 
#If we need to reduce the dimension of the data, we can implement it later

**Dependency Parsing**

In [None]:
#May overcomplicate the data, check if it improves performance

**NER**

**Stemming** (Altenative for Lemmatization. Check when this should be done in the pre-processing pipeline)

**Resulting DataFrame**

In [21]:
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,original_text,text,group,text before lemmatization,pos_tags
article_id,paragraph_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,Provincies willen aan de slag met versoepeling...,provincie willen slag versoepeling stikstofreg...,Bouw & Vastgoed,provincies willen slag versoepeling stikstofre...,"[(provincie, NOUN), (willen, VERB), (slag, NOU..."
1,2,Het draait allemaal om de drempelwaarde voor e...,draaien allemaal drempelwaran stikstofvergunni...,Bouw & Vastgoed,draait allemaal drempelwaarde stikstofvergunni...,"[(draaien, VERB), (allemaal, ADV), (drempelwar..."
1,3,Met een hogere drempelwaarde zouden minder ver...,hoog drempelwaard vergunning [NEWLINE] aangevo...,Bouw & Vastgoed,hogere drempelwaarde vergunningen [NEWLINE] aa...,"[(hoog, ADJ), (drempelwaard, NOUN), (vergunnin..."
1,4,In het hoofdlijnenakkoord hebben de vier coali...,hoofdlijnenakkoord vier coalitiepartij afsprek...,Bouw & Vastgoed,hoofdlijnenakkoord vier coalitiepartijen afges...,"[(hoofdlijnenakkoord, PROPN), (vier, NUM), (co..."
1,5,De ondergrens is al langer onderwerp van discu...,ondergren lang onderwerp discussie huidig Nede...,Bouw & Vastgoed,ondergrens langer onderwerp discussie huidige ...,"[(ondergren, VERB), (lang, ADJ), (onderwerp, N..."
...,...,...,...,...,...,...
118,2,Telgenkamp vestigt haar hoop voor de korte ter...,telgenkamp vestigen hoop kort termijn twee cru...,Zorg,telgenkamp vestigt hoop korte termijn twee cru...,"[(telgenkamp, NOUN), (vestigen, VERB), (hoop, ..."
119,1,Waarom verzekeraars inkomsten uit zwart werk w...,verzekeraar inkomst zwart werk vergoeden [NEWL...,Zorg,verzekeraars inkomsten zwart werk vergoeden [N...,"[(verzekeraar, ADJ), (inkomst, NOUN), (zwart, ..."
120,1,Verzekeraar wil klant helpen met zorgbemiddeli...,verzekeraar klant helpen zorgbemiddeling [NEWL...,Zorg,verzekeraar klant helpen zorgbemiddeling [NEWL...,"[(verzekeraar, ADJ), (klant, NOUN), (helpen, V..."
120,2,Verzekeraar wil wachtende patiënt aan snelle z...,verzekeraar wachten patiënt snel zorg helpen [...,Zorg,verzekeraar wachtende patiënt snelle zorg help...,"[(verzekeraar, NOUN), (wachten, VERB), (patiën..."


In [60]:
df_clean_experimental

Unnamed: 0_level_0,Unnamed: 1_level_0,original_text,text,group,text before lemmatization,pos_tags,pos_tags_before,pos_tags_before_lemmatization,pos_tags_after_lemmatization
article_id,paragraph_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,Provincies willen aan de slag met versoepeling...,provincies willen slag versoepeling stikstofre...,Bouw & Vastgoed,provincies willen slag versoepeling stikstofre...,"[(provincie, NOUN), (willen, VERB), (slag, NOU...","[(provincies, NOUN), (willen, VERB), (slag, NO...","[(provincies, NOUN), (willen, VERB), (slag, NO...","[(provincies, NOUN), (willen, VERB), (slag, NO..."
1,2,Het draait allemaal om de drempelwaarde voor e...,draait allemaal drempelwaarde stikstofvergunni...,Bouw & Vastgoed,draait allemaal drempelwaarde stikstofvergunni...,"[(draaien, VERB), (allemaal, ADV), (drempelwar...","[(draait, VERB), (allemaal, ADV), (drempelwaar...","[(draait, VERB), (allemaal, ADV), (drempelwaar...","[(draait, VERB), (allemaal, ADV), (drempelwaar..."
1,3,Met een hogere drempelwaarde zouden minder ver...,hogere drempelwaarde vergunningen [NEWLINE] aa...,Bouw & Vastgoed,hogere drempelwaarde vergunningen [NEWLINE] aa...,"[(hoog, ADJ), (drempelwaard, NOUN), (vergunnin...","[(hogere, ADJ), (drempelwaarde, VERB), (vergun...","[(hogere, ADJ), (drempelwaarde, VERB), (vergun...","[(hogere, ADJ), (drempelwaarde, VERB), (vergun..."
1,4,In het hoofdlijnenakkoord hebben de vier coali...,hoofdlijnenakkoord vier coalitiepartijen afges...,Bouw & Vastgoed,hoofdlijnenakkoord vier coalitiepartijen afges...,"[(hoofdlijnenakkoord, PROPN), (vier, NUM), (co...","[(hoofdlijnenakkoord, INTJ), (vier, NUM), (coa...","[(hoofdlijnenakkoord, INTJ), (vier, NUM), (coa...","[(hoofdlijnenakkoord, INTJ), (vier, NUM), (coa..."
1,5,De ondergrens is al langer onderwerp van discu...,ondergrens langer onderwerp discussie huidige ...,Bouw & Vastgoed,ondergrens langer onderwerp discussie huidige ...,"[(ondergren, VERB), (lang, ADJ), (onderwerp, N...","[(ondergrens, NOUN), (langer, ADJ), (onderwerp...","[(ondergrens, NOUN), (langer, ADJ), (onderwerp...","[(ondergrens, NOUN), (langer, ADJ), (onderwerp..."
...,...,...,...,...,...,...,...,...,...
118,2,Telgenkamp vestigt haar hoop voor de korte ter...,telgenkamp vestigt hoop korte termijn twee cru...,Zorg,telgenkamp vestigt hoop korte termijn twee cru...,"[(telgenkamp, NOUN), (vestigen, VERB), (hoop, ...","[(telgenkamp, NOUN), (vestigt, VERB), (hoop, N...","[(telgenkamp, NOUN), (vestigt, VERB), (hoop, N...","[(telgenkamp, NOUN), (vestigt, VERB), (hoop, N..."
119,1,Waarom verzekeraars inkomsten uit zwart werk w...,verzekeraars inkomsten zwart werk vergoeden [N...,Zorg,verzekeraars inkomsten zwart werk vergoeden [N...,"[(verzekeraar, ADJ), (inkomst, NOUN), (zwart, ...","[(verzekeraars, NOUN), (inkomsten, NOUN), (zwa...","[(verzekeraars, NOUN), (inkomsten, NOUN), (zwa...","[(verzekeraars, NOUN), (inkomsten, NOUN), (zwa..."
120,1,Verzekeraar wil klant helpen met zorgbemiddeli...,verzekeraar klant helpen zorgbemiddeling [NEWL...,Zorg,verzekeraar klant helpen zorgbemiddeling [NEWL...,"[(verzekeraar, ADJ), (klant, NOUN), (helpen, V...","[(verzekeraar, ADJ), (klant, NOUN), (helpen, V...","[(verzekeraar, ADJ), (klant, NOUN), (helpen, V...","[(verzekeraar, ADJ), (klant, NOUN), (helpen, V..."
120,2,Verzekeraar wil wachtende patiënt aan snelle z...,verzekeraar wachtende patiënt snelle zorg help...,Zorg,verzekeraar wachtende patiënt snelle zorg help...,"[(verzekeraar, NOUN), (wachten, VERB), (patiën...","[(verzekeraar, ADJ), (wachtende, VERB), (patië...","[(verzekeraar, ADJ), (wachtende, VERB), (patië...","[(verzekeraar, ADJ), (wachtende, VERB), (patië..."


# POS-tags one-hot encoding

In [86]:
def pos_to_features(pos_tags):
    """Convert list of (word, POS) tuples into a dictionary of POS tag counts."""
    pos_counts = Counter(tag for _, tag in pos_tags)
    return dict(pos_counts)

# df_clean_experimental["pos_features"] = df_clean_experimental["pos_tags_after_lemmatization"].apply(pos_to_features)

# # Convert to feature matrix
# pos_vectorizer = DictVectorizer(sparse=True)
# X_pos = pos_vectorizer.fit_transform(df_clean_experimental["pos_features"])

In [94]:
X = df_clean_experimental[['text', 'pos_tags_after_lemmatization']]  # Feature: text column
# y = df_clean_experimental['group']  # Label: group column

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_clean_experimental["group"])

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Split the data into text and POS-tags
X_train_text, X_train_pos = X_train[['text']], X_train[['pos_tags_after_lemmatization']]
X_val_text, X_val_pos = X_val[['text']], X_val[['pos_tags_after_lemmatization']]
X_test_text, X_test_pos = X_test[['text']], X_test[['pos_tags_after_lemmatization']]

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_text = vectorizer.fit_transform(X_train_text)
X_val_text = vectorizer.transform(X_val_text)
X_test_text = vectorizer.transform(X_test_text)

# Convert pos tags to feature matrix
pos_vectorizer = DictVectorizer(sparse=True)
scaler = MaxAbsScaler()

X_train_pos["pos_features"] = X_train_pos["pos_tags_after_lemmatization"].apply(pos_to_features)
X_train_pos = pos_vectorizer.fit_transform(X_train_pos["pos_features"])
X_train_pos = scaler.fit_transform(X_train_pos)

X_val_pos["pos_features"] = X_val_pos["pos_tags_after_lemmatization"].apply(pos_to_features)
X_val_pos = pos_vectorizer.fit_transform(X_val_pos["pos_features"])
X_val_pos = scaler.fit_transform(X_val_pos)

X_test_pos["pos_features"] = X_test_pos["pos_tags_after_lemmatization"].apply(pos_to_features)
X_test_pos = pos_vectorizer.fit_transform(X_test_pos["pos_features"])
X_test_pos = scaler.fit_transform(X_test_pos)

In [96]:
# Combine features

X_train = hstack([X_train_text, X_train_pos])
X_train

ValueError: Mismatching dimensions along axis 0: {1, 261}

In [95]:
X_train_text

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

# Vector Transformation

For now, I only test TF-IDF

In [None]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

In [None]:
# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
X_train_tfidf

In [None]:
X_val_tfidf

In [None]:
X_test_tfidf

**Now I test BERTje Embeddings**

In [None]:
df_clean_bertje = df_clean.copy()

# Load BERTje tokenizer and model
MODEL_NAME = "GroNLP/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Encode labels (convert string labels to integers)
label_mapping = {label: idx for idx, label in enumerate(df_clean_bertje["group"].unique())}
df_clean_bertje["label"] = df_clean_bertje["group"].map(label_mapping)

# Split dataset
X_train_bertje, X_temp_bertje, y_train_bertje, y_temp_bertje = train_test_split(
    df_clean_bertje["text"].tolist(), df_clean_bertje["label"].tolist(), test_size=0.3, random_state=42)
X_val_bertje, X_test_bertje, y_val_bertje, y_test_bertje = train_test_split(X_temp_bertje, y_temp_bertje, test_size=0.5, random_state=42)

In [None]:
def get_bert_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Convert text data into embeddings
train_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_train_bertje]).numpy()
val_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_val_bertje]).numpy()
test_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_test_bertje]).numpy()

In [None]:
# Test, remove me

def get_bert_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Convert text data into embeddings
train_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_train_bertje]).numpy()
val_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_val_bertje]).numpy()
test_embeddings = torch.stack([torch.tensor(get_bert_embedding(text)) for text in X_test_bertje]).numpy()

**Now I test Word2Vec Embeddings**

# Random Forest Classifier

In [None]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(X_train_tfidf, y_train)
    val_score = classifier.score(X_val_tfidf, y_val)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Support Vector Machine

In [None]:
# Tune the SVM hyperparameters using the validation set
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(X_train_tfidf, y_train)
        val_score = classifier.score(X_val_tfidf, y_val)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Naive Bayes

In [None]:
# Tune the Naive Bayes hyperparameters using the validation set
best_alpha = None
best_fit_prior = None
best_score = float('-inf')

# Test different hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]

for fit_prior_value in fit_prior_values:
    for alpha in alpha_values:
        classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
        classifier.fit(X_train_tfidf, y_train)
        val_score = classifier.score(X_val_tfidf, y_val)
        print(f"Alpha: {alpha}, fit_prior: {fit_prior_value}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_alpha = alpha
            best_fit_prior = fit_prior_value

print(f"\nBest alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best Validation Score: {best_score}")

# Train the final model using the best alpha and fit_prior
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Test the model
y_pred = final_classifier.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

Now we show the accuracy per class and visualize them as a confusion matrix

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

# Leave-one-out cross validation

In [None]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize Leave-One-Out Cross-Validation
loo = LeaveOneOut()

# Hyperparameter tuning with LOO-CV
best_alpha = None
best_fit_prior = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]

# Try different hyperparameter combinations
for fit_prior_value in fit_prior_values:
    for alpha in alpha_values:
        scores = []
        
        for train_index, val_index in loo.split(X_train_tfidf):
            X_train_cv, X_val = X_train_tfidf[train_index], X_train_tfidf[val_index]
            y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
            
            classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
            classifier.fit(X_train_cv, y_train_cv)
            
            y_pred = classifier.predict(X_val)
            scores.append(accuracy_score(y_val, y_pred))
        
        mean_score = np.mean(scores)
        print(f"Alpha: {alpha}, fit_prior: {fit_prior_value}, LOO-CV Score: {mean_score}")
        
        if mean_score > best_score:
            best_score = mean_score
            best_alpha = alpha
            best_fit_prior = fit_prior_value

print(f"\nBest alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best LOO-CV Score: {best_score}")

# Train the final model using the best hyperparameters
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Evaluate on the test set
y_pred = final_classifier.predict(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

In [None]:
# This leads to worse performance. Likely since Leave-one-out cross validation tends to create High-Variance Models.
# Instead, I will use stratified K-fold cross validation. K will be treated as a hyperparameter.

In [None]:
# Extract features and labels
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Convert text to TF-IDF representation
vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Hyperparameter tuning with Stratified K-Fold CV
best_alpha = None
best_fit_prior = None
best_k = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]
k_values = [2, 3, 5, 10, 20]  # Different values for K in StratifiedKFold

# Try different hyperparameter combinations
for k in k_values:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    
    for fit_prior_value in fit_prior_values:
        for alpha in alpha_values:
            scores = []
            
            for train_index, val_index in skf.split(X_train_tfidf, y_train):
                X_train_cv, X_val = X_train_tfidf[train_index], X_train_tfidf[val_index]
                y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
                
                classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                classifier.fit(X_train_cv, y_train_cv)
                
                y_pred = classifier.predict(X_val)
                scores.append(accuracy_score(y_val, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_alpha = alpha
                best_fit_prior = fit_prior_value
                best_k = k

print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")

# Train the final model using the best hyperparameters
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Evaluate on the test set
y_pred = final_classifier.predict(X_test_tfidf)
y_pred_prob = final_classifier.predict_proba(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
y_pred_prob

for row in y_pred_prob:
    formatted_row = ["{:.4f}".format(val) for val in row]
    print(formatted_row)

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (assuming they are in the same order as in y_train or y_test)
class_names = np.unique(y_test)  # This will give you the unique class labels

# Calculate per-class accuracy: TP / (TP + FN)
class_accuracies = cm.diagonal() / cm.sum(axis=1)

# Print the accuracy for each class along with its name
for i, acc in enumerate(class_accuracies):
    print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Class names (from y_test)
class_names = np.unique(y_test)

# Plotting the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)

# Label the axes
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')

# Display the plot
plt.show()

In [None]:
#Note, need to ensure TF-IDF vectorization happens within each fold to prevent leakage.

# TF-IDF vectorization within folds to avoid data leakage

Currently, vectorizations occurs over the full training set. <br>
However, we train K models, 1 for each fold. <br>
This means that for each fold, vectorizations should occur for the training data for that specific fold. <br>
This avoids data leakage from our validation set to our training set. <br>
Note that this is not strictly needed (since not separately within each fold is usually acceptable), but it should slightly improve performance at the cost of additional runtime.

In [None]:
# Assume df_clean is already defined with 'text' and 'group' columns
X = df_clean['text']  # Feature: text column
y = df_clean['group']  # Label: group column

# Split the data into training (85%) and test (15%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Hyperparameter tuning with Stratified K-Fold CV
best_alpha = None
best_fit_prior = None
best_k = None
best_score = float('-inf')

# Define hyperparameter values
alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
fit_prior_values = [True, False]
k_values = [2, 3, 5, 10, 20]  # Different values for K in StratifiedKFold

# Try different hyperparameter combinations
for k in k_values:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    
    for fit_prior_value in fit_prior_values:
        for alpha in alpha_values:
            scores = []
            
            for train_index, val_index in skf.split(X_train, y_train):
                # Split the raw text data for the current fold
                X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                
                # Vectorize within the fold: fit on training fold, transform validation fold
                vectorizer = TfidfVectorizer(max_features=10000)
                X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                X_val_fold_tfidf = vectorizer.transform(X_val_fold)
                
                # Initialize and train the classifier
                classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                classifier.fit(X_train_fold_tfidf, y_train_fold)
                
                # Validate the model
                y_pred = classifier.predict(X_val_fold_tfidf)
                scores.append(accuracy_score(y_val_fold, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_alpha = alpha
                best_fit_prior = fit_prior_value
                best_k = k

print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")

# Final model training on the entire training set using the best hyperparameters
# Here, we fit the vectorizer on the full training set
final_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = final_vectorizer.fit_transform(X_train)
final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
final_classifier.fit(X_train_tfidf, y_train)

# Transform the test set using the vectorizer fitted on the entire training set
X_test_tfidf = final_vectorizer.transform(X_test)
y_pred = final_classifier.predict(X_test_tfidf)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred))

# Turning model tuning into function

Since the process of tuning a classifier tends to not change much, we create a function for every type of classifier so that we can tune them without needing to re-write the code every time.

In [None]:
def tune_random_forest(df: pd.DataFrame,
                       testing_ratio: float = 0.15, 
                       vectorization_within_folds: bool = False,
                       k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the Random Forest using stratified K-fold cross validation
    best_depth = None
    best_score = 0
    depth_values = [5, 10, 15, 20, 25, None]  # Different depths to test
    # depth_values = [5, 10]  # For faster tests
    
    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for depth in depth_values:
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_tfidf, y_train)
                y_pred = classifier.predict(X_val_tfidf)
                mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                print(f"K: {k}, Depth: {depth}, Validation Accuracy: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_depth = depth
                    best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
        
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for depth in depth_values:
            scores = []

            if vectorization_within_folds:
                for train_index, val_index in skf.split(X_train, y_train):
                    # Split the raw text data for the current fold
                    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                vectorizer = TfidfVectorizer(max_features=10000)
                X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                X_val_fold_tfidf = vectorizer.transform(X_val_fold)

                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold_tfidf, y_train_fold)
                y_pred = classifier.predict(X_val_fold_tfidf)

            if not vectorization_within_folds:
                for train_index, val_index in skf.split(X_train_tfidf, y_train):
                    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                
                classifier = RandomForestClassifier(max_depth=depth, random_state=42)
                classifier.fit(X_train_fold, y_train_fold)
                y_pred = classifier.predict(X_val_fold)
                    
            scores.append(accuracy_score(y_val_fold, y_pred))
            
            mean_score = np.mean(scores)
            print(f"K: {k}, Depth: {depth}, StratifiedKFold Score: {mean_score}")
            
            if mean_score > best_score:
                best_score = mean_score
                best_depth = depth
                best_k = k
    
    print(f"\nBest K: {best_k}, Best depth: {best_depth}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

In [None]:
def tune_SVM(df: pd.DataFrame,
             testing_ratio: float = 0.15, 
             vectorization_within_folds: bool = False,
             k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_kernel = None
    best_C = None
    best_score = 0
    
    kernels = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]    

    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for C in C_values:
                for kernel in kernels:
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, C: {C}, Kernel: {kernel}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_C = C
                        best_kernel = kernel
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for C in C_values:
            for kernel in kernels:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = SVC(kernel=kernel, C=C, random_state=42)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, C: {C}, Kernel: {kernel}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_C = C
                    best_kernel = kernel
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best C: {best_C}, Best kernel: {best_kernel}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

In [None]:
def tune_naive_bayes(df: pd.DataFrame,
                     testing_ratio: float = 0.15, 
                     vectorization_within_folds: bool = False,
                     k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    # Tune the hyperparameters of the SVM classifier using stratified K-fold cross validation
    best_alpha = None
    best_fit_prior = None
    best_score = 0
    
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]  

    # Extract features and labels
    X = df['text']  # Feature: text column
    y = df['group']  # Label: group column
    
    # Split the data into training (85%) and test (15%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    if not vectorization_within_folds:
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
    
    # Try different hyperparameter combinations
    for k in k_values:
        
        if k == 1: # If k=1, we use standard hold-out cross-validation
            X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=testing_ratio*2, random_state=42) # testing_ratio is multiplied by 2 since it is split into validation and test sets after
            X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
            
            vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_val_tfidf = vectorizer.transform(X_val)
            X_test_tfidf = vectorizer.transform(X_test)
            
            for alpha in alpha_values:
                for fit_prior_value in fit_prior_values:
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_tfidf, y_train)
                    y_pred = classifier.predict(X_val_tfidf)
                    mean_score = accuracy_score(y_val, y_pred)  # Validation accuracy directly
                    print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, Validation Accuracy: {mean_score}")
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_alpha = alpha
                        best_fit_prior = fit_prior_value
                        best_k = k
                        
            # Reset X_train, X_test, X_test_tfidf, y_train and y_test after they were changed for the hold-out cross-validation
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
            X_train_tfidf = vectorizer.fit_transform(X_train)
            X_test_tfidf = vectorizer.transform(X_test)
            continue # Skip the StratifiedKFold part for K=1
            
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
        
        for alpha in alpha_values:
            for fit_prior_value in fit_prior_values:
                scores = []
    
                if vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train, y_train):
                        # Split the raw text data for the current fold
                        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
                    vectorizer = TfidfVectorizer(max_features=10000)
                    X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
                    X_val_fold_tfidf = vectorizer.transform(X_val_fold)
    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold_tfidf, y_train_fold)
                    y_pred = classifier.predict(X_val_fold_tfidf)
    
                if not vectorization_within_folds:
                    for train_index, val_index in skf.split(X_train_tfidf, y_train):
                        X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
                        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
                    
                    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior_value)
                    classifier.fit(X_train_fold, y_train_fold)
                    y_pred = classifier.predict(X_val_fold)
                        
                scores.append(accuracy_score(y_val_fold, y_pred))
                
                mean_score = np.mean(scores)
                print(f"K: {k}, Alpha: {alpha}, fit_prior: {fit_prior_value}, StratifiedKFold Score: {mean_score}")
                
                if mean_score > best_score:
                    best_score = mean_score
                    best_alpha = alpha
                    best_fit_prior = fit_prior_value
                    best_k = k
    
    print(f"\nBest K: {best_k}, Best alpha: {best_alpha}, Best fit_prior: {best_fit_prior}, Best StratifiedKFold Score: {best_score}")
    
    # Train the final model using the best hyperparameters
    if vectorization_within_folds: # X_train_tfidf has yet to be calculated in this case
        vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        
    final_classifier = MultinomialNB(alpha=best_alpha, fit_prior=best_fit_prior)
    final_classifier.fit(X_train_tfidf, y_train)
    
    # Evaluate on the test set
    y_pred = final_classifier.predict(X_test_tfidf)
    # y_pred_prob = final_classifier.predict_proba(X_test_tfidf)
    
    print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
    print("\nTest Classification Report:\n", classification_report(y_test, y_pred))
    
    return final_classifier, best_k, best_depth, y_pred

# Overarching Model Selection Function

Expected parameter values: <br>
> - _classifier_: Selects the type of classifier from amongst the following: ["SVM", "NB", "RF"].
> - _testing_ratio_: The ratio of the data that is reserved for testing. Any floating point in the inclusive interval [0, 1].
> > Note that if $k=1$, the size of the validation set is assumed to be equal to the size of the testing set, specified by _testing_ratio_.
> - _vectorization_within_folds_: Would you like to vectorize each individual fold rather than vectorizing the entire training set once? [True, False].
> - _show_class_accuracy_: Would you like the accuracy per class to be displayed? [True, False].
> - _show_confusion_matrix_: Would you like the resulting confusion matrix to be displayed? [True, False].
> - _k_values_: All values of k which are tested for stratified k-fold cross validation. Any list containing only positive integers.

In [None]:
def train_model(df: pd.DataFrame,
                model_type: str = "SVM", 
                testing_ratio: float = 0.15, 
                vectorization_within_folds: bool = False, 
                show_class_accuracy: bool = True, 
                show_confusion_matrix: bool = True,
                k_values: list = [2, 3, 5, 10, 20]):
    """..."""

    print(f"Tuning {model_type} classifier with a train/test split of {1-testing_ratio}/{testing_ratio} \n")
    
    #Raise appropriate error message in case of a faulty parameter value
    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Invalid input data. Please ensure df is a Pandas DataFrame")
    if model_type not in ["SVM", "NB", "RF"]:
        raise ValueError(f"Invalid model_type. Choose from {'SVM', 'NB', 'RF'}")
    if testing_ratio < 0 or testing_ratio > 1:
        raise ValueError(f"Invalid testing ratio. Choose a value in the inclusive interval [0,1]")
    if type(vectorization_within_folds) != bool:
        raise ValueError(f"Invalid vectorization_within_folds value. Please ensure vectorization_within_folds is boolean")
    if type(show_class_accuracy) != bool:
        raise ValueError(f"Invalid show_class_accuracy value. Please ensure show_class_accuracy is boolean")
    if type(show_confusion_matrix) != bool:
        raise ValueError(f"Invalid show_confusion_matrix value. Please ensure show_confusion_matrix is boolean")
    if not all(isinstance(x, int) and x > 0 for x in k_values):
        raise ValueError(f"Invalid k_values. Please ensure all entries in k_values are positive integers")
    if 1 in k_values and vectorization_within_folds:
        raise ValueError(f"If k_values contains 1, vectorization_within_folds must be False since k=1 implies standard hold-out cross-validation, for which vectorization_within_folds must be False")

    model_mapping = {"SVM": SVC, "NB": MultinomialNB , "RF": RandomForestClassifier}
    ModelClass = model_mapping[model_type]
    
    #Best classifier performance and number of stratified folds so far
    best_score = 0
    best_k = None

    #RF hyperparameters:
    best_depth = None
    depth_values = [5, 10, 15, 20, 25, None]

    #SVM hyperparameters:
    best_kernel = None
    best_C = None
    kernel_values = ['linear', 'rbf', 'poly']
    C_values = [0.1, 1, 10]

    #NB hyperparameters:
    best_alpha = None
    best_fit_prior = None
    alpha_values = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    fit_prior_values = [True, False]
    
    if model_type == "SVM":
        results = tune_SVM(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]

    if model_type == "NB":
        results = tune_naive_bayes(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]

    if model_type == "RF":
        results = tune_random_forest(df=df, testing_ratio=testing_ratio, vectorization_within_folds=vectorization_within_folds, k_values=k_values)
        classifier, y_pred = results[0], results[3]
    
    if show_class_accuracy:
        cm = confusion_matrix(y_test, y_pred)
        
        # Class names (assuming they are in the same order as in y_train or y_test)
        class_names = np.unique(y_test)  # This will give you the unique class labels
        
        # Calculate per-class accuracy: TP / (TP + FN)
        class_accuracies = cm.diagonal() / cm.sum(axis=1)
        
        # Print the accuracy for each class along with its name
        for i, acc in enumerate(class_accuracies):
            print(f"Class '{class_names[i]}' Accuracy: {acc:.4f}")

    if show_confusion_matrix:
        cm = confusion_matrix(y_test, y_pred)
        class_names = np.unique(y_test)
        
        # Plotting the confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, cbar=True)
        
        # Label the axes
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix')
        
        # Display the plot
        plt.show()

# Testing Function

The code below serves only to test the _train_model_ function and to detect and remove bugs. <br>
The specific parameter values hold no significance.

In [None]:
train_model(df=df_clean,
            model_type = "RF", 
            testing_ratio = 0.15, 
            vectorization_within_folds = False, 
            show_class_accuracy = True, 
            show_confusion_matrix = True)

In [None]:
train_model(df=df_clean,
            model_type = "SVM", 
            testing_ratio = 0.15, 
            vectorization_within_folds = False, 
            show_class_accuracy = True, 
            show_confusion_matrix = True,
            k_values = [1,2,5,10,20])

In [None]:
train_model(df=df_clean,
            model_type = "NB", 
            testing_ratio = 0.2, 
            vectorization_within_folds = True, 
            show_class_accuracy = True, 
            show_confusion_matrix = True,
           k_values = [2,12,15])

# Showing percentages per class

First for SVM classification

Now for Naive Bayes

In [None]:
#NOTE: i have noticed that test performance tends to be higher for k>1 even though the validation score would suggest that k=1 is best. 
#I think this is because stratified cross validation generalizes better.
#How do I decide on k? I can't run everything over test set, that would turn test set into 2nd validation set.

# Testing BERTje word embeddings

**NOTE**: These are just basic tests to see if the word embeddings hold potential. <br>
The option to use word embeddings should be added to the train_model function above

First we test random forests

In [None]:
# Tune the depth of the Random Forest using the validation set
best_depth = None
best_score = 0
depths = [5, 10, 15, 20, 25, None]  # Different depths to test

for depth in depths:
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    classifier.fit(train_embeddings, y_train_bertje)
    val_score = classifier.score(val_embeddings, y_val_bertje)
    print(f"Depth: {depth}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_depth = depth

print(f"\nBest Depth: {best_depth}, Best Validation Score: {best_score}")

# Train the final model using the best depth
final_classifier = RandomForestClassifier(max_depth=best_depth, random_state=42)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Now we Test SVM:

In [None]:
best_kernel = None
best_C = None
best_score = 0

# Test different kernels and values of C (Regularization parameter)
kernels = ['linear', 'rbf', 'poly']
C_values = [0.1, 1, 10]

for kernel in kernels:
    for C in C_values:
        classifier = SVC(kernel=kernel, C=C, random_state=42)
        classifier.fit(train_embeddings, y_train_bertje)
        val_score = classifier.score(val_embeddings, y_val_bertje)
        print(f"Kernel: {kernel}, C: {C}, Validation Score: {val_score}")
        if val_score > best_score:
            best_score = val_score
            best_kernel = kernel
            best_C = C

print(f"\nBest Kernel: {best_kernel}, Best C: {best_C}, Best Validation Score: {best_score}")

# Train the final model using the best kernel and C
final_classifier = SVC(kernel=best_kernel, C=best_C, random_state=42)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Now we test Naive Bayes (We test GaussianNB, since MultinomialNB does not work for continuous features)

In [None]:

best_var_smoothing = None
best_score = float('-inf')

# Test different hyperparameter values
var_smoothing_values = [10**-11, 10**-10, 10**-9, 10**-8, 10**-7]

for var_smoothing in var_smoothing_values:
    classifier = GaussianNB(var_smoothing=var_smoothing)
    classifier.fit(train_embeddings, y_train_bertje)
    val_score = classifier.score(val_embeddings, y_val_bertje)
    print(f"Var_smoothing: {var_smoothing}, Validation Score: {val_score}")
    if val_score > best_score:
        best_score = val_score
        best_var_smoothing = var_smoothing

print(f"\nBest var_smoothing: {best_var_smoothing}, Best Validation Score: {best_score}")

# Train the final model using the best alpha and fit_prior
final_classifier = GaussianNB(var_smoothing=best_var_smoothing)
final_classifier.fit(train_embeddings, y_train_bertje)

# Test the model
y_pred = final_classifier.predict(test_embeddings)
print("\nTest Accuracy:", accuracy_score(y_test_bertje, y_pred))
print("\nTest Classification Report:\n", classification_report(y_test_bertje, y_pred))

Surprisingly, these models all perform worse than their tf-idf counterparts. <br>
**TO DO**: Rename train_embeddings, val_embeddings, test_embeddings to X_train_bertje, X_val_bertje, X_test_bertje

**Idea**: Once pos_tags are used for classification, test and compare performance for:
> - Bag of Words <br>
> - TF-IDF <br>
> - BERTje <br>
> - mBERT (Multilingual BERT) <br>
> - RobBERT (Dutch RoBERTa model)
> - Word2Vec