# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import os
import torch
import wandb
import ast
import re
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
# Query CUDA availability; the returned boolean is not stored here
torch.cuda.is_available()
# Lift pandas' display limits on the number of rows and on column width
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Retrieve Huggingface dataset and prepare it for the model

In [None]:
# Load the train/validation/test splits of the dutch_social dataset from
# local parquet files. The paths are built with os.path.join so they resolve
# on every OS; the previous 'dir\file' literals only worked on Windows and
# relied on the invalid escape sequence '\d'.
datasetDir = 'dutch_social dataset'
trainSet = pd.read_parquet(os.path.join(datasetDir, 'dutch_social-train.parquet'))
valSet = pd.read_parquet(os.path.join(datasetDir, 'dutch_social-validation.parquet'))
testSet = pd.read_parquet(os.path.join(datasetDir, 'dutch_social-test.parquet'))

# Keep only the text and label columns of every split
trainSet = trainSet[['full_text', 'label']]
valSet = valSet[['full_text', 'label']]
testSet = testSet[['full_text', 'label']]
trainSet.head()

# Retrieve own dataset and prepare it for the model

In [None]:
# Load the labelled e-mail dataset, keep only the text and label columns and
# discard every row that has a missing value in either of them.
dataset = pd.read_csv('test_sentiment_emails_labeled_V4.csv')[['text', 'label']].dropna()
# Cast the labels to integers once the incomplete rows are gone
dataset = dataset.astype({'label': int})
dataset

### Split dataset into train/validation/test sets

In [None]:
# Hold out 25% of the data (stratified on the label); the remaining 75% is
# the training set.
trainSet, evalSet = train_test_split(
    dataset,
    test_size=0.25,
    stratify=dataset['label'],
    random_state=42,
)
# Split the held-out part 60/40 into validation and test sets, i.e. 15% and
# 10% of the full dataset respectively.
valSet, testSet = train_test_split(
    evalSet,
    test_size=0.4,
    stratify=evalSet['label'],
    random_state=42,
)

### Define function for calculating class weights + check for usable GPU

In [None]:
# Check whether a CUDA-capable GPU is available and store the answer in cudaAvailable
cudaAvailable = torch.cuda.is_available()


def calculate_class_weights(trainingDataset: pd.DataFrame) -> list:
    """Compute balanced class weights from the 'label' column.

    Every class gets ``n_samples / (n_classes * n_samples_in_class)``, so the
    less frequent a class is, the larger its weight. The weights are printed
    before they are returned.

    Args:
        trainingDataset: DataFrame with a 'label' column.

    Returns:
        A list of float weights, one per distinct label, ordered by ascending
        label value. When the labels are numbered 0..n-1 the list index
        equals the label.
    """
    # A single pass yields the size of every class that actually occurs, so a
    # count can never be zero. The former range(n) lookup divided by zero
    # whenever the labels were not exactly 0..n-1.
    labelCounts = trainingDataset['label'].value_counts().sort_index()
    totalSamples = len(trainingDataset.index)
    numberOfLabels = len(labelCounts)

    # int() keeps the arithmetic in plain Python numbers, as before
    weightList = [
        totalSamples / (numberOfLabels * int(count))
        for count in labelCounts
    ]
    print(weightList)
    return weightList

# Define model/sweep and start training

In [None]:
# Sweep definition: the search strategy and the hyperparameter search space.
# NOTE(review): 'grid' is combined with a min/max range for num_train_epochs;
# confirm the installed W&B version accepts ranges for grid search.
sweepConfig = {
    'method': 'grid',
    'parameters': {
        'num_train_epochs': {'min': 3, 'max': 5},
        'learning_rate': {'values': [4e-5, 5e-5, 6e-5]},
        'train_batch_size': {'values': [16, 32]},
    },
}
# Register the sweep with W&B and keep its id for the agent
sweepID = wandb.sweep(sweepConfig, project='Sentiment_email_sweep_test')

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1-score computed by scikit-learn's f1_score."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')

# Function executed for every run of the sweep
def train():
    """Train and evaluate one sweep run and log its metrics to W&B.

    A BERTje classifier is trained on ``trainSet`` with the hyperparameters
    taken from ``wandb.config`` and then evaluated on ``valSet``. The MCC,
    the micro F1-score and the confusion matrix of that evaluation are logged
    before the run is closed.
    """
    wandb.init()

    # Training arguments that stay fixed across runs; the swept values are
    # handed to the model through sweep_config below.
    trainArgumentsSweep = {
        'reprocess_input_data': True,
        'use_multiprocessing': True,
        'overwrite_output_dir': True,
        'use_early_stopping': True,
        'early_stopping_consider_epochs': True,
        'optimizer': 'AdamW',
        'save_model_every_epoch': False,
        'wandb_project': 'Test sweep',
    }
    classifierSweep = ClassificationModel(
        "bert",
        "GroNLP/bert-base-dutch-cased",
        num_labels=len(trainSet['label'].unique()),
        args=trainArgumentsSweep,
        # Use the GPU only when one was detected; a hard-coded True breaks
        # the run on CPU-only machines.
        use_cuda=cudaAvailable,
        weight=calculate_class_weights(trainSet),
        sweep_config=wandb.config,
    )
    classifierSweep.train_model(trainSet)

    # Evaluate on the validation set and log the metrics of this run; the
    # raw outputs and wrong predictions are not needed here.
    result, _model_outputs, _wrong_predictions = classifierSweep.eval_model(
        valSet,
        f1=f1_multiclass,
        confusionMatrix=confusion_matrix,
    )
    wandb.log({
        'mcc': result['mcc'],
        'f1': result['f1'],
        'confusion_matrix': result['confusionMatrix'],
    })
    # wandb.finish() replaces the deprecated wandb.join() alias
    wandb.finish()

# Start the sweep: the W&B agent runs train() for the sweep identified by sweepID
wandb.agent(sweepID, train)

### Move best result into separate model

In [None]:
# Train the final model with a fixed set of hyperparameters
wandb.init(project='Sentiment_email_test')
trainArguments = {
    'reprocess_input_data': True,
    'use_multiprocessing': True,
    'num_train_epochs': 5,
    'overwrite_output_dir': True,
    'use_early_stopping': True,
    'early_stopping_consider_epochs': True,
    'train_batch_size': 64,
    'optimizer': 'AdamW',
    'save_model_every_epoch': False,
    'logging_steps': 25,
    'learning_rate': 5e-05,
    'wandb_kwargs': {'magic': True},
}
classifier = ClassificationModel(
    "bert",
    "GroNLP/bert-base-dutch-cased",
    num_labels=len(trainSet['label'].unique()),
    args=trainArguments,
    # Use the GPU only when one was detected; a hard-coded True breaks the
    # run on CPU-only machines.
    use_cuda=cudaAvailable,
    weight=calculate_class_weights(trainSet),
)
classifier.train_model(trainSet)

### Evaluate the resulting model against validation set

In [None]:
def f1_multiclass(labels, preds):
    """Return the micro-averaged F1-score of preds against labels."""
    microF1 = f1_score(labels, preds, average='micro')
    return microF1

# Evaluate on the validation set; f1 and confusionMatrix are passed as extra
# metrics under those names.
result, model_outputs, wrong_predictions = classifier.eval_model(
    valSet,
    f1=f1_multiclass,
    confusionMatrix=confusion_matrix,
)
print(result)

### Print out emails that were classified wrong

In [None]:
# Turn every wrongly classified example into a dict through its string form.
# Assumes str(example) is a Python-literal dict with the keys
# guid/text_a/text_b/label - TODO confirm against the installed
# simpletransformers version. wrong_predictions itself is left untouched.
wrong_prediction_records = [
    ast.literal_eval(str(example)) for example in wrong_predictions
]
wrong_predictions_df = pd.DataFrame(wrong_prediction_records).rename(
    columns={
        'guid': 'testSet_id',
        'text_a': 'text',
        'text_b': 'pred_label',
        'label': 'true_label',
    }
)
# Fill in the predicted label (the argmax of the model output stored at the
# example's id) by column name rather than by a hard-coded column position.
# The guard keeps an empty result (no wrong predictions) from raising.
if not wrong_predictions_df.empty:
    wrong_predictions_df['pred_label'] = [
        np.argmax(model_outputs[int(example_id)])
        for example_id in wrong_predictions_df['testSet_id']
    ]
wrong_predictions_df

### Evaluate the resulting model against test set

In [None]:
def f1_multiclass(labels, preds):
    """Compute the micro-averaged F1-score for the given labels and predictions."""
    score = f1_score(labels, preds, average='micro')
    return score

# Evaluate on the test set; f1 and confusionMatrix are passed as extra
# metrics under those names.
result, model_outputs, wrong_predictions = classifier.eval_model(
    testSet,
    f1=f1_multiclass,
    confusionMatrix=confusion_matrix,
)
print(result)

### Print out emails that were classified wrong

In [None]:
# Turn every wrongly classified example into a dict through its string form.
# Assumes str(example) is a Python-literal dict with the keys
# guid/text_a/text_b/label - TODO confirm against the installed
# simpletransformers version. wrong_predictions itself is left untouched.
wrong_prediction_records = [
    ast.literal_eval(str(example)) for example in wrong_predictions
]
wrong_predictions_df = pd.DataFrame(wrong_prediction_records).rename(
    columns={
        'guid': 'testSet_id',
        'text_a': 'text',
        'text_b': 'pred_label',
        'label': 'true_label',
    }
)
# Fill in the predicted label (the argmax of the model output stored at the
# example's id) by column name rather than by a hard-coded column position.
# The guard keeps an empty result (no wrong predictions) from raising.
if not wrong_predictions_df.empty:
    wrong_predictions_df['pred_label'] = [
        np.argmax(model_outputs[int(example_id)])
        for example_id in wrong_predictions_df['testSet_id']
    ]
wrong_predictions_df

### Save model locally

# Conclusion:
BERTje is capable of properly classifying mails based on their sentiment. The difference in performance between both datasets is caused by their size, which is similar to the behaviour observed with subject classification.

# Sources:
- https://huggingface.co/GroNLP/bert-base-dutch-cased
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html
- https://huggingface.co/docs/transformers/perf_train_gpu_one
- https://www.analyticsvidhya.com/blog/2020/10/improve-class-imbalance-class-weights

# License

BERTje: de Vries, W., van Cranenburgh, A., Bisazza, A., Caselli, T., van Noord, G., & Nissim, M. (2019, December 19). BERTje: A Dutch BERT Model. Groningen, Groningen, Nederland.