# Experiment 3 Machine Learning baseline

### Imports

In [1]:
import random

import instancelib as il
from instancelib.machinelearning.sklearn import SkLearnClassifier
from instancelib.typehints.typevars import KT, LT
from instancelib.utils.func import list_unzip

from typing import Any, FrozenSet, Sequence, Tuple

import pandas as pd
import numpy as np

from instancelib.analysis.base import classifier_performance
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sbert import PretrainedSentenceBERTVectorizer
from stratified import stratified_train_test
from synthesizer import PreSynthesized

from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, fbeta_score


from matplotlib import pyplot as plt

In [2]:
# Fix the seed of Python's `random` module so that the random draws made
# further down in this notebook are repeatable.
SEED = 20
random.seed(SEED)

### Define functions

In [14]:
def sklearn_truth_pred(model: SkLearnClassifier[Any, KT, Any, Any, LT],
                       predictions: Sequence[Tuple[KT, FrozenSet[LT]]],
                       truth: il.LabelProvider[KT, LT]) -> Tuple[np.ndarray, np.ndarray]:
    """Encode ground-truth and predicted labels for use with sklearn metrics.

    Parameters
    ----------
    model
        Classifier whose ``encoder.encode_batch`` is used to encode both
        the predicted and the true label sets.
    predictions
        Sequence of ``(key, predicted label set)`` pairs.
    truth
        Label provider queried with ``get_labels(key)`` for every key that
        occurs in ``predictions``.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        ``(y_true, y_pred)`` -- note the order: truth first, predictions second.
    """
    instance_keys, predicted_labels = list_unzip(predictions)
    # Look up the reference labels in the same order as the predictions so
    # that both encoded sequences stay aligned.
    reference_labels = [truth.get_labels(instance_key) for instance_key in instance_keys]
    encode = model.encoder.encode_batch
    y_pred = encode(predicted_labels)
    y_true = encode(reference_labels)
    return y_true, y_pred

### Load data

In [3]:
# Read the posts from CSV and wrap the frame in an instancelib environment.
# NOTE(review): presumably "identifier" is the key column, "clean_post" the
# text column and "set" the label column -- confirm against
# il.pandas_to_env_with_id.
df = pd.read_csv(
    "../../data/active_learning_data_to_test_imbalanced_with_identifiers.csv"
)
tweet_env = il.pandas_to_env_with_id(
    df,
    "identifier",
    "clean_post",
    "set",
)

### Preparation

*Vectorize data with RobBERT*

In [5]:
# Sentence-BERT encoder loaded from the "pdelobelle/robbert-v2-dutch-base"
# checkpoint, wrapped in an instancelib text vectorizer.
robbert_encoder = PretrainedSentenceBERTVectorizer("pdelobelle/robbert-v2-dutch-base")
sbert_vec = il.TextInstanceVectorizer(robbert_encoder)

In [6]:
# Vectorize the environment with the SBERT vectorizer, 20000 instances per chunk.
# fit=False: presumably skips fitting because the encoder is pretrained -- TODO confirm.
il.vectorize(
    sbert_vec,
    tweet_env,
    fit=False,
    chunk_size=20000,
)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/216 [00:00<?, ?it/s]

*Split data into train and test sets*

In [7]:
# Stratified split: 70 % of the data becomes the training set,
# the remaining 30 % is held out for evaluation.
TRAIN_FRACTION = 0.70
train, test = stratified_train_test(
    tweet_env, tweet_env.dataset, tweet_env.labels, TRAIN_FRACTION
)

In [8]:
# Sanity check: sizes of the training and the evaluation split.
n_train, n_test = len(train), len(test)
print(n_train, n_test)

32830 14070


*Create synthesizer*

In [9]:
# Makes sure X synthetic texts are added per labeled COVID-19 tweet.
# NOTE(review): the last argument (0) is presumably X, i.e. no synthetic
# texts are added for this baseline -- confirm against PreSynthesized.
# NOTE(review): `lookup_table` is not defined in any visible cell of this
# notebook (execution count 4 is missing); it must be defined or loaded
# before this cell, otherwise Restart & Run All fails with a NameError.
ml_synthesizer = PreSynthesized(tweet_env, lookup_table, sbert_vec, 0)

*Create training set for ML baseline*

In [10]:
# Draw 3895 keys from the training split at random (without replacement)
# and collect them in a bucket of the environment.
BASELINE_SAMPLE_SIZE = 3895
baseline_keys = random.sample(train.key_list, BASELINE_SAMPLE_SIZE)
al_baseline_simulation = tweet_env.create_bucket(baseline_keys)

In [11]:
# Run the synthesizer on the sampled bucket.
# NOTE(review): presumably returns the synthetic instances generated for the
# sampled tweets (possibly none, given the 0 passed to PreSynthesized) -- confirm.
al_baseline_generated = ml_synthesizer(al_baseline_simulation)

In [12]:
# Combine the synthesizer output with the sampled bucket; the result is the
# provider the classifier is trained on (presumably their union -- TODO confirm).
al_baseline_train = tweet_env.combine(al_baseline_generated, al_baseline_simulation)

*Initialize classifier*

In [13]:
# Logistic regression with the iteration cap raised to 1000
# (scikit-learn's default is 100).
logreg = LogisticRegression(max_iter=1000)
# Wrap the estimator in an instancelib classifier built for this environment.
# NOTE(review): presumably operates on the vectors stored by il.vectorize -- confirm.
model_logreg = il.SkLearnVectorClassifier.build(logreg, tweet_env)

### Train and evaluate model

In [18]:
# Fit on the baseline training provider, then score on the held-out test split.
model_logreg.fit_provider(al_baseline_train, tweet_env.labels)
performance = classifier_performance(model_logreg, test, tweet_env.labels)

# Encode the ground truth and the predicted label sets for the sklearn metrics.
predictions = model_logreg.predict(test)
y_true, y_pred = sklearn_truth_pred(model_logreg, predictions, tweet_env.labels)

# Printed in order: confusion matrix, ROC AUC, Matthews correlation
# coefficient and finally the F2 score.
# NOTE(review): y_pred holds predicted labels rather than probability scores,
# so the ROC AUC below is computed on hard predictions -- confirm this is intended.
for metric in (confusion_matrix, roc_auc_score, matthews_corrcoef):
    print(metric(y_true, y_pred))
print(fbeta_score(y_true, y_pred, beta=2))

[[13628    22]
 [  250   170]]
0.7015750915750915
0.5913488769879651
0.45405982905982906
