In [1]:
import random

import instancelib as il
from instancelib.machinelearning.sklearn import SkLearnClassifier
from instancelib.typehints.typevars import KT, LT
from instancelib.utils.func import list_unzip

from typing import Any, FrozenSet, Sequence, Tuple

import pandas as pd
import numpy as np

from instancelib.analysis.base import classifier_performance
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sbert import PretrainedSentenceBERTVectorizer
from stratified import stratified_train_test
from synthesizer import PreSynthesized

from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, fbeta_score


from matplotlib import pyplot as plt

In [2]:
# Seed every RNG that is in scope. Seeding only the stdlib generator leaves
# NumPy's global generator unseeded, so any helper that draws from it would
# give different results on each Restart & Run All.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the labelled posts and wrap them in an instancelib environment.
# NOTE(review): "identifier", "clean_post" and "set" are presumably the CSV
# columns holding the instance id, the text and the label — confirm against
# il.pandas_to_env_with_id.
df = pd.read_csv("active_learning_data_to_test_imbalanced_new.csv")
tweet_env = il.pandas_to_env_with_id(df, "identifier", "clean_post", "set")

In [4]:
# Table that is handed to PreSynthesized further down; judging by the file name
# it holds pre-generated synthetic examples — TODO confirm its schema.
lookup_table = pd.read_csv("synthetic_data_ivae_new.csv")

In [5]:
# Two text vectorizers: a TF-IDF vectorizer capped at 5000 features and a
# pretrained sentence-embedding vectorizer loaded by model name.
# NOTE(review): only sbert_vec is applied in the visible cells; tfidf is never
# applied to the environment.
tfidf = il.TextInstanceVectorizer(il.SklearnVectorizer(TfidfVectorizer(max_features=5000)))
sbert_vec = il.TextInstanceVectorizer(PretrainedSentenceBERTVectorizer("pdelobelle/robbert-v2-dutch-base"))

In [6]:
# Disabled alternative (TF-IDF features):
# il.vectorize(tfidf, tweet_env)
# Compute sentence embeddings for the instances in the environment.
# NOTE(review): fit = False presumably skips fitting because the encoder is
# pretrained, and chunk_size presumably bounds how many instances are embedded
# per pass — confirm against il.vectorize.
il.vectorize(sbert_vec, tweet_env, fit = False, chunk_size=20000)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/216 [00:00<?, ?it/s]

In [7]:
# Split the dataset: 70 % of the instances form the train set,
# the remaining 30 % are held out for evaluation.
train, test = stratified_train_test(tweet_env, tweet_env.dataset, tweet_env.labels, 0.70)

In [8]:
# Sanity check: sizes of the train and test splits.
print(len(train), len(test))

32830 14070


In [9]:
# Base scikit-learn estimator. max_iter is raised above the library default
# (100), presumably so the solver converges on these features.
logreg = LogisticRegression(max_iter=1000)

In [10]:
def sklearn_truth_pred(model: SkLearnClassifier[Any, KT, Any, Any, LT],
                       predictions: Sequence[Tuple[KT, FrozenSet[LT]]],
                       truth: il.LabelProvider[KT, LT]) -> Tuple[np.ndarray, np.ndarray]:
    """Encode ground-truth and predicted labels with the model's own encoder.

    Parameters
    ----------
    model
        Classifier whose ``encoder`` converts label sets into encoded form.
    predictions
        Pairs of (instance key, predicted label set).
    truth
        Label provider that is queried for the labels of every predicted key.

    Returns
    -------
    A ``(y_true, y_pred)`` pair; both are built from the same key order.
    """
    instance_keys, predicted_labels = list_unzip(predictions)
    # Ground truth is looked up per key, in the order the predictions arrived.
    true_labels = list(map(truth.get_labels, instance_keys))
    label_encoder = model.encoder
    encoded_pred = label_encoder.encode_batch(predicted_labels)
    encoded_true = label_encoder.encode_batch(true_labels)
    return encoded_true, encoded_pred

In [11]:
# Build the synthesizer from the environment, the lookup table and the
# sentence-embedding vectorizer.
# NOTE(review): the meaning of the trailing 10 is not visible in this notebook —
# confirm against PreSynthesized and consider naming it.
ml_synthesizer_0 = PreSynthesized(tweet_env, lookup_table, sbert_vec, 10)

In [12]:
# Apply the synthesizer to the train split only; the test split is left untouched.
train_generated_0 = ml_synthesizer_0(train)

Batches:   0%|          | 0/307 [00:00<?, ?it/s]

In [13]:
# Number of instances returned by the synthesizer.
len(train_generated_0)

9800

In [14]:
# Merge the original train split with the synthesizer output into one training collection.
all_train_generated_0 = tweet_env.combine(train, train_generated_0)

In [15]:
# Wrap the scikit-learn estimator in an instancelib vector classifier built for this environment.
model_logreg_0 = il.SkLearnVectorClassifier.build(logreg, tweet_env)

In [16]:
# Train on the combined (original + synthesized) train set, then measure
# performance on the test split, which comes straight from the train/test split.
model_logreg_0.fit_provider(all_train_generated_0, tweet_env.labels)
performance_logreg_0 = classifier_performance(model_logreg_0, test, tweet_env.labels)

In [17]:
def print_label_metrics(performance, label: str) -> None:
    """Print F1, F2, recall, precision and accuracy for one label.

    Parameters
    ----------
    performance
        Mapping from label name to a metrics object exposing ``f1``,
        ``f_beta(beta)``, ``recall``, ``precision`` and ``accuracy``.
    label
        Lower-case label name; its capitalised form prefixes every printed line.
    """
    scores = performance[label]  # looked up once instead of once per metric
    title = label.capitalize()
    print(f"{title} F1 score: {scores.f1}")
    print(f"{title} F2 score: {scores.f_beta(2)}")
    print(f"{title} Recall: {scores.recall}")
    print(f"{title} Precision: {scores.precision}")
    print(f"{title} Accuracy: {scores.accuracy}")


# Same five-line report for each label, in the original order.
print_label_metrics(performance_logreg_0, "covid")
print_label_metrics(performance_logreg_0, "affair")

Covid F1 score: 0.5622593068035944
Covid F2 score: 0.537027954879843
Covid Recall: 0.5214285714285715
Covid Precision: 0.6100278551532033
Covid Accuracy: 0.9757640369580668
Affair F1 score: 0.9875370052264172
Affair F2 score: 0.9888597736821303
Affair Recall: 0.9897435897435898
Affair Precision: 0.9853402377652979
Affair Accuracy: 0.9757640369580668


In [18]:
# Predict labels for the test split and encode them, together with the ground
# truth, so the scikit-learn metric functions can consume them.
predictions = model_logreg_0.predict(test)
y_true, y_pred = sklearn_truth_pred(model_logreg_0, predictions, tweet_env.labels)

print('Confusion matrix \n', confusion_matrix(y_true, y_pred))
# NOTE(review): y_pred holds predicted labels, not scores. roc_auc_score on hard
# labels evaluates a single operating point (for a binary task, the mean of the
# true-positive and true-negative rates); pass probabilities or decision values
# to obtain a threshold-independent ROC AUC.
print('ROC_AUC', roc_auc_score(y_true, y_pred))
print('MCC', matthews_corrcoef(y_true, y_pred))
print('F2', fbeta_score(y_true, y_pred, beta=2))

Confusion matrix 
 [[13510   140]
 [  201   219]]
ROC_AUC 0.7555860805860806
MCC 0.5516661986655502
F2 0.5370279548798431


In [19]:
# Persist the encoded truth/prediction pairs, one row per test instance.
# NOTE(review): this rebinds `df`, which held the raw dataset read from CSV in
# an earlier cell; any cell that reads `df` after this one sees the predictions
# frame instead. Consider a distinct name such as `predictions_df`.
df = pd.DataFrame(list(zip(y_true, y_pred)), columns=['true', 'pred'])
df.to_csv('logreg_ivae_10_attempt2.csv', index=False)