## 1. Import libraries and requirements

In [None]:
# Loading required packages
# import spacy
import pandas as pd
import re
# import nltk
from numpy import mean
from numpy import std
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt  
from sklearn.metrics import plot_confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from statistics import mean
import json
import csv
import ast
import numpy as np

# from flair.embeddings import WordEmbeddings
# from flair.data import Sentence
# from flair.embeddings import TransformerWordEmbeddings

from tqdm import tqdm

# Set seed
# np.random.seed() returns None, so the integer itself is kept in `seed`
# (it is passed as random_state to the estimators) and NumPy's global
# generator is seeded with the same value.
seed = 1
np.random.seed(seed)


# Select Spacy model
# Efficiency
# nlp = spacy.load("en_core_web_sm")

# Accuracy
# nlp = spacy.load("en_core_web_trf")

In [None]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
######################################################### LR #########################################################
######################################################################################################################
######################################################################################################################

In [None]:
########################################################### LR - bert

In [None]:
%%time

# Checkpoint: load the previously saved BERT feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_bert_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('bert_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('bert_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('bert_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('bert_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('bert_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('bert_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('bert_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('bert_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='bert_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='bert_cv_results_lr.csv', index=False)

In [None]:
########################################################### LR - elmo

In [None]:
%%time

# Checkpoint: load the previously saved ELMo feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_elmo_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('elmo_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('elmo_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 2-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
# NOTE(review): cv=2 here while the other embedding sections in this notebook
# use cv=10 - confirm this is intentional, since the CV scores are otherwise
# not directly comparable.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=2,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('elmo_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('elmo_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('elmo_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('elmo_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('elmo_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('elmo_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='elmo_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='elmo_cv_results_lr.csv', index=False)

In [None]:
########################################################### LR - glove

In [None]:
%%time

# Checkpoint: load the previously saved GloVe feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_glove_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('glove_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('glove_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('glove_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('glove_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('glove_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('glove_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('glove_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('glove_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='glove_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='glove_cv_results_lr.csv', index=False)

In [None]:
########################################################### LR - word2vec

In [None]:
%%time

# Checkpoint: load the previously saved word2vec feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_word2vec_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('word2vec_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('word2vec_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('word2vec_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('word2vec_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('word2vec_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('word2vec_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('word2vec_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('word2vec_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='word2vec_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='word2vec_cv_results_lr.csv', index=False)

In [None]:
########################################################### LR - flair

In [None]:
%%time

# Checkpoint: load the previously saved Flair feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_flair_complete.csv',
    na_values=['nan'],
)

In [None]:
# Display the (rows, columns) shape of the loaded frame as a quick size check.
data.shape

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('flair_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('flair_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('flair_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('flair_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('flair_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('flair_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('flair_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('flair_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='flair_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='flair_cv_results_lr.csv', index=False)

In [None]:
########################################################### LR - fasttext

In [None]:
%%time

# Checkpoint: load the previously saved fastText feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_fasttext_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap passed as max_iter to LogisticRegression in this section.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time
# Baseline: a dummy classifier that picks a label uniformly at random,
# used as a chance-level reference for the real model.
clf = DummyClassifier(random_state=seed, strategy="uniform")

# Fit the baseline on the training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - Baseline: write the fitted model to disk, then read
# it straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('fasttext_baseline_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('fasttext_baseline_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("Baseline")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
%%time

# Logistic Regression

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - LR: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('fasttext_lr_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('fasttext_lr_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Compute each macro-averaged metric once; the same values are printed
# and recorded.
score_accuracy = accuracy_score(y_test, y_pred)
score_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
score_recall = recall_score(y_test, y_pred, average='macro')
score_f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", score_accuracy)
print("Precision:", score_precision)
print("Recall:", score_recall)
print("F1_score:", score_f1)

# Record the hold-out scores for the results table.
classifier.append("LR")
accuracy.append(score_accuracy)
precision.append(score_precision)
recall.append(score_recall)
f1.append(score_f1)

In [None]:
# Collect the cross-validation averages into a table, best CV F1 first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('fasttext_cv_results_lr.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('fasttext_cv_results_lr.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Collect the hold-out scores into a table, best F1 first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint the table and reload it; the context managers close the file
# even if pickling or unpickling fails.
with open('fasttext_results_lr.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('fasttext_results_lr.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables to CSV, leaving out the index column.
results.to_csv(path_or_buf='fasttext_results_lr.csv', index=False)
results_cv.to_csv(path_or_buf='fasttext_cv_results_lr.csv', index=False)

In [None]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
######################################################### DT #########################################################
######################################################################################################################
######################################################################################################################

In [None]:
########################################################### DT - bert

In [None]:
%%time

# Checkpoint: load the previously saved BERT feature table. (It can be
# re-created with data.to_csv(...) while `data` is still in memory.)
data = pd.read_csv(
    'data_features_full_wordembedding_bert_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# Fill every missing value with the string '0' so that no NaN is left.
# NOTE(review): the replacement is the string '0', not the number 0 - confirm
# the feature columns still end up numeric for the estimators.
data = data.replace(to_replace=np.nan, value='0', regex=True)

# The token text is no longer needed for prediction, so remove that column.
data.drop(columns='Token', inplace=True)

# Preview the first ten rows.
data.head(10)

In [None]:
%%time

# Separate the predictors from the target column.
X = data.drop(columns=['Label']).values  # independent features
y = data['Label'].values  # dependent variable

# 80% / 20% train/test split. shuffle=False keeps the rows in file order,
# so the test set is the final 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The DataFrame is not used again in this section; drop the name to save memory.
del data

In [None]:
%%time

# Iteration cap; the decision-tree cell in this section does not read it.
max_iterations = 10 ** 9

# Hold-out (test set) scores, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per cross-validated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time

# Decision Tree

# 10-fold cross-validation on the training split, scored with macro-averaged
# precision, recall and F1.
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average every metric over the folds.
DT_avg_precision, DT_avg_recall, DT_avg_f1 = (
    mean(scores_DT[key])
    for key in ('test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

# Record the cross-validation averages.
cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Fit the final model on the whole training split.
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - DT: write the fitted model to disk, then read it
# straight back so the evaluation runs on the restored object.

import pickle

# The context managers close the file even if pickling or unpickling fails.
with open('bert_dt_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

with open('bert_dt_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('bert_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('bert_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('bert_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('bert_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('bert_results_dt.csv', index = False)
results_cv.to_csv('bert_cv_results_dt.csv', index = False)

In [None]:
########################################################### DT - elmo

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_elmo_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=2, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('elmo_dt_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('elmo_dt_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('elmo_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('elmo_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('elmo_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('elmo_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('elmo_results_dt.csv', index = False)
results_cv.to_csv('elmo_cv_results_dt.csv', index = False)

In [None]:
########################################################### DT - glove

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_glove_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('glove_dt_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('glove_dt_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('glove_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('glove_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('glove_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('glove_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('glove_results_dt.csv', index = False)
results_cv.to_csv('glove_cv_results_dt.csv', index = False)

In [None]:
########################################################### DT - word2vec

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_word2vec_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('word2vec_dt_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('word2vec_dt_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('word2vec_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('word2vec_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('word2vec_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('word2vec_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('word2vec_results_dt.csv', index = False)
results_cv.to_csv('word2vec_cv_results_dt.csv', index = False)

In [None]:
########################################################### DT - flair

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_flair_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=2, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('flair_dt_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('flair_dt_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('flair_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('flair_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('flair_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('flair_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('flair_results_dt.csv', index = False)
results_cv.to_csv('flair_cv_results_dt.csv', index = False)

In [None]:
########################################################### DT - fasttext

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_fasttext_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('fasttext_dt_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('fasttext_dt_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('fasttext_cv_results_dt.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('fasttext_cv_results_dt.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('fasttext_results_dt.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('fasttext_results_dt.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('fasttext_results_dt.csv', index = False)
results_cv.to_csv('fasttext_cv_results_dt.csv', index = False)

In [None]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
######################################################### NB #########################################################
######################################################################################################################
######################################################################################################################

In [None]:
########################################################### NB - bert

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_bert_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Naive Bayes

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - NB

import pickle

f = open('bert_nb_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('bert_nb_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("NB")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('bert_cv_results_nb.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('bert_cv_results_nb.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('bert_results_nb.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('bert_results_nb.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('bert_results_nb.csv', index = False)
results_cv.to_csv('bert_cv_results_nb.csv', index = False)

In [None]:
########################################################### NB - elmo

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_elmo_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Naive Bayes

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(clf, X_train, y_train, scoring = scoring, cv=2, n_jobs=-1)
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - NB

import pickle

f = open('elmo_nb_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('elmo_nb_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("NB")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('elmo_cv_results_nb.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('elmo_cv_results_nb.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('elmo_results_nb.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('elmo_results_nb.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('elmo_results_nb.csv', index = False)
results_cv.to_csv('elmo_cv_results_nb.csv', index = False)

In [None]:
########################################################### NB - glove

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_glove_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Naive Bayes

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - NB

import pickle

f = open('glove_nb_rev.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('glove_nb_rev.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("NB")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

In [None]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('glove_cv_results_nb.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('glove_cv_results_nb.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

In [None]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('glove_results_nb.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('glove_results_nb.pckl', 'rb')
results = pickle.load(f)
f.close()

In [None]:
# Save results dataframe
results.to_csv('glove_results_nb.csv', index = False)
results_cv.to_csv('glove_cv_results_nb.csv', index = False)

In [None]:
########################################################### NB - word2vec

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint

# data.to_csv('data_features_full_complete_wordembedding_bert.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_word2vec_complete.csv', na_values=['nan'])

In [None]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

In [None]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Delete data to save memory

del(data)

In [None]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [None]:
%%time

# Naive Bayes

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - NB: persist the fitted model and read it back

import pickle

# The context managers close the file even if pickling fails part-way.
with open('word2vec_nb_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

# NOTE: only unpickle files written by this notebook; loading a pickle from
# an untrusted source can execute arbitrary code.
with open('word2vec_nb_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

# Compute every metric exactly once and reuse it for the printout and the
# results lists. zero_division=0 is passed to all three macro metrics (it
# was previously set for precision only); the values are the same as with
# the default, but recall and F1 no longer emit UndefinedMetricWarning.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
nb_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
nb_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1_score:", nb_f1)

classifier.append("NB")
accuracy.append(nb_accuracy)
precision.append(nb_precision)
recall.append(nb_recall)
f1.append(nb_f1)

In [None]:
# Build the cross-validation results table (one row per classifier) and
# order it by CV F1-score, best first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('word2vec_cv_results_nb.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('word2vec_cv_results_nb.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Build the hold-out results table (one row per classifier) and order it by
# F1-score, best first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('word2vec_results_nb.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('word2vec_results_nb.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables (hold-out and cross-validation) as CSV files,
# leaving out the index column.
for _frame, _csv_path in (
    (results, 'word2vec_results_nb.csv'),
    (results_cv, 'word2vec_cv_results_nb.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [None]:
########################################################### NB - flair

In [None]:
%%time

# Load the flair feature table from its CSV checkpoint.
# na_values=['nan'] marks literal 'nan' entries as missing values.
# NOTE(review): a commented-out data.to_csv(...) used to sit here, but it
# pointed at a BERT file name rather than this flair file - presumably a
# copy-paste leftover.
data = pd.read_csv(
    'data_features_full_wordembedding_flair_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# We drop the token, as it is no longer needed for prediction
data = data.drop(columns='Token')

# Replace NaN values with zero.
# Numeric columns are filled with the number 0 so they keep their numeric
# dtype; filling them with the string '0' would turn every affected column
# into an object column (slow, memory-hungry, and cast back to float by
# scikit-learn anyway). Any non-numeric column still receives the string '0'.
numeric_columns = data.select_dtypes(include='number').columns
data = data.fillna(dict.fromkeys(numeric_columns, 0)).fillna('0')
data.head(10)

In [None]:
%%time

# 80% / 20% hold-out split.
# shuffle=False keeps the original row order: the first 80% of the rows are
# used for training and the last 20% for testing.
y = data['Label'].values  # dependent variable
X = data.drop(columns=['Label']).values  # independent features

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# X and y have been extracted, so drop the DataFrame name to save memory
del data

In [None]:
%%time

# Iteration cap; not read by the Naive Bayes cells of this section.
max_iterations = 1_000_000_000

# Hold-out (test split) metrics, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per evaluated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time

# Naive Bayes

# Gaussian Naive Bayes, scored with 2-fold cross-validation on the
# training split using macro-averaged precision, recall and F1.
# NOTE(review): the word2vec and fasttext sections use cv=10 while this one
# uses cv=2 - confirm the difference is intentional before comparing the
# cross-validation scores across embeddings.
clf = GaussianNB()
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
scores_NB = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=2,
    n_jobs=-1,
)

# Average each metric over the folds
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

# Record the averages under the "NB" label
cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit on the complete training split
# (cross_validate only fits clones, so clf itself is still unfitted here)
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - NB: persist the fitted model and read it back

import pickle

# The context managers close the file even if pickling fails part-way.
with open('flair_nb_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

# NOTE: only unpickle files written by this notebook; loading a pickle from
# an untrusted source can execute arbitrary code.
with open('flair_nb_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

# Compute every metric exactly once and reuse it for the printout and the
# results lists. zero_division=0 is passed to all three macro metrics (it
# was previously set for precision only); the values are the same as with
# the default, but recall and F1 no longer emit UndefinedMetricWarning.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
nb_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
nb_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1_score:", nb_f1)

classifier.append("NB")
accuracy.append(nb_accuracy)
precision.append(nb_precision)
recall.append(nb_recall)
f1.append(nb_f1)

In [None]:
# Build the cross-validation results table (one row per classifier) and
# order it by CV F1-score, best first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('flair_cv_results_nb.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('flair_cv_results_nb.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Build the hold-out results table (one row per classifier) and order it by
# F1-score, best first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('flair_results_nb.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('flair_results_nb.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables (hold-out and cross-validation) as CSV files,
# leaving out the index column.
for _frame, _csv_path in (
    (results, 'flair_results_nb.csv'),
    (results_cv, 'flair_cv_results_nb.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [None]:
########################################################### NB - fasttext

In [None]:
%%time

# Load the fasttext feature table from its CSV checkpoint.
# na_values=['nan'] marks literal 'nan' entries as missing values.
# NOTE(review): a commented-out data.to_csv(...) used to sit here, but it
# pointed at a BERT file name rather than this fasttext file - presumably a
# copy-paste leftover.
data = pd.read_csv(
    'data_features_full_wordembedding_fasttext_complete.csv',
    na_values=['nan'],
)

In [None]:
%%time

# We drop the token, as it is no longer needed for prediction
data = data.drop(columns='Token')

# Replace NaN values with zero.
# Numeric columns are filled with the number 0 so they keep their numeric
# dtype; filling them with the string '0' would turn every affected column
# into an object column (slow, memory-hungry, and cast back to float by
# scikit-learn anyway). Any non-numeric column still receives the string '0'.
numeric_columns = data.select_dtypes(include='number').columns
data = data.fillna(dict.fromkeys(numeric_columns, 0)).fillna('0')
data.head(10)

In [None]:
%%time

# 80% / 20% hold-out split.
# shuffle=False keeps the original row order: the first 80% of the rows are
# used for training and the last 20% for testing.
y = data['Label'].values  # dependent variable
X = data.drop(columns=['Label']).values  # independent features

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# X and y have been extracted, so drop the DataFrame name to save memory
del data

In [None]:
%%time

# Iteration cap; not read by the Naive Bayes cells of this section.
max_iterations = 1_000_000_000

# Hold-out (test split) metrics, one entry per evaluated classifier.
classifier, accuracy, precision, recall, f1 = [], [], [], [], []

# Cross-validation averages, one entry per evaluated classifier.
cv_classifier, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [None]:
%%time

# Naive Bayes

# Gaussian Naive Bayes, scored with 10-fold cross-validation on the
# training split using macro-averaged precision, recall and F1.
clf = GaussianNB()
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
scores_NB = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Average each metric over the folds
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

# Record the averages under the "NB" label
cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit on the complete training split
# (cross_validate only fits clones, so clf itself is still unfitted here)
clf.fit(X_train, y_train)

In [None]:
# Optional checkpoint - NB: persist the fitted model and read it back

import pickle

# The context managers close the file even if pickling fails part-way.
with open('fasttext_nb_rev.pckl', 'wb') as f:
    pickle.dump(clf, f)

# NOTE: only unpickle files written by this notebook; loading a pickle from
# an untrusted source can execute arbitrary code.
with open('fasttext_nb_rev.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

# Compute every metric exactly once and reuse it for the printout and the
# results lists. zero_division=0 is passed to all three macro metrics (it
# was previously set for precision only); the values are the same as with
# the default, but recall and F1 no longer emit UndefinedMetricWarning.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
nb_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
nb_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1_score:", nb_f1)

classifier.append("NB")
accuracy.append(nb_accuracy)
precision.append(nb_precision)
recall.append(nb_recall)
f1.append(nb_f1)

In [None]:
# Build the cross-validation results table (one row per classifier) and
# order it by CV F1-score, best first.
results_cv = pd.DataFrame(
    zip(cv_classifier, cv_precision, cv_recall, cv_f1),
    columns=['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'],
)
results_cv = results_cv.sort_values(by="CV_F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('fasttext_cv_results_nb.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

with open('fasttext_cv_results_nb.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Build the hold-out results table (one row per classifier) and order it by
# F1-score, best first.
results = pd.DataFrame(
    zip(classifier, accuracy, precision, recall, f1),
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)
results = results.sort_values(by="F1-score", ascending=False)

# Checkpoint: write the table to disk and read it straight back.
# The context managers close the file handles even if (un)pickling raises.
with open('fasttext_results_nb.pckl', 'wb') as f:
    pickle.dump(results, f)

with open('fasttext_results_nb.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Export both result tables (hold-out and cross-validation) as CSV files,
# leaving out the index column.
for _frame, _csv_path in (
    (results, 'fasttext_results_nb.csv'),
    (results_cv, 'fasttext_cv_results_nb.csv'),
):
    _frame.to_csv(_csv_path, index=False)