## 1. Import libraries and requirements

In [None]:
# %pip install flair
%pip install allennlp==0.9.0

Collecting allennlp==0.9.0
  Using cached allennlp-0.9.0-py3-none-any.whl (7.6 MB)
Collecting pytorch-transformers==1.1.0
  Using cached pytorch_transformers-1.1.0-py3-none-any.whl (158 kB)
Collecting flask>=1.0.2
  Using cached Flask-2.0.2-py3-none-any.whl (95 kB)
Collecting spacy<2.2,>=2.1.0
  Using cached spacy-2.1.9-cp38-cp38-linux_x86_64.whl
Collecting jsonpickle
  Using cached jsonpickle-2.1.0-py2.py3-none-any.whl (38 kB)
Collecting jsonnet>=0.10.0
  Using cached jsonnet-0.18.0-cp38-cp38-linux_x86_64.whl
Collecting editdistance
  Using cached editdistance-0.6.0-cp38-cp38-manylinux2010_x86_64.whl (286 kB)
Collecting unidecode
  Using cached Unidecode-1.3.2-py3-none-any.whl (235 kB)
Collecting responses>=0.7
  Using cached responses-0.17.0-py2.py3-none-any.whl (38 kB)
Collecting tensorboardX>=1.2
  Using cached tensorboardX-2.4.1-py2.py3-none-any.whl (124 kB)
Collecting nltk
  Using cached nltk-3.6.7-py3-none-any.whl (1.5 MB)
Collecting flask-cors>=3.0.7
  Using cached Flask_Cors-3

In [3]:
# Loading required packages
# import spacy
import pandas as pd
import re
# import gensim
# import gensim.downloader
# from gensim.models import Word2Vec
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec
# from gensim.models.phrases import Phrases, Phraser
# import nltk
from numpy import mean
from numpy import std
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt  
from sklearn.metrics import plot_confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from statistics import mean
import json
import csv
import ast
import numpy as np

from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings, ELMoEmbeddings

from tqdm import tqdm

# Set seed
# np.random.seed() returns None, so the integer is stored in `seed` first and
# NumPy's global generator is seeded from it. `seed` is passed further down as
# `random_state` to the scikit-learn estimators, which needs a real integer to
# make their results repeatable.
seed = 1
np.random.seed(seed)


# Select Spacy model
# Efficiency
# nlp = spacy.load("en_core_web_sm")

# Accuracy
# nlp = spacy.load("en_core_web_trf")

In [None]:
# Define columns and read annotated data
# Column names of the final feature table. The list mirrors the names assigned
# to `data.columns` after feature extraction (no duplicated 'like_num', and
# 'word_length' as the last feature).
columns = ['Token', 'Label', 'pos', 'ent_type', 'is_alpha', 'is_ascii', 'is_digit', 'is_lower', 'is_upper', 'is_title', 'is_punct', 'is_space', 'like_num', 'is_oov', 'is_stop', 'lang', 'sentiment', 'word_length']

# Tab-separated file without a header row; only the first 50 rows are read here.
data = pd.read_csv('FULL_Annotation_data_output.tsv', sep='\t', nrows=50, header = None)

In [None]:
%%time

# Generate linguistic features for each token
def feature_extraction(input_column, pipeline=None):
    """Collect per-token linguistic attributes for every entry of ``input_column``.

    Each entry is converted to ``str`` and passed to the pipeline. For every
    token the pipeline yields, one value is appended to each of the 16 feature
    lists.

    Parameters
    ----------
    input_column : iterable
        Values to analyse; each one is cast to ``str`` first.
    pipeline : callable, optional
        Callable that maps a string to an iterable of token objects exposing
        the attributes listed in ``attribute_names`` below. When omitted, the
        module-level ``nlp`` object is used, as in the original implementation.

    Returns
    -------
    list of list
        16 parallel lists in this order: pos, ent_type, is_alpha, is_ascii,
        is_digit, is_lower, is_upper, is_title, is_punct, is_space, like_num,
        is_oov, is_stop, lang, sentiment, word length.

    Raises
    ------
    NameError
        If ``pipeline`` is not given and no global ``nlp`` has been defined.

    Notes
    -----
    The word length is the length of the whole input string, not of the
    individual token.
    NOTE(review): when the pipeline splits one entry into several tokens, the
    result holds more rows than ``input_column`` - confirm this is intended
    before concatenating the result column-wise with the input data.
    """
    if pipeline is None:
        # Backward-compatible fallback to the notebook-global spaCy model.
        pipeline = nlp

    # Order matters: position i of the result holds attribute i.
    attribute_names = (
        'pos', 'ent_type', 'is_alpha', 'is_ascii', 'is_digit', 'is_lower',
        'is_upper', 'is_title', 'is_punct', 'is_space', 'like_num', 'is_oov',
        'is_stop', 'lang', 'sentiment',
    )
    # One list per attribute plus a final one for the word length.
    features = [[] for _ in range(len(attribute_names) + 1)]

    for entry in input_column:
        word = str(entry)
        for token in pipeline(word):
            for slot, attribute in enumerate(attribute_names):
                features[slot].append(getattr(token, attribute))
            features[-1].append(len(word))
    return features

# Run the extraction on column 0 of `data`; the result is a list of 16 parallel feature lists.
features = feature_extraction(data[0])

In [None]:
%%time

# Build a frame from the list of feature lists. The raw frame holds one row
# per feature, so it is flipped to get one row per token.
features = pd.DataFrame(features).T

# Attach the linguistic features to the annotated data, side by side
data = pd.concat([data, features], axis='columns')
data.columns = [
    'Token', 'Label', 'pos', 'ent_type', 'is_alpha', 'is_ascii', 'is_digit',
    'is_lower', 'is_upper', 'is_title', 'is_punct', 'is_space', 'like_num',
    'is_oov', 'is_stop', 'lang', 'sentiment', 'word_length',
]

data.head(10)

In [21]:
# Checkpoint: the feature table can optionally be written to disk at this point

# data.to_csv('data_features_full_dataset.csv', index = False)

# Reload one slice of the stored feature table: line 0 is the header,
# file lines 1..499999 are skipped and the next 250000 rows are read.
data = pd.read_csv(
    'complete_data_features_full_dataset.csv',
    header=0,
    skiprows=range(1, 500000),
    nrows=250000,
)

In [22]:
# init word embedding
# Creates flair's ELMoEmbeddings with the 'medium' option. The object is stored
# in the global `embedding`, which elmo_wordembed() reads when embedding tokens.
embedding = ELMoEmbeddings('medium')

In [None]:
%%time

# Generate word embeddings for each token

def elmo_wordembed(input_column):
    """Embed every entry of ``input_column`` with the global ``embedding`` model.

    Each entry is cast to ``str``, wrapped in a flair ``Sentence`` and handed
    to ``embedding.embed``. The vector of the last token of that sentence is
    stored as a plain Python list.

    Parameters
    ----------
    input_column : iterable
        Values to embed, one result is produced per value.

    Returns
    -------
    list
        One item per input entry: the embedding as a list, or ``np.nan`` when
        the entry produced no token or when embedding raised ``KeyError``,
        ``TypeError`` or ``IndexError``.
    """
    elmo_result = []
    for entry in tqdm(input_column):
        try:
            sentence = Sentence(str(entry))
            embedding.embed(sentence)

            # Reset for every entry: without this an entry that yields no
            # token would silently reuse the vector of the previous entry.
            last_vector = None
            for token in sentence:
                last_vector = token.embedding

            if last_vector is None:
                elmo_result.append(np.nan)
            else:
                elmo_result.append(last_vector.tolist())
        except (KeyError, TypeError, IndexError):
            # Best effort: mark the entry as missing and continue with the next one.
            elmo_result.append(np.nan)

    return elmo_result


# ELMo_Word_Embeddings
# One result per row of data['Token']: an embedding as a list, or np.nan when embedding failed.
word_embedding = elmo_wordembed(data['Token'])

 69%|██████▉   | 173525/250000 [10:33:37<6:59:39,  3.04it/s]

In [None]:
%%time

# Normalise the embedding list: every entry without a usable vector
# (zero length, or no length at all such as np.nan) becomes an empty list.

word_embedding_complete = []

for vector in tqdm(word_embedding):
    try:
        is_missing = len(vector) == 0
    except TypeError:
        # Objects without a length (the np.nan placeholders) count as missing.
        is_missing = True
    word_embedding_complete.append([] if is_missing else vector)

In [None]:
%%time

# Turn the cleaned embedding list into a frame
word_embedding_df = pd.DataFrame(data=word_embedding_complete)
word_embedding_series = word_embedding_df.apply(pd.Series)

# Append the word-embedding columns to the annotated data and its linguistic features
data = pd.concat([data, word_embedding_series], axis='columns')

In [None]:
# Preview the first 10 rows after the embedding columns were appended
data.head(10)

In [None]:
%%time

# Optionally the data can be saved to create a checkpoint
# Writes the current slice (annotated data plus the appended columns) to its own CSV file, without the index.

data.to_csv('data_features_full_wordembedding_elmo_500_750.csv', index = False)

# Alternative: reload a previously stored dataset instead of recomputing it
# data = pd.read_csv('data_features_full_complete_wordembedding_elmo.csv', na_values=['nan'])

data.head(10)

In [None]:
###############################

In [None]:
# Load the next slice of the feature table: line 0 is the header,
# file lines 1..749999 are skipped and the following 250000 rows are read.
data = pd.read_csv('complete_data_features_full_dataset.csv', header = 0, skiprows=range(1, 750000), nrows = 250000)

In [None]:
# init word embedding
# NOTE(review): an ELMoEmbeddings('medium') object is also created earlier in
# this notebook; creating it again looks redundant unless the kernel was
# restarted in between - confirm.
embedding = ELMoEmbeddings('medium')

In [None]:
%%time

# Generate word embeddings for each token

def elmo_wordembed(input_column):
    """Embed every entry of ``input_column`` with the global ``embedding`` model.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this definition replaces it when the cell runs. Consider keeping
    a single definition.

    Each entry is cast to ``str``, wrapped in a flair ``Sentence`` and handed
    to ``embedding.embed``. The vector of the last token of that sentence is
    stored as a plain Python list.

    Parameters
    ----------
    input_column : iterable
        Values to embed, one result is produced per value.

    Returns
    -------
    list
        One item per input entry: the embedding as a list, or ``np.nan`` when
        the entry produced no token or when embedding raised ``KeyError``,
        ``TypeError`` or ``IndexError``.
    """
    elmo_result = []
    for entry in tqdm(input_column):
        try:
            sentence = Sentence(str(entry))
            embedding.embed(sentence)

            # Reset for every entry: without this an entry that yields no
            # token would silently reuse the vector of the previous entry.
            last_vector = None
            for token in sentence:
                last_vector = token.embedding

            if last_vector is None:
                elmo_result.append(np.nan)
            else:
                elmo_result.append(last_vector.tolist())
        except (KeyError, TypeError, IndexError):
            # Best effort: mark the entry as missing and continue with the next one.
            elmo_result.append(np.nan)

    return elmo_result


# ELMo_Word_Embeddings
# One result per row of data['Token']: an embedding as a list, or np.nan when embedding failed.
word_embedding = elmo_wordembed(data['Token'])

In [None]:
%%time

# Normalise the embedding list: every entry without a usable vector
# (zero length, or no length at all such as np.nan) becomes an empty list.

word_embedding_complete = []

for vector in tqdm(word_embedding):
    try:
        is_missing = len(vector) == 0
    except TypeError:
        # Objects without a length (the np.nan placeholders) count as missing.
        is_missing = True
    word_embedding_complete.append([] if is_missing else vector)

In [None]:
%%time

# Convert list to dataframe
# Use the cleaned list (np.nan placeholders replaced by []), exactly like the
# conversion of the previous slice. Building the frame from the raw
# `word_embedding` list fails on np.nan entries, and selecting column 0 of a
# frame built from complete vectors keeps only the first embedding dimension.
word_embedding_df = pd.DataFrame(word_embedding_complete)
word_embedding_series = word_embedding_df.apply(pd.Series)

# We concat the annotated data and its linguistic features with the word embeddings
data = pd.concat([data, word_embedding_series], axis=1)

In [None]:
# Checkpoint: write this slice (annotated data plus the appended columns) to its own CSV file, without the index
data.to_csv('data_features_full_wordembedding_elmo_750_1000.csv', index = False)

In [None]:
# Drop the large objects of the embedding stage from the kernel namespace.
# NOTE(review): the cell that follows reads `data` again (data.replace(...));
# after this deletion it needs `data` to be loaded again first.
del data, embedding, word_embedding, word_embedding_df, word_embedding_series

In [None]:
%%time

# Fill every NaN with the string "0", then remove the 'Token' column,
# which is not used as a predictor.
# NOTE(review): `data` is deleted in the preceding cell, so it has to be
# loaded again before this cell can run.
data = data.replace(np.nan, '0', regex=True).drop(columns='Token')

data.head(10)

In [None]:
%%time

# Hold out the last 20% of the rows as test set (order is kept, no shuffling)
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

# Predictors: every column except the label; target: the label column
X = data.drop(columns=['Label']).values
y = data['Label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# The arrays X and y now hold everything needed; release the frame itself

del data

In [None]:
%%time

# Iteration cap handed to solvers that accept `max_iter`
max_iterations = 1_000_000_000

# Hold-out scores, one entry per evaluated classifier (independent lists)
classifier, accuracy, precision, recall, f1 = ([] for _ in range(5))

# Cross-validation scores, one entry per cross-validated classifier
cv_classifier, cv_precision, cv_recall, cv_f1 = ([] for _ in range(4))

In [None]:
%%time
# Baseline
# Reference model: scikit-learn's DummyClassifier with the "uniform" strategy

clf = DummyClassifier(random_state=seed, strategy="uniform")


# Fit on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - Baseline

import pickle

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_baseline.pckl', 'wb') as f:
    pickle.dump(clf, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_baseline.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

# Every metric is computed once and reused for printing and for the result
# lists. zero_division=0 is applied to all three macro scores so that classes
# without predictions are handled the same way everywhere.
test_scores = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
    "F1_score": f1_score(y_test, y_pred, average='macro', zero_division=0),
}

for metric_name, metric_value in test_scores.items():
    print(metric_name + ":", metric_value)

classifier.append("Baseline")
accuracy.append(test_scores["Accuracy"])
precision.append(test_scores["Precision"])
recall.append(test_scores["Recall"])
f1.append(test_scores["F1_score"])

In [None]:
%%time

# Logistic Regression: 10-fold cross-validation on the training split,
# followed by a fit on the whole training split.

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(
    solver='newton-cg',
    random_state=seed,
    max_iter=max_iterations,
)
scores_LR = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# `mean` is statistics.mean here: that import comes after `from numpy import mean`
LR_avg_precision, LR_avg_recall, LR_avg_f1 = (
    mean(scores_LR['test_precision_macro']),
    mean(scores_LR['test_recall_macro']),
    mean(scores_LR['test_f1_macro']),
)

# Record the fold averages
cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - LR

import pickle

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_lr.pckl', 'wb') as f:
    pickle.dump(clf, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_lr.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

# Every metric is computed once and reused for printing and for the result
# lists. zero_division=0 is applied to all three macro scores so that classes
# without predictions are handled the same way everywhere.
test_scores = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
    "F1_score": f1_score(y_test, y_pred, average='macro', zero_division=0),
}

for metric_name, metric_value in test_scores.items():
    print(metric_name + ":", metric_value)

classifier.append("LR")
accuracy.append(test_scores["Accuracy"])
precision.append(test_scores["Precision"])
recall.append(test_scores["Recall"])
f1.append(test_scores["F1_score"])

In [None]:
%%time

# Decision Tree: 10-fold cross-validation on the training split,
# followed by a fit on the whole training split.

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# `mean` is statistics.mean here: that import comes after `from numpy import mean`
DT_avg_precision, DT_avg_recall, DT_avg_f1 = (
    mean(scores_DT['test_precision_macro']),
    mean(scores_DT['test_recall_macro']),
    mean(scores_DT['test_f1_macro']),
)

# Record the fold averages
cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_dt.pckl', 'wb') as f:
    pickle.dump(clf, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_dt.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

# Every metric is computed once and reused for printing and for the result
# lists. zero_division=0 is applied to all three macro scores so that classes
# without predictions are handled the same way everywhere.
test_scores = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
    "F1_score": f1_score(y_test, y_pred, average='macro', zero_division=0),
}

for metric_name, metric_value in test_scores.items():
    print(metric_name + ":", metric_value)

classifier.append("DT")
accuracy.append(test_scores["Accuracy"])
precision.append(test_scores["Precision"])
recall.append(test_scores["Recall"])
f1.append(test_scores["F1_score"])

In [None]:
%%time

# Naive Bayes: 10-fold cross-validation on the training split,
# followed by a fit on the whole training split.

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# `mean` is statistics.mean here: that import comes after `from numpy import mean`
NB_avg_precision, NB_avg_recall, NB_avg_f1 = (
    mean(scores_NB['test_precision_macro']),
    mean(scores_NB['test_recall_macro']),
    mean(scores_NB['test_f1_macro']),
)

# Record the fold averages
cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)

In [None]:
# Optionally the data can be saved to create a checkpoint - NB

import pickle

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_nb.pckl', 'wb') as f:
    pickle.dump(clf, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_nb.pckl', 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

# Every metric is computed once and reused for printing and for the result
# lists. zero_division=0 is applied to all three macro scores so that classes
# without predictions are handled the same way everywhere.
test_scores = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
    "F1_score": f1_score(y_test, y_pred, average='macro', zero_division=0),
}

for metric_name, metric_value in test_scores.items():
    print(metric_name + ":", metric_value)

classifier.append("NB")
accuracy.append(test_scores["Accuracy"])
precision.append(test_scores["Precision"])
recall.append(test_scores["Recall"])
f1.append(test_scores["F1_score"])

In [None]:
# Imported here as well so the cell does not depend on an earlier checkpoint cell having run
import pickle

# Cross-validation summary: one row per classifier, best macro F1 first
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_cv_results.pckl', 'wb') as f:
    pickle.dump(results_cv, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_cv_results.pckl', 'rb') as f:
    results_cv = pickle.load(f)

In [None]:
# Imported here as well so the cell does not depend on an earlier checkpoint cell having run
import pickle

# Hold-out summary: one row per classifier, best macro F1 first
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

# Context managers close the file even when pickling or unpickling fails.
with open('elmo_results.pckl', 'wb') as f:
    pickle.dump(results, f)

# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open('elmo_results.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Save results dataframe
# Both summary tables are written as CSV without the index column.

results.to_csv('elmo_results.csv', index = False)
results_cv.to_csv('elmo_cv_results.csv', index = False)