## 1. Import libraries and requirements

In [1]:
# Loading required packages
import spacy
import pandas as pd
import re
# import gensim
# import gensim.downloader
# from gensim.models import Word2Vec
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec
# from gensim.models.phrases import Phrases, Phraser
# import nltk
from numpy import mean
from numpy import std
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt  
from sklearn.metrics import plot_confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from statistics import mean
import json
import csv
import ast
import numpy as np

from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import FastTextEmbeddings

import fasttext
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English

from tqdm import tqdm

# Set seed
seed = np.random.seed(1)


# Select Spacy model
# Efficiency
nlp = spacy.load("en_core_web_sm")

# Accuracy
# nlp = spacy.load("en_core_web_trf")

In [2]:
# Define columns and read annotated data
columns = ['Token', 'Label', 'pos', 'ent_type', 'is_alpha', 'is_ascii', 'is_digit', 'is_lower', 'is_upper', 'is_title', 'is_punct', 'is_space', 'like_num', 'is_oov', 'is_stop', 'like_num', 'lang', 'sentiment']
data = pd.read_csv('FULL_Annotation_data_output.tsv', sep='\t', nrows=50, header = None)

In [3]:
%%time

# Generate linguistic features for each token
def feature_extraction(input_column):
    features = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
    for i in tqdm(input_column):
        i = str(i)
        doc = nlp(i)
        word = i
        for token in doc:
            features[0].append(token.pos)
            features[1].append(token.ent_type)
            features[2].append(token.is_alpha)
            features[3].append(token.is_ascii)
            features[4].append(token.is_digit)
            features[5].append(token.is_lower)
            features[6].append(token.is_upper)
            features[7].append(token.is_title)
            features[8].append(token.is_punct)
            features[9].append(token.is_space)
            features[10].append(token.like_num)
            features[11].append(token.is_oov)
            features[12].append(token.is_stop)
            features[13].append(token.lang)
            features[14].append(token.sentiment)
            features[15].append(len(word))
    return features

features = feature_extraction(data[0])

100%|██████████| 50/50 [00:00<00:00, 195.52it/s]

CPU times: user 255 ms, sys: 1.87 ms, total: 257 ms
Wall time: 259 ms





In [4]:
%%time

# Convert list to dataframe
features = pd.DataFrame(features)

# We need to transpose this dataframe first
features = features.transpose()

# We concat the annotated data with the linguistic features
data = pd.concat([data, features], axis=1)
data.columns = ['Token', 'Label', 'pos', 'ent_type', 'is_alpha', 'is_ascii', 'is_digit', 'is_lower', 'is_upper', 'is_title', 'is_punct', 'is_space', 'like_num', 'is_oov', 'is_stop', 'lang', 'sentiment', 'word_length']

data.head(10)

CPU times: user 7.85 ms, sys: 0 ns, total: 7.85 ms
Wall time: 7.71 ms


Unnamed: 0,Token,Label,pos,ent_type,is_alpha,is_ascii,is_digit,is_lower,is_upper,is_title,is_punct,is_space,like_num,is_oov,is_stop,lang,sentiment,word_length
0,food,O,92,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,4
1,a,O,95,0,True,True,False,True,False,False,False,False,False,True,True,14626626061804382878,0.0,1
2,fast,O,86,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,4
3,grow,O,100,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,4
4,-winne,O,97,0,False,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,6
5,online,O,86,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,6
6,food,O,92,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,4
7,community,O,92,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,9
8,and,O,89,0,True,True,False,True,False,False,False,False,False,True,True,14626626061804382878,0.0,3
9,crowd,O,92,0,True,True,False,True,False,False,False,False,False,True,False,14626626061804382878,0.0,5


In [5]:
# Optionally the data can be saved to create a checkpoint

data.to_csv('data_features_full_dataset.csv', index = False)

data = pd.read_csv('data_features_full_dataset.csv', header = 0)

In [6]:
# init word embedding
embedding = FastTextEmbeddings('cc.en.300.bin')

KeyboardInterrupt: 

In [None]:
%%time

# Generate word embeddings for each token

def fasttext_wordembed(input_column):
    fasttext_result =[]
    counter = 0
    for i in tqdm(input_column):
        counter = counter +1
        try:
            i = str(i)
            token = Sentence(i)
            embedding.embed(token)
            for token in token:
                result_array = token.embedding
            result_list = result_array.tolist()
            fasttext_result.append(result_list)
        except KeyError:
            fasttext_result.append(np.nan)
        except TypeError: 
            fasttext_result.append(np.nan)
        except IndexError:
            fasttext_result.append(np.nan)

    return fasttext_result


# fasttext_Word_Embeddings
word_embedding = fasttext_wordembed(data['Token'])

In [None]:
%%time

# Replace empty values in the list of word embeddings for words 
# we couldn't generate a word embedding for

word_embedding_complete = []

for i in tqdm(word_embedding):
    try:
        if len(i) == 0:
            i = []
        else:
            i = i
        word_embedding_complete.append(i)
        
    except TypeError:
        i = []
        word_embedding_complete.append(i)

In [54]:
%%time

# Convert list to dataframe
word_embedding_df = pd.DataFrame(word_embedding_complete)
word_embedding_series = word_embedding_df.apply(pd.Series)

# We concat the annotated data with the linguistic features
data = pd.concat([data, word_embedding_series], axis=1)

CPU times: user 54.5 ms, sys: 113 µs, total: 54.6 ms
Wall time: 54.9 ms


In [55]:
%%time

# Optionally the data can be saved to create a checkpoint

data.to_csv('data_features_full_wordembedding_fasttext.csv', index = False)

data = pd.read_csv('data_features_full_wordembedding_fasttext.csv', na_values=['nan'])

CPU times: user 47.3 ms, sys: 1.88 ms, total: 49.2 ms
Wall time: 57.9 ms


In [56]:
%%time

# Replace NaN values with a "0"

data = data.replace(np.nan, '0', regex=True)

# We drop the token, as it is no longer needed for prediction
data.drop('Token', axis=1, inplace=True)
data.head(10)

CPU times: user 2.34 ms, sys: 67 µs, total: 2.41 ms
Wall time: 2.23 ms


Unnamed: 0,Label,pos,ent_type,is_alpha,is_ascii,is_digit,is_lower,is_upper,is_title,is_punct,...,290,291,292,293,294,295,296,297,298,299
0,O,92,0,True,True,False,True,False,False,False,...,-0.039578,0.047575,-0.058627,0.046259,0.023654,0.051156,-0.025376,0.194072,0.064826,-0.043009
1,O,90,0,True,True,False,True,False,False,False,...,0.428693,0.140281,-0.160824,0.140865,0.010901,-0.006161,-0.024191,0.534851,-0.05581,-0.016598
2,O,86,0,True,True,False,True,False,False,False,...,0.088452,0.04998,-0.188911,0.00801,0.005692,0.030967,0.088043,0.132366,-0.068855,-0.109943
3,O,100,0,True,True,False,True,False,False,False,...,0.079889,-0.001241,-0.07871,0.056499,-0.040299,0.026815,0.016724,0.003833,0.141033,0.139362
4,O,97,0,False,True,False,True,False,False,False,...,0.02626,-0.142785,-0.139935,0.030903,0.135329,-0.037944,-0.159724,0.062729,-0.032213,-0.062005
5,O,86,0,True,True,False,True,False,False,False,...,-0.011911,0.032228,-0.046799,-0.049946,-0.098857,0.027563,0.032633,0.078089,-0.021893,0.01068
6,O,92,0,True,True,False,True,False,False,False,...,-0.039578,0.047575,-0.058627,0.046259,0.023654,0.051156,-0.025376,0.194072,0.064826,-0.043009
7,O,92,0,True,True,False,True,False,False,False,...,0.001633,0.01228,-0.018961,0.068939,-0.038316,0.001767,0.018424,0.039195,0.033368,0.054643
8,O,89,0,True,True,False,True,False,False,False,...,0.060722,-0.035507,-0.02382,-0.027755,-0.026023,-0.038051,-0.021103,-0.011591,-0.041129,0.036252
9,O,100,0,True,True,False,True,False,False,False,...,0.037642,-0.09254,-0.01317,0.092988,0.080099,-0.0308,0.069824,0.092307,-0.001399,-0.007892


In [57]:
%%time

# 80% / 20% split
# Train, Test = train_test_split(data1, test_size=0.2, shuffle=False)

X = data.drop(['Label'],axis=1).values # independant features
y = data['Label'].values # dependant variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

CPU times: user 3.83 ms, sys: 0 ns, total: 3.83 ms
Wall time: 3.38 ms


In [58]:
# Delete data to save memory

del(data)

In [59]:
%%time

max_iterations = 1000000000

classifier = []
accuracy = []
precision = []
recall = []
f1 = []

cv_classifier = []
cv_precision = []
cv_recall = []
cv_f1 = []

CPU times: user 27 µs, sys: 5 µs, total: 32 µs
Wall time: 35 µs


In [60]:
%%time
# Baseline

clf = DummyClassifier(strategy="uniform", random_state=seed)


# Model fit
clf.fit(X_train, y_train)

CPU times: user 517 µs, sys: 0 ns, total: 517 µs
Wall time: 465 µs


DummyClassifier(strategy='uniform')

In [61]:
# Optionally the data can be saved to create a checkpoint - Baseline

import pickle

f = open('fasttext_baseline.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('fasttext_baseline.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [75]:
# Evaluation Baseline

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("Baseline")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1_score: 1.0


In [76]:
%%time

# Logistic Regression

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = LogisticRegression(solver='newton-cg', random_state=seed, max_iter=max_iterations)
scores_LR = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
LR_avg_precision = mean(scores_LR['test_precision_macro'])
LR_avg_recall = mean(scores_LR['test_recall_macro'])
LR_avg_f1 = mean(scores_LR['test_f1_macro'])

cv_classifier.append("LR")
cv_precision.append(LR_avg_precision)
cv_recall.append(LR_avg_recall)
cv_f1.append(LR_avg_f1)

# Model fit
clf.fit(X_train, y_train)



CPU times: user 377 ms, sys: 1.8 s, total: 2.17 s
Wall time: 891 ms




LogisticRegression(max_iter=1000000000, solver='newton-cg')

In [77]:
# Optionally the data can be saved to create a checkpoint - LR

import pickle

f = open('fasttext_lr.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('fasttext_lr.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [78]:
# Evaluation Logistic Regression

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("LR")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1_score: 1.0


In [79]:
%%time

# Decision Tree

# Cross validation
scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = DecisionTreeClassifier(random_state=seed)
scores_DT = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
DT_avg_precision = mean(scores_DT['test_precision_macro'])
DT_avg_recall = mean(scores_DT['test_recall_macro'])
DT_avg_f1 = mean(scores_DT['test_f1_macro'])

cv_classifier.append("DT")
cv_precision.append(DT_avg_precision)
cv_recall.append(DT_avg_recall)
cv_f1.append(DT_avg_f1)

# Model fit
clf.fit(X_train, y_train)



CPU times: user 35.8 ms, sys: 2.24 ms, total: 38.1 ms
Wall time: 829 ms


DecisionTreeClassifier()

In [80]:
# Optionally the data can be saved to create a checkpoint - DT

import pickle

f = open('fasttext_dt.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('fasttext_dt.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [81]:
# Evaluation Decision Tree

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("DT")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

Accuracy: 0.9
Precision: 0.5
Recall: 0.45
F1_score: 0.4736842105263158


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
%%time

# Naive Bayes

scoring = ['precision_macro', 'recall_macro', "f1_macro"]
clf = GaussianNB()
scores_NB = cross_validate(clf, X_train, y_train, scoring = scoring, cv=10, n_jobs=-1)
NB_avg_precision = mean(scores_NB['test_precision_macro'])
NB_avg_recall = mean(scores_NB['test_recall_macro'])
NB_avg_f1 = mean(scores_NB['test_f1_macro'])

cv_classifier.append("NB")
cv_precision.append(NB_avg_precision)
cv_recall.append(NB_avg_recall)
cv_f1.append(NB_avg_f1)

# Model fit
clf.fit(X_train, y_train)



CPU times: user 35.5 ms, sys: 1.88 ms, total: 37.4 ms
Wall time: 760 ms


GaussianNB()

In [83]:
# Optionally the data can be saved to create a checkpoint - NB

import pickle

f = open('fasttext_nb.pckl', 'wb')
pickle.dump(clf, f)
f.close()

f = open('fasttext_nb.pckl', 'rb')
clf = pickle.load(f)
f.close()

In [84]:
# Evaluation Naive Bayes

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro',zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1_score:", f1_score(y_test, y_pred, average='macro'))

classifier.append("NB")
accuracy.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred, average='macro',zero_division=0))
recall.append(recall_score(y_test, y_pred, average='macro'))
f1.append(f1_score(y_test, y_pred, average='macro'))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1_score: 1.0


In [85]:
results_cv = pd.DataFrame(zip(cv_classifier, cv_precision, cv_recall, cv_f1), columns = ['CV_Classifier', 'CV_Precision', 'CV_Recall', 'CV_F1-score'])
results_cv = results_cv.sort_values(by = "CV_F1-score", ascending = False)

f = open('fasttext_cv_results.pckl', 'wb')
pickle.dump(results_cv, f)
f.close()

f = open('fasttext_cv_results.pckl', 'rb')
results_cv = pickle.load(f)
f.close()

  CV_Classifier  CV_Precision  CV_Recall  CV_F1-score
1            DT        0.8875     0.8875     0.885714
4            DT        0.7875     0.7625     0.771429
2            NB        0.6875     0.6250     0.647619
5            NB        0.6875     0.6250     0.647619
0            LR           NaN        NaN          NaN
3            LR           NaN        NaN          NaN


In [86]:
results = pd.DataFrame(zip(classifier, accuracy, precision, recall, f1), columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
results = results.sort_values(by = "F1-score", ascending = False)

f = open('fasttext_results.pckl', 'wb')
pickle.dump(results, f)
f.close()

f = open('fasttext_results.pckl', 'rb')
results = pickle.load(f)
f.close()

  Classifier  Accuracy  Precision  Recall  F1-score
1         LR       1.0        1.0    1.00  1.000000
3         NB       1.0        1.0    1.00  1.000000
4   Baseline       1.0        1.0    1.00  1.000000
5         LR       1.0        1.0    1.00  1.000000
7         NB       1.0        1.0    1.00  1.000000
2         DT       0.9        0.5    0.45  0.473684
6         DT       0.9        0.5    0.45  0.473684
0   Baseline       0.4        0.5    0.20  0.285714


In [87]:
# Save results dataframe
results.to_csv('fasttext_results.csv', index = False)
results_cv.to_csv('fasttext_cv_results.csv', index = False)