In [39]:
%load_ext autoreload
%autoreload 2

import _pickle as cPickle
import random
from collections import defaultdict
from os import getcwd
from os import makedirs
from os.path import join, dirname

import numpy as np
import pandas as pd
import spacy
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

# Project directories, resolved relative to the repository root. The root is taken
# to be the parent of the kernel's current working directory.
PATH_REPO = dirname(getcwd())
PATH_UTILS = join(PATH_REPO, 'utils')
PATH_DATA = join(PATH_REPO, 'data')
PATH_MODELS = join(PATH_REPO, 'models')

import sys

# Make the modules in utils/ importable. The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once in the same kernel.
if PATH_UTILS not in sys.path:
    sys.path.append(PATH_UTILS)

# Do not truncate long text cells when DataFrames are displayed.
# The fully qualified option name is used: pandas resolves abbreviated names by
# pattern matching, which can become ambiguous if similar options are added later.
pd.set_option('display.max_colwidth', None)

# Single seed value shared by every RNG seeded in this cell.
SEED = 1
random.seed(SEED)
spacy.util.fix_random_seed(SEED)


from training import create_word2vec_dataset

from sklearn.metrics import precision_recall_fscore_support as score

# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is handed to
# create_word2vec_dataset when the train/test features are built.
# NOTE(review): presumably chosen for its word vectors — confirm against utils/training.
nlp = spacy.load('en_core_web_lg')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# --- Training set ------------------------------------------------------------
# Read the labelled examples, shuffle the rows, and make `label` an integer column.
train = (
    pd.read_csv(join(PATH_DATA, "grat_train_set.csv"), engine='python')
    # A fixed random_state makes the shuffle identical every time this cell runs,
    # independent of how much global RNG state earlier cells have consumed.
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    # Missing values become 0 in every column so that `label` can be cast to int.
    # NOTE(review): this also turns a missing `text` into the integer 0 — confirm
    # that the downstream feature builder tolerates that.
    .fillna(0)
)
train['label'] = train['label'].astype(int)

# --- Test set ----------------------------------------------------------------
# The first line of the file is skipped and the columns are named explicitly.
test = pd.read_csv(join(PATH_DATA, 'test_set_proc.csv'), names=['text', 'label'], skiprows=1)
# Binarise the label: exactly 'Gratitude' -> 1, any other value -> 0.
# NOTE(review): the match is case- and whitespace-sensitive — confirm the spelling used in the CSV.
test['label'] = np.where(test['label']=='Gratitude', 1, 0)


In [33]:
# Preview the shuffled training frame (pandas elides the middle rows of a long frame).
# NOTE(review): the rendered text contains mojibake such as "Donâ€™t" — confirm the
# CSV's encoding upstream (it may also be an artefact of exporting the notebook).
train

Unnamed: 0,text,label
0,Truly privileged to have been trusted to tell this story- huge thanks to USER &amp; USER USER USER USER - but so many more people have been involved USER USER bbcpn ðŸ‘‡ðŸ‘‡ðŸ‘‡,1
1,USER USER USER Go and close Jaguar Land Rover and its suppliers. Clearly breaking covid guidelines ðŸ‘,0
2,"TOMMO: Thanks, and you keep scribing too! Looking forward to hearing tales of you and your motor bike adventures, although I have heard that it has been a cold and wet winter in France, which is the same as it has been in the UK! And yes, it was indeed a miracle that I did not catch the COVID plague",1
3,"Donâ€™t know how itâ€™s happened, but actually having the best time of my life right now, despite the virus and restrictions. Lifeâ€™s what you make it and Iâ€™m most certainly making the most of my lot and creating magic from hardly anything. Start where you are, use what youâ€™ve got. ðŸ”¥",0
4,"Itâ€™s important to understand that what one says matters. Even more so if you have a platform, publisher, or influence. Check out the stream of utter provacative drivel spouted by USER last year (which he has now deleted, like an ass)",0
...,...,...
16136,(THREAD) COVID-19 UPDATE ~ SAFETY FIRST Everyone here at The Banner Hub wishes you a safe and well start to 2021 â€“ we just wanted to let you know what weâ€™re doing to operate safely as we begin another period of lockdown.,0
16137,"Wonderful service received today USER , 1st Covid vaccination received, painless and professionally given! Thank you.",1
16138,"Because of Corona, I have been staying much more time at home than earlier. Was intrigued by Home Theatre system and upgrading from my Samsung soundbar. I have been trying to educate myself on the same. I like Floor Standing speakers and wanted to hear from folks who have set up such a system. I read about Klipsch, Dali, Jamo, Denon etc but don't know enough about them. Can you share your advise on: Which floor standing speakers to consider? Which Amplifier would be recommended (same brand or another one). I have a big TV which I use to see movies and Netflix. Thank you for advise.",1
16139,"â€œMaking sure children get proper, healthy meals should be the top priority for ministers, not listening to lobbyists from the catering industry.â€ More by USER USER education educationnews",0


In [4]:
# Convert each frame into model inputs with the project helper imported from `training`.
# NOTE(review): presumably one spaCy-derived vector per row — confirm in create_word2vec_dataset.
X_train, Y_train = create_word2vec_dataset(train, nlp)
# (A validation split is not built here.)
# With the third positional argument set to False the result is bound to a single name
# rather than unpacked into a (features, labels) pair as above.
# NOTE(review): confirm that parameter's name and meaning in the helper.
X_test = create_word2vec_dataset(test, nlp, False)

In [5]:
# Linear-kernel support vector classifier with regularisation parameter C=0.5.
# `degree` and `gamma` are deliberately not passed: per scikit-learn's SVC
# documentation `degree` is used only by the 'poly' kernel and `gamma` only by
# 'rbf', 'poly' and 'sigmoid', so with kernel='linear' they had no effect on the
# fitted model and only suggested tuning that never happened.
classifier = svm.SVC(C=0.5, kernel='linear')
# fit() returns the estimator itself, so the cell still displays the fitted model.
classifier.fit(X_train, Y_train)

SVC(C=0.5, gamma='auto', kernel='linear')

In [6]:
# Predict on the test features and compare with the binarised test labels.
predictions = classifier.predict(X_test)
Y_test = test['label'].tolist()
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the order now matches the convention and the precision/recall call
# made later in this notebook.
print("SVM Accuracy Score -> ", accuracy_score(Y_test, predictions)*100)

SVM Accuracy Score ->  49.32821497120921


In [7]:
# Persist the fitted classifier to models/gratitude_classifier.pkl.
# The models/ directory may be absent (e.g. in a fresh checkout), in which case
# open(..., 'wb') would raise FileNotFoundError — create it first.
makedirs(PATH_MODELS, exist_ok=True)
# NOTE: unpickling executes arbitrary code; only load this file from a trusted location.
with open(join(PATH_MODELS, 'gratitude_classifier.pkl'), 'wb') as fid:
    cPickle.dump(classifier, fid)

In [8]:
# Macro-averaged precision, recall and F1 of the test-set predictions.
macro_scores = score(Y_test, predictions, average='macro')
precision, recall, fscore, support = macro_scores

# Pieces are joined with a single space, mirroring print()'s default separator,
# so the emitted text is byte-for-byte what a multi-argument print() would produce.
report_parts = ["Precision:\t", str(precision), "\n", "Recall:\t", str(recall), "\n", "F1:\t", str(fscore)]
print(" ".join(report_parts))

Precision:	 0.698823948681397 
 Recall:	 0.5869003115264797 
 F1:	 0.4544502617801047
