### read data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import re

In [3]:
dataset = pd.read_csv('/media/shashanks/3E0CD8C50CD878FB/CDAC/My work/CDAC PROJECTS/MACHINE LEARNING/Emotional-Support-Chatbot/Data Collection/emotions_dataset.csv')
dataset.head()


Unnamed: 0.1,Unnamed: 0,Post_id,Title,Text,emotion,subreddit
0,0,jhc5fc,"I’ve wasted so much time being depressed, that...",I can’t imagine how many days I have wasted to...,depressed,depression
1,1,k47q9a,Anhedonia is the worst part of depression,"You're alive, but you aren't living. You feel ...",depressed,depression
2,2,i2h6dv,"Life doesn't ""get better"" unless you take acti...","Exercise/physical activity, eating healthy, sp...",depressed,depression
3,3,g6jgq0,High functioning depression is so easily over ...,I try and do anything and everything all day. ...,depressed,depression
4,4,kgmy1e,I secretly hope that I will be diagnosed with ...,"Hey Reddit. Drunk me here, mainly looking to v...",depressed,depression


In [4]:
# checking duplicate posts
dataset.duplicated('Post_id').sum() # 0 => no duplicate posts

# beacause in the scapping code we mentioned 
# if post.id in seen_ids:
#     continue

0

In [5]:
# since our focus is on Text and its emotion 
# lets pick them 

df = dataset[['Text' , 'emotion']]
df.head()

Unnamed: 0,Text,emotion
0,I can’t imagine how many days I have wasted to...,depressed
1,"You're alive, but you aren't living. You feel ...",depressed
2,"Exercise/physical activity, eating healthy, sp...",depressed
3,I try and do anything and everything all day. ...,depressed
4,"Hey Reddit. Drunk me here, mainly looking to v...",depressed


In [None]:
# seeing unique emotions
df['emotion'].value_counts()

# these are the emotions and number of corresponding entries per emotion

emotion
fearful      4019
depressed    3990
happy        3834
regret       3559
angry        3507
sad          3441
surprised    2605
neutral      2261
disgusted    2203
Name: count, dtype: int64

In [7]:
# lets convert all this text into lowercase
df['Text'] = df['Text'].str.lower()

In [8]:
df.head()

Unnamed: 0,Text,emotion
0,i can’t imagine how many days i have wasted to...,depressed
1,"you're alive, but you aren't living. you feel ...",depressed
2,"exercise/physical activity, eating healthy, sp...",depressed
3,i try and do anything and everything all day. ...,depressed
4,"hey reddit. drunk me here, mainly looking to v...",depressed


In [9]:
# removing any url from the text if present
df['Text'] = df['Text'].apply(lambda sentence : re.sub('\b(?:https?|ftp|ssh)://\S+', '' , str(sentence)))

In [10]:
# check for extra white spaces
df['Text'] = df['Text'].apply(lambda sentence : ' '.join(sentence.split()))

# how this is working

# sentence = 'this is a   amazing    method  to remove  extra  spaces.    '
# words = sentence.split() # ['this', 'is', 'a', 'amazing', 'method', 'to', 'remove', 'extra', 'spaces.']
# ' '.join(words) # 'this is a amazing method to remove extra spaces.


####  lets do lemmatization


In [11]:
# required packages
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords , wordnet
import nltk

In [12]:
# also we will keep in mind we lemmatize only those words which are not stopwords of english language
lemmatizer = WordNetLemmatizer()

stopwords_eng = stopwords.words('english')
corpus = []


# defining pos_tag mapper for WordNetLemmatizer as it does not understand default pos_tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for i in range(len(df)):
    
    # removing the non alphabetic characters
    text_reviewed = re.sub('[^a-zA-Z]' , ' ' , df['Text'][i])
    # [^a-zA-Z] => replace anything except alphabets with ' '

    # extract words
    words = text_reviewed.split()

    # lets provide them pos_tag
    words_pos_tag = nltk.pos_tag(words)
    # it is a list of tuples , where each tuple contains word and its pos tag

    # pos_tag gives the tags in the format : NN , VB , VBZ , JJ , but WorNetLemmatizer excepts them in its format.
    # it supports only four tags : n , v , a , r

    # lemmatize the word if it is not stopword
    lemmatized_words = [lemmatizer.lemmatize(word = tup[0] , pos = get_wordnet_pos(tup[1]))
                    for tup in words_pos_tag if tup[0] not in stopwords_eng]
    
    # join these lemmatized words to get back a sentence
    corpus.append(' '.join(lemmatized_words))


In [13]:
corpus[:5]

['imagine many day waste sad cry able accomplish thing need responsibility deadline none seem matter dark episodes force need do feel hopeless numb',
 'alive living feel like something anything nothing appeal everything feel like chore exhaust thing enjoy work since everything feel like watch paint dry realising slowly begin lose interest thing enjoy realise nothing mentally torture like try catch smoke bare hand hopeless thing look forward death',
 'exercise physical activity eat healthy spending time nature find hobby thing suppose good mental health definition depression problem precisely inhibit inability function much less thing good find therapist trialling multiple therapist see one suit best even start antidepressant check doctor psychiatrist every week change medication one might fuck head even even work take much fucking effort bother go counsellor university earlier year suggest thing like group therapy anxiety volunteer cause give people sense purpose day day life know tell

In [47]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['emotion'] = encoder.fit_transform(df['emotion'])

In [54]:
encoder.classes_

array(['angry', 'depressed', 'disgusted', 'fearful', 'happy', 'neutral',
       'regret', 'sad', 'surprised'], dtype=object)

In [14]:
# now its time to divide the data into training and testing set 
# because we don't want any data leakage by training word2vec on entore corpus first and then splitting

# e.g. if we have a test point like : "I feel really sad."
# and our word2vec model has already seen similar words or complete phrase while training the entire corpus
# then we will be getting very good embeddings , leading to higher accuracy later on
# but this accuracy is not the correct accuracy becasue our model already knoew that data , hence lead to overfitting
# so to avoid this , lets first split

In [48]:
from sklearn.model_selection import train_test_split
X_train , X_test , Y_train , Y_test = train_test_split(df['Text'] , df['emotion'] , train_size=0.8 , random_state=123456)

In [16]:
for i in range(len(X_train)):
    print(X_train[i])
#index issue

i can’t imagine how many days i have wasted to being sad and crying and not being able to accomplish the things i need to do. i have responsibilities i have deadlines and none of them seem to matter when i am in these dark episodes. how can you force yourself to do what needs to be done when you feel so hopeless and numb?
you're alive, but you aren't living. you feel like doing something, anything, but nothing is appealing. everything feels like a chore, and it's exhausting. "do things that you enjoy doing" doesn't work, since everything feels like watching paint dry. realising that you're slowly beginning to lose interest in the few things you enjoy doing and realising that there's nothing you can do about it is mentally torturing. it's like trying to catch smoke with your bare hands; it's hopeless. the only thing you look forwards to is death.
exercise/physical activity, eating healthy, spending time in nature, finding a hobby are all things that are supposed to "be good for your men

KeyError: 4

In [17]:
X_train

15245    i constantly see people on facebook spamming t...
25827    old chinese wisdom :a moral story a farmer in ...
6874     back in the early 1990's, i was 14 years old a...
3703     i met my ex husband at 15, i fell for him imme...
28468    is it possible for me to watermark hundreds of...
                               ...                        
3640     my (20m) little brother (16m) has suffered fro...
3121     i'll just get straight to the point, three mon...
20978    i'm going to try laying it a bit bare here. [i...
23274    good day everyone, i would like to take your a...
6209     my boyfriend is an ra at our university and hi...
Name: Text, Length: 23535, dtype: object

In [18]:
X_train.reset_index().drop('index' , axis=1)
# lets save it as well and similarly do it for all

Unnamed: 0,Text
0,i constantly see people on facebook spamming t...
1,old chinese wisdom :a moral story a farmer in ...
2,"back in the early 1990's, i was 14 years old a..."
3,"i met my ex husband at 15, i fell for him imme..."
4,is it possible for me to watermark hundreds of...
...,...
23530,my (20m) little brother (16m) has suffered fro...
23531,"i'll just get straight to the point, three mon..."
23532,i'm going to try laying it a bit bare here. [i...
23533,"good day everyone, i would like to take your a..."


In [49]:
X_train = X_train.reset_index().drop('index' , axis=1)
Y_train = Y_train.reset_index().drop('index' , axis=1)
X_test = X_test.reset_index().drop('index' , axis=1)
Y_test = Y_test.reset_index().drop('index' , axis=1)

In [20]:
X_test

Unnamed: 0,Text
0,sometimes you feel all lonely and/or sad and f...
1,in the wake of recent events i've read a lot o...
2,"i was crossing the street, looked left and rig..."
3,preface: i’m a big fan of his work (and art in...
4,so last night i was at a little party and i de...
...,...
5879,so i’m a gay man. totally at ease with it. i l...
5880,i think about this sometimes during long thund...
5881,a prime example. i came out of welcome to racc...
5882,never ever thought that it was mark hamill tha...


In [21]:
# now lets convert X_train into vector using word2vec
# not passing the X_test because we dont want the model to get the idea of test set before hand

In [22]:
# using word2vec to convert text into vector 
from gensim.models import Word2Vec

In [23]:
# lets train word2vec model from scratch
# our corpus right now is -> list of sentences
# but Word2Vec requires list of tokenized sentences i.e., list of list of tokens

tokenized_sentences = [sentence.split() for sentence in X_train['Text']]

word2vec_model = Word2Vec(sentences=tokenized_sentences , vector_size = 250)

# sentences -> list of tokenized sentences
# vector_Size = dimensionof vector

In [24]:
# vocabulary of my X_train
word2vec_model.wv.index_to_key

['i',
 'to',
 'and',
 'the',
 'a',
 'my',
 'of',
 'that',
 'was',
 'in',
 'it',
 'for',
 'you',
 'me',
 'but',
 'is',
 'with',
 'this',
 'have',
 'so',
 'he',
 'she',
 'on',
 'just',
 'be',
 'not',
 'her',
 'at',
 'we',
 'as',
 'about',
 'like',
 'had',
 'or',
 'they',
 'all',
 'if',
 'when',
 "i'm",
 'what',
 'out',
 'are',
 'because',
 'up',
 'get',
 'do',
 'from',
 'would',
 'your',
 'how',
 'know',
 'people',
 'been',
 'one',
 'an',
 'even',
 'feel',
 'him',
 'can',
 'his',
 "don't",
 'no',
 'time',
 'who',
 'some',
 'want',
 'more',
 'really',
 'never',
 'will',
 'am',
 'were',
 "it's",
 'there',
 'has',
 'by',
 'them',
 'got',
 'going',
 'being',
 'our',
 'after',
 'then',
 'go',
 'could',
 'think',
 'it.',
 'me.',
 'their',
 'told',
 'only',
 'much',
 'i’m',
 'into',
 'back',
 'make',
 'still',
 'things',
 'now',
 'other',
 "didn't",
 'life',
 "i've",
 'over',
 'said',
 'did',
 'something',
 'very',
 'see',
 'any',
 'than',
 'myself',
 'always',
 'good',
 'made',
 'also',
 'ever

In [25]:
# vocab size
word2vec_model.corpus_count

23535

In [26]:
word2vec_model.wv['call'] # see how our vector of 250 dimensions look like

array([-2.74061620e-01, -1.13216209e+00, -2.90534496e+00, -1.31299233e+00,
        7.96692848e-01, -1.31234789e+00,  4.20345128e-01,  1.51930153e-01,
        6.56283855e-01,  3.03263098e-01,  1.33362556e+00,  6.39973462e-01,
       -2.96796679e-01,  1.06594801e+00,  6.27940357e-01,  4.21255268e-02,
       -3.64112645e-01,  2.05537021e-01, -4.50473686e-04, -6.17356122e-01,
        1.31563678e-01, -2.43091464e+00,  1.36672711e+00, -3.89994204e-01,
        3.00560445e-01,  3.80706906e-01, -2.91409403e-01,  5.81164598e-01,
        9.00105909e-02,  7.19284356e-01, -1.07735074e+00, -1.41918492e+00,
        4.63737547e-01,  4.54261247e-03, -1.24501216e+00, -2.48068929e+00,
       -2.80575067e-01, -1.08476905e-02,  6.58872545e-01,  1.36725783e+00,
        4.02355343e-01,  4.09904450e-01, -6.94465712e-02,  1.47904277e+00,
        1.55295742e+00,  1.20790374e+00,  5.00590682e-01, -9.92810130e-01,
       -1.07051122e+00, -6.90518737e-01,  6.48398459e-01, -2.38006949e+00,
       -9.32627618e-02, -

### Using AvgWord2Vec

In [27]:
# why ?

# because
tokenized_sentences[0]

# right now each word is getting converted into a vector which has 100 dimension
# if we want the entire sentence to get converted into 1 single vector of 100 dimension , then we will take the avg.
# i-th dimension of resultant vector will be avg of i-th dimension of all the vectors of individual words 
# resultant_vector[i] = avg(v1[i] , v2[i] , .....)

['i',
 'constantly',
 'see',
 'people',
 'on',
 'facebook',
 'spamming',
 'their',
 'little',
 'mutt',
 'on',
 'their',
 'page',
 'with',
 'some',
 'kind',
 'of',
 'bodily',
 'waste',
 'running',
 'down',
 'themselves,',
 'whether',
 "it's",
 'piss,',
 'snot',
 'or',
 'spit.',
 "it's",
 'all',
 'fucking',
 'gross',
 'and',
 'makes',
 'me',
 'wanna',
 'gag.',
 'i',
 "don't",
 'know',
 'what',
 'they',
 'find',
 'cute',
 'about',
 'it.',
 'on',
 'a',
 'related',
 'note,',
 'birthday',
 'cakes.',
 'everyone',
 'else',
 'starts',
 'cooing',
 'and',
 'awwing',
 'when',
 'the',
 'toddler,',
 'with',
 'a',
 'finger',
 'ridden',
 'with',
 'snot',
 'and',
 'shit',
 "that's",
 'probably',
 'been',
 'in',
 'his',
 'diaper',
 'for',
 '2',
 'hours,',
 'starts',
 'smearing',
 'his',
 'finger',
 'in',
 'the',
 'cake',
 'that',
 'everyone',
 'else',
 'is',
 'gonna',
 'eat.',
 'no',
 'repercussions,',
 'no',
 'consequences,',
 'everyone',
 'just',
 'happily',
 'eats',
 'the',
 'disgusting',
 'cake.']

In [30]:
import numpy as np
def avg_word2vec(document):

    vocab = word2vec_model.wv.index_to_key

    # if the word is in vocab only then create its vector
    # creating a list of vectors of all the words in a sentence/document
    vectors_document = [word2vec_model.wv[word] for word in document if word in vocab]

    # converting it to array
    array_document = np.array(vectors_document)

    if not vectors_document:
        return np.zeros(word2vec_model.vector_size)
    
    # otherwise return the mean vector
    return np.mean(array_document , axis=0)

In [31]:
# to see the progress
from tqdm import tqdm

X_train_new = []
# applying avg_word2vec on entire sentence
for sentence in tqdm(tokenized_sentences):
    X_train_new.append(avg_word2vec(sentence))

  0%|          | 0/23535 [00:00<?, ?it/s]

100%|██████████| 23535/23535 [14:02<00:00, 27.93it/s]


In [33]:
len(X_train_new)

23535

In [34]:
X_train_new[0] # this is the vector representation of our first word

array([ 3.79912674e-01, -5.90954423e-01,  4.45225537e-01, -3.72245580e-01,
        2.64145378e-02, -4.10711437e-01,  2.30983160e-02,  2.60745257e-01,
       -3.48573066e-02,  1.90578818e-01,  1.03033692e-01,  2.72665489e-02,
       -3.47765833e-01, -6.07445687e-02, -8.55252668e-02, -1.53688252e-01,
        1.08806111e-01,  4.95494492e-02,  8.84844884e-02,  5.35102822e-02,
        1.55950531e-01,  2.29943588e-01,  4.64942344e-02, -3.86003591e-02,
       -1.83471441e-01,  1.91023961e-01, -1.32702023e-01, -2.59376168e-01,
       -1.70553476e-01,  3.89798462e-01, -1.92828238e-01,  1.54492795e-01,
       -2.35888913e-01, -1.70701534e-01,  4.50831838e-02, -1.78148702e-01,
       -1.03662601e-02,  5.06278202e-02,  1.46593302e-01, -2.56665975e-01,
        3.92356098e-01,  1.01448558e-01, -1.35214135e-01,  2.51872391e-01,
        2.83934295e-01, -9.45845917e-02,  4.53401096e-02,  1.52713344e-01,
        1.80990338e-01, -2.98866332e-02, -2.65645295e-01,  8.65659937e-02,
       -8.85044932e-02,  

In [57]:
# let me save my training set
training_data = pd.DataFrame(X_train_new)
training_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.379913,-0.590954,0.445226,-0.372246,0.026415,-0.410711,0.023098,0.260745,-0.034857,0.190579,...,0.033488,0.025612,0.0793,-0.453153,-0.135339,-0.253434,-0.118945,0.256364,-0.166853,-0.26096
1,0.323991,-0.519815,0.377424,-0.534864,0.066183,-0.452272,-0.073674,0.497239,0.128706,0.358055,...,0.059999,0.136055,-0.042111,-0.23598,-0.016412,-0.035141,-0.032346,0.393198,-0.361641,-0.19168
2,0.470891,-0.706433,0.280104,-0.504282,0.323466,-0.730467,0.135459,0.460936,0.126155,0.348441,...,-0.114902,0.196887,0.016223,-0.576676,-0.036948,0.011567,0.001991,0.594631,-0.468152,-0.564023
3,0.294158,-0.747437,0.355648,-0.554944,0.25286,-0.670446,0.154853,0.584519,0.148885,0.36558,...,-0.155857,0.183379,-0.055441,-0.470433,0.056903,0.031841,0.031261,0.483978,-0.470758,-0.591772
4,0.287419,-0.676872,0.378734,-0.631298,-0.011777,-0.682109,0.091714,0.358397,0.000933,0.27537,...,-0.096478,0.126026,0.000288,-0.760246,0.158944,-0.014274,-0.035016,0.461706,-0.415457,-0.326376


In [60]:
# each row represents my vector representation of the corresponding sentence in X_train set
# each vector has 250 dimensions
len(training_data)

23535

In [61]:
# lets append the Y_train column as well into this dataframe
training_data['emotion'] = Y_train['emotion']
training_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,emotion
0,0.379913,-0.590954,0.445226,-0.372246,0.026415,-0.410711,0.023098,0.260745,-0.034857,0.190579,...,0.025612,0.0793,-0.453153,-0.135339,-0.253434,-0.118945,0.256364,-0.166853,-0.26096,2
1,0.323991,-0.519815,0.377424,-0.534864,0.066183,-0.452272,-0.073674,0.497239,0.128706,0.358055,...,0.136055,-0.042111,-0.23598,-0.016412,-0.035141,-0.032346,0.393198,-0.361641,-0.19168,4
2,0.470891,-0.706433,0.280104,-0.504282,0.323466,-0.730467,0.135459,0.460936,0.126155,0.348441,...,0.196887,0.016223,-0.576676,-0.036948,0.011567,0.001991,0.594631,-0.468152,-0.564023,6
3,0.294158,-0.747437,0.355648,-0.554944,0.25286,-0.670446,0.154853,0.584519,0.148885,0.36558,...,0.183379,-0.055441,-0.470433,0.056903,0.031841,0.031261,0.483978,-0.470758,-0.591772,1
4,0.287419,-0.676872,0.378734,-0.631298,-0.011777,-0.682109,0.091714,0.358397,0.000933,0.27537,...,0.126026,0.000288,-0.760246,0.158944,-0.014274,-0.035016,0.461706,-0.415457,-0.326376,5


In [64]:
# lets save it 
training_data.to_csv('./output/train_set.csv')

In [65]:
#lets check it 
temp = pd.read_csv('./output/train_set.csv')
temp.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,241,242,243,244,245,246,247,248,249,emotion
0,0,0.379913,-0.590954,0.445226,-0.372246,0.026415,-0.410711,0.023098,0.260745,-0.034857,...,0.025612,0.0793,-0.453153,-0.135339,-0.253434,-0.118945,0.256364,-0.166853,-0.26096,2
1,1,0.323991,-0.519815,0.377424,-0.534864,0.066183,-0.452272,-0.073674,0.497239,0.128706,...,0.136055,-0.042111,-0.23598,-0.016412,-0.035141,-0.032346,0.393198,-0.361641,-0.19168,4
2,2,0.470891,-0.706433,0.280104,-0.504282,0.323466,-0.730467,0.135459,0.460936,0.126155,...,0.196887,0.016223,-0.576676,-0.036948,0.011567,0.001991,0.594631,-0.468152,-0.564023,6
3,3,0.294158,-0.747437,0.355648,-0.554944,0.25286,-0.670446,0.154853,0.584519,0.148885,...,0.183379,-0.055441,-0.470433,0.056903,0.031841,0.031261,0.483978,-0.470758,-0.591772,1
4,4,0.287419,-0.676872,0.378734,-0.631298,-0.011777,-0.682109,0.091714,0.358397,0.000933,...,0.126026,0.000288,-0.760246,0.158944,-0.014274,-0.035016,0.461706,-0.415457,-0.326376,5


In [None]:
temp.drop('Unnamed: 0' , inplace=True , axis =  1)
temp.head()

# training set is done

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,emotion
0,0.379913,-0.590954,0.445226,-0.372246,0.026415,-0.410711,0.023098,0.260745,-0.034857,0.190579,...,0.025612,0.0793,-0.453153,-0.135339,-0.253434,-0.118945,0.256364,-0.166853,-0.26096,2
1,0.323991,-0.519815,0.377424,-0.534864,0.066183,-0.452272,-0.073674,0.497239,0.128706,0.358055,...,0.136055,-0.042111,-0.23598,-0.016412,-0.035141,-0.032346,0.393198,-0.361641,-0.19168,4
2,0.470891,-0.706433,0.280104,-0.504282,0.323466,-0.730467,0.135459,0.460936,0.126155,0.348441,...,0.196887,0.016223,-0.576676,-0.036948,0.011567,0.001991,0.594631,-0.468152,-0.564023,6
3,0.294158,-0.747437,0.355648,-0.554944,0.25286,-0.670446,0.154853,0.584519,0.148885,0.36558,...,0.183379,-0.055441,-0.470433,0.056903,0.031841,0.031261,0.483978,-0.470758,-0.591772,1
4,0.287419,-0.676872,0.378734,-0.631298,-0.011777,-0.682109,0.091714,0.358397,0.000933,0.27537,...,0.126026,0.000288,-0.760246,0.158944,-0.014274,-0.035016,0.461706,-0.415457,-0.326376,5


In [69]:
type(Y_train['emotion'])

pandas.core.series.Series

In [68]:
# lets create a model
from sklearn.naive_bayes import GaussianNB

# model object
model_nb = GaussianNB()

# train the model
model_nb.fit(X_train_new , Y_train['emotion'])

In [None]:
# lets do testing now
X_test # but X_test is purely text 

Unnamed: 0,Text
0,sometimes you feel all lonely and/or sad and f...
1,in the wake of recent events i've read a lot o...
2,"i was crossing the street, looked left and rig..."
3,preface: i’m a big fan of his work (and art in...
4,so last night i was at a little party and i de...
...,...
5879,so i’m a gay man. totally at ease with it. i l...
5880,i think about this sometimes during long thund...
5881,a prime example. i came out of welcome to racc...
5882,never ever thought that it was mark hamill tha...


In [71]:
# now lets use our model Word2Vec to convert text of X_test to vector

# because Word2Vec requires list of tokenized sentences
X_test_tokens = [sentence.split() for sentence in X_test['Text']]

# using function avg_word2vec to do the job , exactly like it did for training set
X_test_new = []
# applying avg_word2vec on entire sentence
for sentence in tqdm(X_test_tokens):
    X_test_new.append(avg_word2vec(sentence))

100%|██████████| 5884/5884 [03:47<00:00, 25.91it/s]


In [75]:
X_test_new[0]

array([ 0.22710732, -0.41303653,  0.49133152, -0.31259632, -0.18476617,
       -0.8872622 , -0.17258127,  0.12168251, -0.24634708, -0.1247347 ,
        0.29672343,  0.36078936, -0.16671893, -0.09409644,  0.12877515,
       -0.26745996,  0.49016753, -0.21245307,  0.09040833,  0.2812171 ,
        0.2891945 ,  0.18651561,  0.25615937,  0.3089376 ,  0.09201322,
        0.2603449 ,  0.13275301, -0.18867798,  0.15276839,  0.29056263,
        0.32302284, -0.4109478 ,  0.2583388 , -0.2388297 ,  0.56595767,
       -0.09781045,  0.1389403 ,  0.416598  ,  0.11626725, -0.06388639,
       -0.10223866,  0.2692013 ,  0.0720349 ,  0.19931905,  0.2018698 ,
       -0.25639495, -0.0343442 ,  0.5145468 ,  0.06320782,  0.02922744,
        0.22642133,  0.23809183, -0.17197163, -0.42150694,  0.27561507,
       -0.4487381 , -0.33957475, -0.6030782 ,  0.10871722,  0.1818879 ,
        0.12295569, -0.06855067,  0.37379932,  0.1486631 , -0.16836277,
       -0.00376937, -0.92815506, -0.09867527, -0.3230955 ,  0.10

In [76]:
# now lets predict
Y_pred = model_nb.predict(X_test_new)

In [77]:
from sklearn.metrics import classification_report
print(classification_report(Y_test['emotion'] , Y_pred))

              precision    recall  f1-score   support

           0       0.39      0.12      0.18       695
           1       0.27      0.22      0.24       792
           2       0.31      0.43      0.36       440
           3       0.42      0.25      0.32       812
           4       0.41      0.48      0.44       763
           5       0.30      0.36      0.33       458
           6       0.27      0.77      0.40       715
           7       0.36      0.12      0.18       692
           8       0.31      0.13      0.18       517

    accuracy                           0.32      5884
   macro avg       0.34      0.32      0.29      5884
weighted avg       0.34      0.32      0.29      5884



In [None]:
# lets save the word2vec model as well for later use , but make sure to create avg_word2vec function
# in that file as well , because have used avg_word2vec to train

word2vec_model.save("./output/word2vec_emotion.model")