### Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np

#### Methods for sentiment analysis

1.Logistic Regression or Naive Bayes

2.Simple RNNs or LSTM Neural Networks

3.Transformers and Bert

#### Encodings

1.CountVectorizer(Bag of words) and TFIDFtransformer

2.Word2Vec

3.Word Embedding

4.Bert Encodings

In [2]:
df = pd.read_csv('spam.csv',usecols = ['v1','v2'])

In [3]:
df.columns = ['cateogry','message']

In [4]:
df.head()

Unnamed: 0,cateogry,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['class'] = df['cateogry'].apply(lambda x:1 if x=='spam' else 0)

In [6]:
df.head()

Unnamed: 0,cateogry,message,class
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
df = df.drop('cateogry',axis=1)

In [8]:
df.head()

Unnamed: 0,message,class
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


##### Balancing the unbalanced dataset

In [9]:
df.groupby('class').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4825,4516,"Sorry, I'll call later",30
1,747,653,Please call our customer service representativ...,4


In [10]:
df[df['class']==0].count()

message    4825
class      4825
dtype: int64

In [11]:
df[df['class']==1].count()

message    747
class      747
dtype: int64

In [12]:
df_ham = df[df['class']==0]
df_spam = df[df['class']==1]

In [13]:
df_spam.shape

(747, 2)

In [14]:
df_ham_final = df_ham.sample(df_spam.shape[0])

In [15]:
df_ham_final.head()

Unnamed: 0,message,class
2174,See? I thought it all through,0
211,Home so we can always chat,0
5146,Oh unintentionally not bad timing. Great. Fing...,0
3119,Good evening! this is roger. How are you?,0
1669,Very hurting n meaningful lines ever: \I compr...,0


In [16]:
df_final = pd.concat([df_ham_final,df_spam])

In [17]:
df_final.head()

Unnamed: 0,message,class
2174,See? I thought it all through,0
211,Home so we can always chat,0
5146,Oh unintentionally not bad timing. Great. Fing...,0
3119,Good evening! this is roger. How are you?,0
1669,Very hurting n meaningful lines ever: \I compr...,0


##### Count Vectorizer and TFIDF

In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [19]:
cv = CountVectorizer()

In [20]:
text = ['ram is a good boy','ravi is a good boy','she is a good girl']

In [21]:
final = cv.fit_transform(text)

In [22]:
final.toarray()

array([[1, 0, 1, 1, 1, 0, 0],
       [1, 0, 1, 1, 0, 1, 0],
       [0, 1, 1, 1, 0, 0, 1]], dtype=int64)

In [23]:
cv.get_feature_names()

['boy', 'girl', 'good', 'is', 'ram', 'ravi', 'she']

In [24]:
test = 'she is not santa claus'
test_final = cv.transform([test])
print(test_final.toarray())

[[0 0 0 1 0 0 1]]


In [25]:
tf = TfidfTransformer()

In [26]:
tffinal = tf.fit_transform(final)

In [27]:
tffinal.toarray()

array([[0.50410689, 0.        , 0.39148397, 0.39148397, 0.66283998,
        0.        , 0.        ],
       [0.50410689, 0.        , 0.39148397, 0.39148397, 0.        ,
        0.66283998, 0.        ],
       [0.        , 0.6088451 , 0.35959372, 0.35959372, 0.        ,
        0.        , 0.6088451 ]])

In [28]:
tftest = tf.transform(test_final)

In [29]:
tftest.toarray()

array([[0.        , 0.        , 0.        , 0.50854232, 0.        ,
        0.        , 0.861037  ]])

##### Cleaning the Messages

In [30]:
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saibh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
stop_words = set(list(stopwords.words('english')))

In [32]:
from nltk import tokenize

In [33]:
message_sample = df_final.iloc[0]['message']

In [34]:
message_sample

'See? I thought it all through'

In [35]:
tokens = nltk.word_tokenize(message_sample)

In [36]:
tokens = [w for w in tokens if w not in stop_words]

In [37]:
tokens

['See', '?', 'I', 'thought']

In [38]:
import string

In [39]:
puncs = list(string.punctuation)

In [40]:
puncs

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [41]:
tokens = [w for w in tokens if w not in puncs]

In [42]:
tokens

['See', 'I', 'thought']

In [43]:
final_text = ' '.join(tokens)

In [44]:
final_text

'See I thought'

In [45]:
import re
def cleaning(col):
    tokens = nltk.word_tokenize(col)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [w for w in tokens if w not in puncs]
    tweet = ' '.join(tokens)
    #removing old style RT tweets
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    #remove hastags
    tweet = re.sub(r'#', '', tweet)
    #remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    #lower casing
    tweet = tweet.lower()
    #only to keep data from a-z
    tweet = re.sub(r'[^(a-zA-z)\s]','',tweet)
    return tweet    

In [46]:
df_final['clean_messages'] = df_final['message'].apply(cleaning)

In [47]:
df_final.head()

Unnamed: 0,message,class,clean_messages
2174,See? I thought it all through,0,see i thought
211,Home so we can always chat,0,home always chat
5146,Oh unintentionally not bad timing. Great. Fing...,0,oh unintentionally bad timing great fingers tr...
3119,Good evening! this is roger. How are you?,0,good evening roger how
1669,Very hurting n meaningful lines ever: \I compr...,0,very hurting n meaningful lines ever \i compro...


In [48]:
count_vect = CountVectorizer()
Tfidf_Trans = TfidfTransformer()

In [49]:
count_vect.fit(df_final['clean_messages'])
Final_messages = count_vect.transform(df_final['clean_messages'])

In [129]:
Final_messages.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [130]:
count_vect.get_feature_names()

['____',
 'aaooooright',
 'ab',
 'abdomen',
 'aberdeen',
 'abi',
 'ability',
 'able',
 'about',
 'abroad',
 'abt',
 'abta',
 'abuse',
 'abusers',
 'ac',
 'acc',
 'accept',
 'access',
 'accessible',
 'accident',
 'accidentally',
 'accommodation',
 'accommodationvouchers',
 'accordingly',
 'account',
 'ache',
 'acid',
 'aclpm',
 'aco',
 'across',
 'acsmsrewards',
 'actin',
 'action',
 'activ',
 'activate',
 'active',
 'actor',
 'actually',
 'acwicmbcktzr',
 'adam',
 'add',
 'addamsfa',
 'added',
 'addicted',
 'addie',
 'address',
 'admirer',
 'admission',
 'adore',
 'adp',
 'adrian',
 'ads',
 'adult',
 'advance',
 'adventuring',
 'advice',
 'advise',
 'advisors',
 'ae',
 'affection',
 'afraid',
 'african',
 'aft',
 'after',
 'afternoon',
 'ag',
 'age',
 'agent',
 'ageperwksub',
 'ageppermesssubscription',
 'ages',
 'agidhane',
 'ago',
 'agree',
 'agreen',
 'ah',
 'aha',
 'ahead',
 'ahmad',
 'ahthe',
 'ai',
 'aight',
 'aint',
 'airport',
 'airtel',
 'aiya',
 'aiyo',
 'aj',
 'akonlonely',


In [131]:
Final_context = Tfidf_Trans.fit_transform(Final_messages)

In [135]:
Final_context.toarray().shape

(1494, 3819)

##### Sampling the messages

In [136]:
df_final = df_final.drop('message',axis=1)

In [137]:
df_final.head()

Unnamed: 0,class,clean_messages
279,0,all done all handed celebrations full swing yet
3608,0,joy s father john then john ____ joy s father ...
1567,0,whatever im pretty pissed
1293,0,happy birthday may ur dreams come true
409,0,message text missing sender name missing numbe...


In [139]:
X_train,X_test,y_train,y_test = train_test_split(df_final['clean_messages'],df_final['class'],test_size=0.3,random_state=101)

In [143]:
CV = CountVectorizer()
TF = TfidfTransformer()

In [144]:
Final_cv = CV.fit_transform(X_train)

In [145]:
Final_tf = TF.fit_transform(Final_cv)

In [147]:
Final_tf.toarray().shape

(1045, 3071)

In [150]:
len(y_train)

1045

In [148]:
from sklearn.linear_model import LogisticRegression

In [149]:
lr = LogisticRegression()

In [151]:
lr.fit(Final_tf.toarray(),y_train)

LogisticRegression()

In [152]:
Final_cv_test = CV.transform(X_test)

In [153]:
Final_tf_test = TF.transform(Final_cv_test)

In [154]:
preds = lr.predict(Final_tf_test.toarray())

In [156]:
from sklearn.metrics import classification_report,confusion_matrix

In [158]:
print(confusion_matrix(preds,y_test))

[[220  20]
 [  6 203]]


In [159]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94       226
           1       0.97      0.91      0.94       223

    accuracy                           0.94       449
   macro avg       0.94      0.94      0.94       449
weighted avg       0.94      0.94      0.94       449



In [161]:
X_train[910]

'my love how come took long leave zaher s i got words ym happy see sad left i miss'

In [162]:
sample_cv = CV.transform([X_train[910]])

In [163]:
sample_tf = TF.transform(sample_cv)

In [164]:
lr.predict(sample_tf)

array([0], dtype=int64)

In [8]:
from sklearn.naive_bayes import BernoulliNB

##### BernouliNB is better for Binary classification in general

In [172]:
NBM = BernoulliNB()

In [173]:
NBM.fit(Final_tf.toarray(),y_train)

BernoulliNB()

In [174]:
predicts = NBM.predict(Final_tf_test.toarray())

In [175]:
predicts

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [176]:
print(confusion_matrix(predicts,y_test))

[[225  25]
 [  1 198]]


### RNN Method

##### Word2vec Embedding

In [187]:
sentences = ['an apple a day keeps the doctor away and keeps your healthy',
             'apple is looking to launch a new iphone this year']

In [188]:
import gensim

In [7]:
from gensim.models import Word2Vec



In [190]:
corpus = [gensim.utils.simple_preprocess(i) for i in sentences]

In [192]:
print(corpus)

[['an', 'apple', 'day', 'keeps', 'the', 'doctor', 'away', 'and', 'keeps', 'your', 'healthy'], ['apple', 'is', 'looking', 'to', 'launch', 'new', 'iphone', 'this', 'year']]


In [193]:
model = Word2Vec(window=5,min_count=1,workers=4)

In [194]:
model.build_vocab(corpus,progress_per=2)

In [195]:
model.epochs

5

In [196]:
model.corpus_count

2

In [197]:
model.train(sentences,epochs=model.epochs,total_examples=corpus.count)

(0, 540)

In [198]:
model.wv.key_to_index

{'apple': 0,
 'keeps': 1,
 'year': 2,
 'this': 3,
 'day': 4,
 'the': 5,
 'doctor': 6,
 'away': 7,
 'and': 8,
 'your': 9,
 'healthy': 10,
 'is': 11,
 'looking': 12,
 'to': 13,
 'launch': 14,
 'new': 15,
 'iphone': 16,
 'an': 17}

In [200]:
vector1 = model.wv.get_vector('iphone')

In [201]:
vector2 = model.wv.get_vector('apple')

In [202]:
vector3 = model.wv.get_vector('doctor')

In [203]:
from sklearn.metrics.pairwise import cosine_similarity

In [204]:
cosine_similarity([vector1],[vector2])

array([[0.21883951]], dtype=float32)

In [205]:
cosine_similarity([vector2],[vector3])

array([[0.01613472]], dtype=float32)

In [206]:
cosine_similarity([vector1],[vector3])

array([[0.16378775]], dtype=float32)

##### Embedding Layer Technique to get the Vectors

In [52]:
sentences = df_final['clean_messages']

In [53]:
sentences

2174                                        see i thought
211                                      home always chat
5146    oh unintentionally bad timing great fingers tr...
3119                               good evening roger how
1669    very hurting n meaningful lines ever \i compro...
                              ...                        
5537    want explicit sex  secs ring  costs pmin gsex ...
5540    asked mobile if  chatlines inclu in free mins ...
5547    had contract mobile  mnths latest motorola nok...
5566    reminder from o to get  pounds free call credi...
5567    this nd time tried  contact u u  pound prize  ...
Name: clean_messages, Length: 1494, dtype: object

In [54]:
from keras.preprocessing.text import one_hot
from keras.layers import Embedding
from keras.models import Sequential

In [55]:
len(sentences)

1494

In [60]:
vocab_size = 10000

In [67]:
maxi = 0

In [68]:
for i in sentences:
    tokens = nltk.word_tokenize(i)
    
    if maxi < len(tokens):
        maxi = len(tokens)

In [70]:
max_len = maxi

In [82]:
eg1 = ['boy is good','girl is good']
eg2 = ['good is glass','girl is also good']

In [83]:
one_hot_repr1 = [one_hot(i,50) for i in eg1]

In [84]:
one_hot_repr2 = [one_hot(i,50) for i in eg2]

In [85]:
one_hot_repr1

[[36, 23, 41], [2, 23, 41]]

In [86]:
one_hot_repr2

[[41, 23, 5], [2, 23, 4, 41]]

In [87]:
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good',]

In [88]:
one_hot_rep = [one_hot(i,50) for i in sent]

In [89]:
one_hot_rep

[[10, 5, 43, 49],
 [10, 5, 43, 15],
 [10, 49, 43, 22],
 [17, 49, 29, 41, 36],
 [17, 49, 29, 41, 47],
 [38, 10, 26, 43, 18],
 [20, 47, 37, 41]]

##### We cab see that for one hot_repr for a same vocab_size and in the same kernel the words are geting same token number

In [91]:
df_final['clean_messages'].count()

1494

In [93]:
df_final[df_final['class']==0].count()

message           747
class             747
clean_messages    747
dtype: int64

In [94]:
from sklearn.model_selection import train_test_split

In [95]:
X_train,X_test,y_train,y_test = train_test_split(df_final['clean_messages'],df_final['class'],test_size=0.3,random_state=101)

##### One_hot_repr of X_train

In [96]:
one_hot_repr_xtrain = [one_hot(i,vocab_size) for i in X_train]

In [113]:
len(one_hot_repr_xtrain)

1045

##### Pad the sentences

In [100]:
from keras.preprocessing.sequence import pad_sequences

In [101]:
max_len

82

In [102]:
padded_X_train = pad_sequences(one_hot_repr_xtrain,max_len)

In [103]:
padded_X_train[0]#consider it as a vector of dimension 82

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,   94, 2466, 1985,
       1220, 8374,  542, 1771, 2466, 7050, 1139, 7860, 7193, 1139, 8374,
       7555, 7860, 7969, 5300, 6048])

In [110]:
import numpy as np
np.array(padded_X_train).shape

(1045, 82)

In [107]:
from keras.layers import Embedding,SimpleRNN,Dense,Dropout,Input
from keras.models import Sequential,Model

In [137]:
model1 = Sequential()
model1.add(Embedding(vocab_size,50,input_length=max_len))
model1.compile('adam','mse')
embedded_vectors = model1.predict(padded_X_train)

In [162]:
embedded_vectors_eg = embedded_vectors.reshape(1045,50,82)

In [157]:
embedded_vectors.shape

(1045, 82, 50)

In [174]:
model = Sequential()
model.add(SimpleRNN(128,input_shape=(50,82),activation='relu',return_sequences=True))
model.add(SimpleRNN(128,activation='relu'))
model.add(Dense(32,'relu'))
model.add(Dense(1,'sigmoid'))

In [175]:
model.compile(loss='binary_crossentropy',metrics='accuracy',optimizer='adam')

In [176]:
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_18 (SimpleRNN)    (None, 50, 128)           27008     
_________________________________________________________________
simple_rnn_19 (SimpleRNN)    (None, 128)               32896     
_________________________________________________________________
dense_16 (Dense)             (None, 32)                4128      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 33        
Total params: 64,065
Trainable params: 64,065
Non-trainable params: 0
_________________________________________________________________


In [177]:
model.fit(embedded_vectors_eg,y_train,epochs=50,verbose=2)

Epoch 1/50
33/33 - 2s - loss: 0.6150 - accuracy: 0.6766
Epoch 2/50
33/33 - 1s - loss: 0.4858 - accuracy: 0.7952
Epoch 3/50
33/33 - 1s - loss: 0.4429 - accuracy: 0.8191
Epoch 4/50
33/33 - 1s - loss: 0.4048 - accuracy: 0.8316
Epoch 5/50
33/33 - 1s - loss: 0.3390 - accuracy: 0.8660
Epoch 6/50
33/33 - 1s - loss: 0.2672 - accuracy: 0.9062
Epoch 7/50
33/33 - 1s - loss: 0.2099 - accuracy: 0.9215
Epoch 8/50
33/33 - 1s - loss: 0.1296 - accuracy: 0.9598
Epoch 9/50
33/33 - 1s - loss: 0.1054 - accuracy: 0.9627
Epoch 10/50
33/33 - 1s - loss: 0.0863 - accuracy: 0.9751
Epoch 11/50
33/33 - 1s - loss: 0.0629 - accuracy: 0.9799
Epoch 12/50
33/33 - 1s - loss: 0.0561 - accuracy: 0.9837
Epoch 13/50
33/33 - 1s - loss: 0.0428 - accuracy: 0.9885
Epoch 14/50
33/33 - 1s - loss: 0.0532 - accuracy: 0.9847
Epoch 15/50
33/33 - 1s - loss: 0.0395 - accuracy: 0.9904
Epoch 16/50
33/33 - 1s - loss: 0.0449 - accuracy: 0.9837
Epoch 17/50
33/33 - 1s - loss: 0.0398 - accuracy: 0.9885
Epoch 18/50
33/33 - 1s - loss: 0.0200 - 

<keras.callbacks.History at 0x22a63297820>

In [180]:
one_hot_repr_xtest = [one_hot(i,vocab_size) for i in X_test]
padded_xtest = pad_sequences(one_hot_repr_xtest,82)

In [181]:
embedded_vectors_test = model1.predict(padded_xtest)

In [183]:
embedded_vectors_test_eg = embedded_vectors_test.reshape(embedded_vectors_test.shape[0],50,82)

In [185]:
predictions = model.predict(embedded_vectors_test_eg)

In [186]:
preds = []

In [187]:
for i in predictions:
    if i>0.5:
        preds.append(1)
    else:
        preds.append(0)

In [188]:
preds

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,


In [189]:
y_test

692     0
4724    0
159     1
5129    0
800     1
       ..
3585    1
1427    0
2577    0
2334    0
203     0
Name: class, Length: 449, dtype: int64

In [190]:
from sklearn.metrics import confusion_matrix

In [191]:
confusion_matrix(y_test,preds)

array([[190,  36],
       [ 34, 189]], dtype=int64)

In [192]:
model.evaluate(embedded_vectors_test_eg,y_test)



[1.1786072254180908, 0.8440979719161987]

#### BERT ENCODING AND SENTIMENT ANALYSIS IN ANOTHER NOTEBOOK