In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
import gensim


In [3]:
from gensim.models import KeyedVectors

In [4]:
model = KeyedVectors.load_word2vec_format('../dataset/word2vec.6B.200d.txt')

In [5]:
model.most_similar(positive=['venom','women'] ,negative=['men'])

[('toxins', 0.48346906900405884),
 ('antivenom', 0.47847843170166016),
 ('neurotoxins', 0.473177969455719),
 ('venoms', 0.471465528011322),
 ('neurotoxic', 0.4678555130958557),
 ('venomous', 0.4646294414997101),
 ('symbiote', 0.4602070450782776),
 ('neurotoxin', 0.4597284197807312),
 ('poisonous', 0.4550253748893738),
 ('saliva', 0.4447315037250519)]

In [6]:
model.similar_by_word('escobar')

[('kelvim', 0.6507034301757812),
 ('medellin', 0.6469491720199585),
 ('pablo', 0.5724205374717712),
 ('yunel', 0.5493615865707397),
 ('guzman', 0.5385405421257019),
 ('beltran', 0.5345889329910278),
 ('fuentes', 0.5137689113616943),
 ('contreras', 0.5097950100898743),
 ('andres', 0.5078244209289551),
 ('kingpin', 0.5070193409919739)]

In [7]:
data = pd.read_csv('../dataset/imdb_labelled.txt' ,sep='\t',header = None, names=['Review','Sentiment'])
data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [8]:
data.shape

(748, 2)

In [9]:
data.drop_duplicates(inplace=True)

In [10]:
data.shape

(745, 2)

In [11]:
X= data['Review']
y= data['Sentiment']


In [14]:

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [19]:
X

0      A very, very, very slow-moving, aimless movie ...
1      Not sure who was more lost - the flat characte...
2      Attempting artiness with black & white and cle...
3           Very little music or anything to speak of.  
4      The best scene in the movie was when Gerardo i...
                             ...                        
743    I just got bored watching Jessice Lange take h...
744    Unfortunately, any virtue in this film's produ...
745                     In a word, it is embarrassing.  
746                                 Exceptionally bad!  
747    All in all its an insult to one's intelligence...
Name: Review, Length: 745, dtype: object

In [24]:
tokenizer = Tokenizer()



In [25]:
tokenizer.fit_on_texts(X)

In [48]:
vocab_size= len(tokenizer.word_counts.keys())

In [49]:
word_index = tokenizer.word_index

In [50]:
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'is': 5,
 'this': 6,
 'i': 7,
 'it': 8,
 'to': 9,
 'in': 10,
 'was': 11,
 'movie': 12,
 'film': 13,
 'that': 14,
 '0': 15,
 '1': 16,
 'for': 17,
 'as': 18,
 'but': 19,
 'with': 20,
 'one': 21,
 'on': 22,
 'you': 23,
 'are': 24,
 'not': 25,
 'bad': 26,
 "it's": 27,
 'very': 28,
 'all': 29,
 'just': 30,
 'so': 31,
 'good': 32,
 'at': 33,
 'an': 34,
 'be': 35,
 'there': 36,
 'about': 37,
 'have': 38,
 'by': 39,
 'like': 40,
 'from': 41,
 'if': 42,
 'acting': 43,
 'time': 44,
 'his': 45,
 'or': 46,
 'out': 47,
 'really': 48,
 'great': 49,
 'even': 50,
 'he': 51,
 'who': 52,
 'were': 53,
 'has': 54,
 'see': 55,
 'my': 56,
 'characters': 57,
 'well': 58,
 'most': 59,
 'how': 60,
 'more': 61,
 'no': 62,
 'only': 63,
 'when': 64,
 'ever': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 '10': 71,
 'they': 72,
 'best': 73,
 'because': 74,
 'your': 75,
 'can': 76,
 'also': 77,
 "don't": 78,
 'films': 79,
 'than': 80,
 'its': 81,
 

In [51]:
tokens= tokenizer.texts_to_sequences(X)

In [52]:
tokens[0]

[3, 28, 28, 28, 287, 407, 1216, 12, 37, 3, 1217, 1218, 408, 143]

In [53]:
sentence_length= [len(x) for x in tokens]

min(sentence_length),max(sentence_length)

(1, 1400)

In [54]:
from collections import Counter
Counter(sentence_length)

Counter({14: 23,
         18: 18,
         29: 7,
         8: 37,
         21: 21,
         20: 30,
         3: 24,
         15: 30,
         10: 30,
         6: 38,
         11: 47,
         4: 26,
         16: 34,
         25: 14,
         17: 21,
         872: 1,
         12: 42,
         5: 36,
         19: 21,
         24: 15,
         34: 8,
         7: 35,
         23: 12,
         9: 37,
         2: 12,
         13: 25,
         26: 6,
         1: 3,
         37: 2,
         22: 14,
         27: 8,
         35: 4,
         200: 1,
         1400: 1,
         45: 4,
         28: 7,
         302: 1,
         43: 2,
         31: 8,
         55: 1,
         44: 2,
         33: 10,
         36: 5,
         69: 1,
         57: 1,
         32: 3,
         30: 6,
         73: 1,
         47: 1,
         38: 3,
         39: 1,
         53: 1,
         51: 1,
         42: 2,
         802: 1})

In [55]:
padded_tokens = pad_sequences(tokens, maxlen=50)

In [56]:
embedding_index={}
f= open('../dataset/glove.6B.200d.txt', encoding='UTF-8')
for line in f:
    values = line.split()
    word= values[0]
    coef= np.asarray(values[1:], dtype= 'float32')
    embedding_index[word]= coef
f.close()

In [57]:
embedding_index['black']

array([ 2.5241e-01, -6.7137e-01, -5.2450e-01, -7.7121e-02,  6.2402e-01,
        2.5090e-01, -9.2188e-02,  2.2020e-01, -3.9541e-01, -1.6472e-01,
        2.7456e-01,  1.5186e-02, -2.2144e-02, -4.1250e-01,  7.9375e-01,
        1.6623e-01, -1.5474e-01, -3.5807e-02,  8.7393e-02,  7.9398e-01,
        1.3300e-01,  2.2412e+00, -6.6926e-01, -2.6821e-01,  3.0094e-01,
       -5.3784e-01, -4.3042e-01, -5.5305e-01, -6.2330e-01,  5.6579e-01,
        4.3139e-01, -1.7781e-01,  2.3002e-01,  6.6689e-01, -1.5991e-01,
        1.5221e-01, -8.3503e-02, -5.9459e-01,  3.5601e-01, -9.4554e-02,
       -2.3846e-01, -4.4948e-02, -1.3009e-02,  2.5178e-01, -4.1332e-01,
        8.1200e-02,  9.1443e-01, -4.7747e-01,  4.0658e-01,  2.5067e-01,
        4.2417e-01,  3.0244e-01, -4.9033e-02,  1.9272e-01,  5.0650e-01,
       -1.2527e-01, -7.7126e-01, -4.4833e-02,  4.7449e-02, -4.9527e-01,
       -2.1143e-01,  1.0104e-01, -5.8837e-01, -4.3875e-02, -3.0318e-01,
        1.2172e-01, -2.2696e-01,  8.2392e-01,  4.6801e-01, -3.53

In [58]:
embedding_matrix= np.zeros((len(word_index)+1,200))

In [59]:
embedding_matrix.shape

(3134, 200)

In [60]:
for word in word_index.items():
    word, i = word
    embedding_vector= embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i-1]= embedding_vector

In [64]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense,Dropout,Conv1D,MaxPool1D

In [65]:
from keras.optimizers import Adam

In [75]:
rnn= Sequential()
rnn.add(Embedding(input_dim=vocab_size+1,
                 output_dim=200,
                 input_length=50,
                 weights=[embedding_matrix],
                 trainable=False
                 ))
rnn.add(Conv1D(filters=50, kernel_size=3, activation='relu'))
rnn.add(MaxPool1D())
rnn.add(LSTM(units=50, activation='relu'))
rnn.add(Dense(units=20,activation='relu'))
rnn.add(Dense(units=1, activation='sigmoid'))

rnn.compile(loss='binary_crossentropy',optimizer= Adam(learning_rate=1e-5), metrics=['acc'])

In [76]:
rnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 200)           626800    
                                                                 
 conv1d_2 (Conv1D)           (None, 48, 50)            30050     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 24, 50)            0         
 g1D)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 50)                20200     
                                                                 
 dense_4 (Dense)             (None, 20)                1020      
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                      

In [78]:
rnn.fit(padded_tokens,y, epochs=80, validation_split=0.15)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80


Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.src.callbacks.History at 0x1dba4afe2c0>