In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("C:/Users/TANNERU/Downloads/Dataset for coding/imdb_labelled.txt",delimiter = '\t',names = ['Reviews','Status'])

In [3]:
df

Unnamed: 0,Reviews,Status
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [4]:
df['Reviews'][74]

'I could not stand to even watch it for very long for fear of losing I.Q.  '

In [5]:
from collections import Counter
print(Counter(df['Status']))

Counter({1: 386, 0: 362})


In [6]:
df.isnull().sum()

Reviews    0
Status     0
dtype: int64

In [7]:
X = df['Reviews']
Y = df['Status']

In [8]:
X

0      A very, very, very slow-moving, aimless movie ...
1      Not sure who was more lost - the flat characte...
2      Attempting artiness with black & white and cle...
3           Very little music or anything to speak of.  
4      The best scene in the movie was when Gerardo i...
                             ...                        
743    I just got bored watching Jessice Lange take h...
744    Unfortunately, any virtue in this film's produ...
745                     In a word, it is embarrassing.  
746                                 Exceptionally bad!  
747    All in all its an insult to one's intelligence...
Name: Reviews, Length: 748, dtype: object

In [9]:
Y

0      0
1      0
2      0
3      0
4      1
      ..
743    0
744    0
745    0
746    0
747    0
Name: Status, Length: 748, dtype: int64

# Data Cleaning

In [10]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [11]:
ps = PorterStemmer()
lemma = WordNetLemmatizer()
corpus = []

In [12]:
for i in range(0,len(X)):
    review = re.sub('[^a-zA-Z]',' ',X[i])
    review = review.lower()
    review = review.split()
    #stemming
    #review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    #lemmatising
    review = [lemma.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    

In [13]:
corpus

['slow moving aimless movie distressed drifting young man',
 'sure lost flat character audience nearly half walked',
 'attempting artiness black white clever camera angle movie disappointed became even ridiculous acting poor plot line almost non existent',
 'little music anything speak',
 'best scene movie gerardo trying find song keep running head',
 'rest movie lack art charm meaning emptiness work guess empty',
 'wasted two hour',
 'saw movie today thought good effort good message kid',
 'bit predictable',
 'loved casting jimmy buffet science teacher',
 'baby owl adorable',
 'movie showed lot florida best made look appealing',
 'song best muppets hilarious',
 'cool',
 'right case movie delivers everything almost right face',
 'average acting main person low budget clearly see',
 'review long overdue since consider tale two sister single greatest film ever made',
 'put gem movie term screenplay cinematography acting post production editing directing aspect film making',
 'practically

In [14]:
len(corpus)

748

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [21]:
t = Tokenizer()
t.fit_on_texts(corpus)

In [22]:
t.word_index

{'movie': 1,
 'film': 2,
 'one': 3,
 'bad': 4,
 'character': 5,
 'good': 6,
 'like': 7,
 'time': 8,
 'acting': 9,
 'really': 10,
 'great': 11,
 'even': 12,
 'see': 13,
 'well': 14,
 'scene': 15,
 'story': 16,
 'make': 17,
 'ever': 18,
 'actor': 19,
 'plot': 20,
 'made': 21,
 'best': 22,
 'work': 23,
 'also': 24,
 'thing': 25,
 'script': 26,
 'way': 27,
 'would': 28,
 'love': 29,
 'seen': 30,
 'look': 31,
 'watching': 32,
 'think': 33,
 'real': 34,
 'could': 35,
 'every': 36,
 'much': 37,
 'get': 38,
 'show': 39,
 'line': 40,
 'funny': 41,
 'year': 42,
 'better': 43,
 'cast': 44,
 'performance': 45,
 'never': 46,
 'wonderful': 47,
 'little': 48,
 'give': 49,
 'watch': 50,
 'everything': 51,
 'go': 52,
 'excellent': 53,
 'anyone': 54,
 'part': 55,
 'totally': 56,
 'know': 57,
 'music': 58,
 'art': 59,
 'thought': 60,
 'waste': 61,
 'say': 62,
 'people': 63,
 'screen': 64,
 'life': 65,
 'nothing': 66,
 'stupid': 67,
 'director': 68,
 'awful': 69,
 'still': 70,
 'many': 71,
 'man': 72,
 't

In [23]:
len(t.word_index) #unique words in corpus

2720

In [24]:
vocab_size = len(t.word_index)+1

In [25]:
vocab_size

2721

In [28]:
#text to sequences converting words into integers
encoded_docs = t.texts_to_sequences(corpus)
print(encoded_docs)

[[194, 301, 1038, 1, 1039, 1040, 302, 72], [414, 303, 618, 5, 239, 1041, 195, 619], [1042, 1043, 100, 117, 240, 101, 415, 1, 196, 620, 12, 241, 9, 197, 20, 40, 118, 242, 1044], [48, 58, 135, 416], [22, 15, 1, 1045, 304, 89, 136, 198, 621, 305], [306, 1, 167, 59, 622, 307, 1046, 23, 243, 623], [168, 73, 119], [80, 1, 308, 60, 6, 624, 6, 1047, 120], [244, 102], [103, 199, 625, 1048, 1049, 1050], [1051, 1052, 309], [1, 417, 81, 1053, 22, 21, 31, 626], [136, 22, 1054, 200], [169], [74, 418, 1, 419, 51, 118, 74, 201], [420, 9, 421, 1055, 245, 170, 627, 13], [422, 137, 1056, 246, 310, 311, 73, 423, 247, 424, 2, 18, 21], [248, 425, 1, 628, 1057, 104, 9, 1058, 249, 202, 121, 426, 2, 203], [1059, 204, 312, 313, 427, 428, 313], [1060, 2, 314, 1061, 1062, 205, 138, 33, 2, 315, 1063, 429, 1064, 36, 82, 139, 1065, 206, 2, 90, 314, 1066, 1067, 2, 54, 74, 250, 629, 135, 1, 105, 106, 1068, 1069, 630, 138, 631, 430, 2, 1070, 207, 431, 251, 1071, 1072, 83, 632, 171, 633, 91, 252, 92, 2, 140, 432, 1073, 

# PADDING THE DOCS (to make every doc of same length)

In [29]:
max_length = 55

In [31]:
padded_docs = pad_sequences(encoded_docs,padding = 'post',maxlen = max_length)

In [32]:
padded_docs

array([[ 194,  301, 1038, ...,    0,    0,    0],
       [ 414,  303,  618, ...,    0,    0,    0],
       [1042, 1043,  100, ...,    0,    0,    0],
       ...,
       [ 139,  465,    0, ...,    0,    0,    0],
       [2720,    4,    0, ...,    0,    0,    0],
       [ 395,    3,  377, ...,    0,    0,    0]])

# load the whole embedding into memory


In [33]:
embedding_index = dict() #separating text and values into dictionary 

In [35]:
f = open("C:/Users/TANNERU/Downloads/glove.6B/glove.6B.100d.txt",encoding = "utf8")

In [36]:
for line in f:
    values = line.split()
    word = values[0] #words
    coefs = np.asarray(values[1:],dtype = 'float32') #vectors
    embedding_index[word] = coefs
f.close()    

In [37]:
values

['sandberger',
 '0.28365',
 '-0.6263',
 '-0.44351',
 '0.2177',
 '-0.087421',
 '-0.17062',
 '0.29266',
 '-0.024899',
 '0.26414',
 '-0.17023',
 '0.25817',
 '0.097484',
 '-0.33103',
 '-0.43859',
 '0.0095799',
 '0.095624',
 '-0.17777',
 '0.38886',
 '0.27151',
 '0.14742',
 '-0.43973',
 '-0.26588',
 '-0.024271',
 '0.27186',
 '-0.36761',
 '-0.24827',
 '-0.20815',
 '0.22128',
 '-0.044409',
 '0.021373',
 '0.24594',
 '0.26143',
 '0.29303',
 '0.13281',
 '0.082232',
 '-0.12869',
 '0.1622',
 '-0.22567',
 '-0.060348',
 '0.28703',
 '0.11381',
 '0.34839',
 '0.3419',
 '0.36996',
 '-0.13592',
 '0.0062694',
 '0.080317',
 '0.0036251',
 '0.43093',
 '0.01882',
 '0.31008',
 '0.16722',
 '0.074112',
 '-0.37745',
 '0.47363',
 '0.41284',
 '0.24471',
 '0.075965',
 '-0.51725',
 '-0.49481',
 '0.526',
 '-0.074645',
 '0.41434',
 '-0.1956',
 '-0.16544',
 '-0.045649',
 '-0.40153',
 '-0.13136',
 '-0.4672',
 '0.18825',
 '0.2612',
 '0.16854',
 '0.22615',
 '0.62992',
 '-0.1288',
 '0.055841',
 '0.01928',
 '0.024572',
 '0.46

In [38]:
word

'sandberger'

In [39]:
coefs

array([ 0.28365  , -0.6263   , -0.44351  ,  0.2177   , -0.087421 ,
       -0.17062  ,  0.29266  , -0.024899 ,  0.26414  , -0.17023  ,
        0.25817  ,  0.097484 , -0.33103  , -0.43859  ,  0.0095799,
        0.095624 , -0.17777  ,  0.38886  ,  0.27151  ,  0.14742  ,
       -0.43973  , -0.26588  , -0.024271 ,  0.27186  , -0.36761  ,
       -0.24827  , -0.20815  ,  0.22128  , -0.044409 ,  0.021373 ,
        0.24594  ,  0.26143  ,  0.29303  ,  0.13281  ,  0.082232 ,
       -0.12869  ,  0.1622   , -0.22567  , -0.060348 ,  0.28703  ,
        0.11381  ,  0.34839  ,  0.3419   ,  0.36996  , -0.13592  ,
        0.0062694,  0.080317 ,  0.0036251,  0.43093  ,  0.01882  ,
        0.31008  ,  0.16722  ,  0.074112 , -0.37745  ,  0.47363  ,
        0.41284  ,  0.24471  ,  0.075965 , -0.51725  , -0.49481  ,
        0.526    , -0.074645 ,  0.41434  , -0.1956   , -0.16544  ,
       -0.045649 , -0.40153  , -0.13136  , -0.4672   ,  0.18825  ,
        0.2612   ,  0.16854  ,  0.22615  ,  0.62992  , -0.1288

In [40]:
embedding_index

{'the': array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  

In [41]:
len(embedding_index)

400000

# create a weight matrix for words in training docs

In [50]:
embedded_matrix = np.zeros((vocab_size,100))
print(embedded_matrix) #initial weights
print(embedded_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2721, 100)


In [51]:
t.word_index.items()



# If input word(t.word_index)(unique words [2720]) is present in predefined vector(embedding_index[400000]) then particular wordvectors will be placed in embedded_vector 

In [47]:
for word,i in t.word_index.items():
    embedded_vector = embedding_index.get(word) #embedded_index have predefined words
    print("Embedded vector",i,embedded_vector) #only existed words in doc will be picked and converted into vectors
    if embedded_vector is not None:
        embedded_matrix[i] = embedded_vector #only existed words in doc will be entered 

Embedded vector 1 [ 0.38251    0.14821    0.60601   -0.51533    0.43992    0.061053
 -0.62716   -0.025385   0.1643    -0.22101    0.14423   -0.37213
 -0.21683   -0.08895    0.097904   0.6561     0.64455    0.47698
  0.83849    1.6486     0.88922   -0.1181    -0.012465  -0.52082
  0.77854    0.48723   -0.014991  -0.14127   -0.34747   -0.29595
  0.1028     0.57191   -0.045594   0.026443   0.53816    0.32257
  0.40788   -0.043599  -0.146     -0.48346    0.32036    0.55086
 -0.76259    0.43269    0.61753   -0.36503   -0.60599   -0.79615
  0.3929    -0.23668   -0.34719   -0.61201    0.54747    0.94812
  0.20941   -2.7771    -0.6022     0.8495     1.2549     0.017893
 -0.041901   2.1147    -0.026618  -0.28104    0.68124   -0.14165
  0.99249    0.49879   -0.67538    0.6417     0.42303   -0.27913
  0.063403   0.68909   -0.36183    0.053709  -0.16806    0.19422
 -0.47073   -0.14803   -0.58986   -0.2797     0.16792    0.10568
 -1.7601     0.0088254 -0.83326   -0.5836    -0.37079   -0.56591
  0.2

Embedded vector 126 [-0.044891   0.27732    0.65209   -0.47268   -0.3768     0.74647
 -0.66873   -0.80967   -0.07338   -0.21013    0.069419  -0.57066
 -0.30087   -0.17311    0.28795    0.67837   -0.17238    0.55991
 -0.19411    0.71985    0.59585   -0.12383   -0.17488   -0.6351
  0.69061    0.53664    0.88466    0.053806  -0.037558  -0.33147
  0.026341  -0.44602   -1.1884     0.58497   -0.21022   -0.079939
  0.4932    -0.31713    0.085024  -0.22133   -0.33963    0.93798
 -1.0101     0.041509   0.859      0.11375    0.16966   -0.2173
  0.13943   -0.0070975 -0.33197   -0.18644    0.55696    0.6065
 -0.0076038 -1.8607    -0.0086177  1.1067     0.41222    0.1936
  0.49864    2.0962    -0.024192  -0.92603    0.62935   -0.42453
  0.081083   0.4066    -0.50426    0.22209    0.12955   -0.016974
 -0.1011     0.86758   -0.55497    1.0867     0.064534   0.38904
 -0.49881    0.19787   -0.96866    0.88081    0.40482    0.19615
 -1.1986     0.038439  -1.056     -0.43695   -0.54982   -0.10576
  0.265

Embedded vector 321 [ 0.11027    0.12528    0.52154    0.25976   -0.28959   -0.71529
 -0.12052   -0.16175   -0.54635    0.54048   -0.38007   -0.65695
  0.34363   -0.21964   -0.28974    0.04981    0.75925    0.1586
  0.43551    0.67117   -0.75381   -0.35164    0.1692    -0.67015
 -0.080084   0.22322    0.11154   -0.34223   -0.87797   -0.10484
  0.011847   0.17752   -0.40326    0.21346    0.29324    0.35224
 -0.41742    0.51445   -0.57588   -0.53941    0.23461   -0.82719
 -0.3761    -0.16665    0.22844    0.14673   -0.031256  -0.75656
 -0.1011    -1.1077     0.38544   -0.26782   -0.62168    1.1614
  0.56836   -1.5594     0.18353    0.11435    1.84      -0.67406
  0.0077371  0.43357   -0.53177    0.17631    0.41748   -0.098924
  0.62146   -0.38584    0.80435   -0.32965    0.40265    0.53846
  0.78439    0.26544    1.135      0.51305   -0.30694    0.1659
 -0.17457   -0.26243    0.59419    0.19017   -0.55812   -0.036603
 -1.6606     0.16143    0.047633  -0.37594   -0.6824     0.20617
 -0.00

  0.04724   0.10526 ]
Embedded vector 491 [ 5.4129e-02  1.0953e-01  5.7768e-01 -2.2635e-01  4.6280e-02  2.4222e-01
  6.2429e-01 -1.5542e-01  1.5023e-01 -5.6989e-01 -3.5572e-01  2.8695e-01
  1.0965e-01 -5.1538e-01 -4.1871e-01 -6.7057e-01 -5.6798e-01  4.4960e-01
 -1.6837e-01 -4.0055e-01  5.8412e-01  3.2253e-01  2.9401e-02 -7.5277e-01
  1.9085e-01  3.5249e-02 -3.3146e-01 -2.7261e-01 -5.7252e-02 -4.2639e-01
  1.9678e-01  3.6660e-01 -2.9605e-01 -4.0085e-02  9.9479e-02  1.1189e-01
  6.7101e-02  1.4082e-01  8.8483e-02 -4.0371e-02 -1.0580e-01 -1.4088e-01
  1.7734e-01 -2.4913e-01 -2.0326e-01 -6.5332e-01  8.8327e-01  6.1646e-01
  4.9344e-01 -1.2595e+00  5.9360e-01 -4.7454e-01  4.3042e-01  8.5967e-01
  4.3625e-01 -2.2858e+00 -2.2247e-01 -1.7611e-01  4.6576e-01  6.7818e-01
 -3.9066e-01  9.8054e-01 -2.2480e-01 -1.8678e-01  2.9886e-01 -2.0195e-02
  4.0103e-01  1.0659e-01 -8.8463e-01 -4.5075e-01  1.6518e-01 -6.3500e-01
 -2.6219e-01 -3.4824e-01  8.9664e-02  5.3438e-02  3.2052e-02  2.1699e-01
 -4.2662e

Embedded vector 693 [-0.29839    0.0036163  0.81085    0.027314   0.2079    -0.088915
 -0.2722    -0.14464   -0.4439     0.1421    -0.040629  -0.1868
 -0.01937   -0.15503    0.62717   -0.029221   0.028586  -0.15557
  0.11862   -0.32617    1.0854     0.59032   -0.0052426  0.070958
  0.66495    0.16268    0.32607   -0.28314   -0.37968   -0.79787
 -0.28109    0.072966   0.12214    0.28593    0.6011    -0.62015
 -0.1144     0.29296    0.11114    0.016342  -0.097327  -0.74127
 -0.31452   -0.25591   -0.25934   -0.0056316  0.41145    0.59522
  0.12649    0.24614   -0.097224   0.34591   -0.28068    1.0127
  0.40494   -2.7599    -0.18073   -0.507      1.3143     0.10287
  0.14041    0.26808   -0.14361    0.39276    0.66687    0.041324
  0.37645   -0.018096  -0.34211    0.036635  -0.39831    0.52265
  0.10317   -0.4301    -0.066797  -0.42452   -0.090356   0.12108
  0.73621    0.22797   -0.083356   0.54682   -0.86379   -0.31559
 -0.5713    -0.51378   -0.047813   0.25538    0.26256    0.24619
  0.

Embedded vector 874 [ 0.0048804  0.31469    0.19325   -0.047789  -0.22479    0.43022
 -0.058098   0.55989    0.15602   -0.2351    -0.019022  -0.33303
  0.33849   -0.037939   0.38062    0.41314   -0.47685    0.11811
  0.3795     0.41485    0.83867    0.52113    0.18878   -1.0175
  0.35267   -0.27902    0.16301    0.25699    0.050566  -0.39142
 -0.38295    0.30774   -0.20742   -0.94649   -0.53036   -0.083107
 -0.47535   -0.049267   0.52523   -0.71562    0.45568    0.25534
  0.1766    -0.32326   -0.59447   -0.1981     0.11952    0.59059
 -0.18767   -0.1462     0.51313   -0.64755    0.50379    0.42786
 -0.01228   -1.7156     0.4071     0.36306    0.4309     0.26986
 -0.1769     0.68684   -0.09761    0.13779    0.89632   -0.36623
  0.48564   -0.14347    0.14562   -0.18208   -0.55907   -0.30611
 -0.20316   -0.26548    0.36645    0.43384   -0.0043283 -0.90248
  0.014436   0.076662  -0.24769   -0.21782    0.072502  -0.47112
 -0.81004   -0.08936    0.27808   -0.49214   -0.56781   -0.66936
 -0.4

 -0.42308  -0.4433  ]
Embedded vector 1021 [-0.25605    0.46474    0.47019   -1.5409    -1.0843    -0.16405
 -0.40833   -0.3322     0.40709    0.26608   -0.07474    0.19517
  0.24954    0.90794   -0.10848    0.18949   -0.62853    1.1102
 -0.23334    0.11761   -0.32188    0.16102   -0.04272   -0.20276
  0.08759    0.48129   -0.70305    0.96533   -0.076782  -0.40508
 -0.11438    0.042529   0.56612    0.14392   -0.64297    0.48263
 -0.18307   -0.21734    0.96683    0.010724   0.25101    0.03074
 -0.51376   -0.73234   -0.037585   0.33503    0.44824   -0.1469
 -0.74437   -0.13101   -0.053575  -0.13894   -0.35747    0.35817
 -1.2018    -0.30988    0.14689    0.38411   -0.57586   -0.58182
  0.037076   1.2219    -0.67244    0.39173   -0.20644    0.13477
  0.477     -0.50361    0.25416   -0.34319    0.38703   -0.75649
  0.18366   -0.25418   -0.87924    0.35672   -0.062137  -0.4093
 -0.40778    0.76071    0.36051   -0.45924   -0.023106   0.27301
 -0.14318   -0.43146    0.018417   0.0046851 -0.19

 -0.17315    0.34338    0.29758    0.94414  ]
Embedded vector 1133 [ 0.023712   0.3673     0.72884   -0.35047    0.029986   1.0109
 -0.098471   0.057833  -0.67245    0.65118   -0.64824   -0.043116
 -0.37841   -0.65556    0.39906   -0.038766  -0.23261    0.25221
  0.52486    0.44491   -0.012017  -0.68195   -0.31149   -0.48794
 -0.11066    0.38812    0.079288  -0.24138    0.30269   -0.20674
  0.17807    0.22173   -0.55944   -0.40908   -0.1326    -0.064945
  0.018746  -0.15021    0.021723  -0.048242   0.067057  -0.67632
  0.5378     0.025676   0.12692   -0.013018   0.86542   -0.1171
  0.27842    0.19131    0.0036589 -0.74676   -0.08144    0.1006
  0.53776   -0.86404   -0.18909   -0.11834    0.12811   -0.33631
  0.43861    0.41711   -0.2599    -0.15276    0.32699    0.36773
  1.0754    -0.46104    0.1172    -0.030852   1.166     -0.16864
 -0.67569   -0.40838   -0.19621    0.090125  -0.099404  -0.16442
  0.024844  -0.79123    0.16777    0.61687    0.6271     0.13203
 -0.82293   -0.85723   -

  0.38038   -0.45055    0.4831    -0.0066065]
Embedded vector 1248 [ 0.26582   0.031967  0.44848  -0.9018   -0.94648   0.34442  -0.29809
  0.291     0.52952  -0.087822  0.047245 -0.066301  0.20218   0.20225
  0.7133    1.0266   -0.55837   0.11656   0.47346   0.37375   0.6703
  0.29351  -0.024912  0.070651  0.57491  -0.12923  -0.31424   0.14851
 -0.052807  0.16968   0.27143   0.073211  0.58492  -0.49478  -0.46697
 -0.46938  -0.53379  -0.48424   0.62557   0.021208  0.08845  -0.18254
 -0.26787  -0.72378  -0.51485   0.40463   0.40184   0.87136  -0.33877
 -0.35486   0.05475  -0.45031   0.016127  0.82024  -0.053093 -1.6258
  0.53884   0.91028   0.24317  -0.4293   -0.45226   0.719    -0.77261
 -0.63545   0.78705  -0.58804   0.65244  -0.2961   -0.59294  -0.48271
  0.20754  -0.1759   -0.29636   0.41624   0.17611   0.34145  -0.49378
  0.3136   -0.45327   0.28334  -0.039159  0.07691   0.39651   0.73837
 -0.91206  -0.77813  -0.76238  -0.032103 -0.13293  -0.36559  -0.10129
  0.12534  -0.28914   0.0

  0.090484  -0.91054   -0.28856   -0.54336  ]
Embedded vector 1414 [ 1.9401e-01  4.6954e-01  2.1925e-01  1.0986e-01  3.5679e-02  1.9633e-01
  1.5170e-01 -4.1302e-01  1.5006e-01  4.3661e-01 -7.4730e-01  4.5474e-01
  1.0219e-01 -3.0495e-01  1.9190e-01  5.0471e-01 -3.7685e-01 -3.8382e-01
  4.8297e-01  5.7597e-02 -2.7579e-01 -1.8540e-02 -6.9103e-01  3.7660e-02
 -1.6396e-01  1.9373e-01 -1.8677e-01  5.2174e-01 -1.2614e-02 -3.3172e-02
 -2.4661e-01 -2.5671e-01  1.7613e-01 -3.0025e-01 -1.4400e-01  3.8205e-01
 -1.5879e-01 -4.8437e-01 -1.1099e-01  1.1489e-01  2.5197e-01  5.1129e-01
  1.0066e-03  1.3069e-01  1.2413e-02 -3.8907e-02  3.0224e-01  3.4033e-02
 -8.9246e-02  5.8360e-02  6.0731e-02 -4.8427e-04  3.0538e-01 -1.4893e-01
  1.5501e-01  5.3806e-01 -9.4865e-02  1.3861e-01 -5.5510e-01 -2.3092e-01
  1.3667e-01  2.3200e-03  2.4906e-01  1.1881e-01 -2.5849e-01 -2.5784e-01
 -5.1966e-02 -1.9138e-01 -3.6100e-02 -1.3585e-01 -3.1332e-01  3.2018e-01
  3.7551e-01  3.8078e-01 -4.0996e-01  3.1485e-01  1.9463e

Embedded vector 1667 [ 3.2869e-01 -3.1843e-01 -4.6024e-01 -2.6453e-01 -5.4615e-01  7.4404e-01
 -3.4619e-01 -5.7176e-02  2.3178e-01  8.8329e-01 -6.2536e-02 -2.2014e-01
  1.4539e+00  6.4922e-01 -3.6561e-01  9.3004e-01 -3.9273e-01  1.1654e-01
  3.8284e-01 -3.3916e-01  4.1132e-02 -4.8270e-01 -3.4427e-01  6.8924e-02
  1.9440e+00  6.3818e-01 -8.1099e-01  8.3885e-02  4.4708e-01 -9.5219e-02
  9.1653e-02  1.6104e-01 -1.6062e-01 -4.6006e-02  3.0909e-01 -3.7644e-01
 -3.1162e-01  1.7984e-01 -1.0469e-02  1.6810e-01  6.8183e-02  5.7414e-01
  1.2258e+00 -3.8323e-01 -6.6894e-01  3.7299e-01  6.9312e-01  6.4779e-01
  6.4157e-01  1.4804e-01 -1.6723e-01  3.9143e-01 -3.8550e-01  1.3943e-01
  4.9601e-02 -3.0167e-01  6.8126e-01 -6.2890e-01 -5.2278e-01  5.9679e-01
  2.0203e-01  4.4787e-01 -3.1534e-01  4.2093e-01 -5.0498e-01  4.4137e-01
  1.9467e-01 -3.1089e-01  2.8824e-01 -7.2757e-01 -8.2057e-01  1.1398e+00
  3.4194e-01  6.2226e-02 -1.5929e-01 -7.5075e-01 -6.9355e-01  2.5918e-01
  2.6947e-01  4.3655e-01  3.44

Embedded vector 1905 [-0.67435    0.30851    0.45354    0.078017   0.45661    0.073945
  0.9435     0.28823   -0.14282   -0.39265   -0.51403   -0.11492
 -0.40697   -0.59128    0.17426   -0.69891    0.3679     0.098312
  0.087608   0.049617   0.1975    -0.16867    0.17453    0.0088781
 -1.0117     0.39352   -0.52413    0.02272   -0.17446   -0.55443
  0.83609    0.46363    0.66021    0.75501   -0.44867    0.41506
  0.071644  -0.11332    0.33355    1.073     -1.2838    -0.78193
  0.24192   -0.17224   -1.0407    -1.2598    -0.27248   -0.51911
 -0.74108   -0.88163    0.24795   -0.23777    0.71768    0.74401
  0.4019    -1.652      0.62756    0.36887    0.15295   -0.042849
 -0.65346    0.061075   0.23896   -0.25254    0.37654   -0.034277
  0.81741    0.6388    -0.42865    0.58194    0.077566   0.19351
 -0.34079   -0.14713   -0.020563  -0.13068   -0.50105   -0.046643
 -0.37523    0.24229    0.18583   -0.19054   -0.062925  -0.89412
 -1.3602    -0.67006   -0.35967    0.40323   -0.036327  -0.960

Embedded vector 2096 [ 0.34848  -0.91393  -0.87709   0.36184   0.56103   0.62343   0.046943
 -0.27478  -0.066835  0.8279    0.093259  0.56816  -0.58018   0.41772
  0.018743  0.25433   0.10109   0.35892  -1.3218    0.056968  0.70941
 -0.62792   0.31457  -0.3523   -0.034204  0.013654 -0.55617  -0.29561
  0.72949   0.13666  -0.16627   0.39181  -0.2643    0.034605  0.23235
 -0.4351    0.12392  -0.32563   0.30615   0.49333  -0.97071   0.17673
  0.21228  -0.39371  -0.22386   0.34595   0.48994  -0.57737   0.34368
 -0.312    -0.20262   0.41244   0.15661   1.2406    0.29615  -1.4512
 -0.52783  -0.31541   1.0313    0.37588  -0.59225  -0.088217 -0.16681
  0.1447    0.63221  -0.51173   0.016224  0.054931 -0.42148   0.070903
 -0.70247  -0.52958  -0.37104  -0.14425  -0.5549   -0.10972  -0.15379
  0.34237  -0.6642    0.55252   0.35066   0.57869  -0.12892   0.013971
 -1.2053   -0.7456    0.50891   0.038668 -0.18103  -0.016458  0.12049
 -0.21103  -0.26155  -0.47164   0.01758  -0.29169  -0.357     0.067

Embedded vector 2405 [-0.10456    0.29392    0.20386   -0.038821  -0.25374    0.73236
 -0.82674    0.31283   -0.44945    1.4324    -0.038012  -0.19858
 -0.0087159 -0.44044    0.19262    0.3668     0.14362   -0.36981
 -0.056881  -0.38099    0.16392   -0.17379    0.020145   0.031018
  0.1078    -0.13528   -0.19367   -0.47262   -1.0204    -0.14485
 -0.13602   -0.32875   -0.33129   -0.54985   -0.18565    1.0956
  0.071301  -0.23533   -0.31139   -0.31143    0.15521    0.22784
  0.65086    1.2724    -0.15606   -0.17818    0.85647    0.51823
  0.17358   -0.055951   0.16579   -0.42136    0.76095    0.11957
 -0.40353   -0.98162    0.89553    0.071411   0.071063  -0.45525
  0.38355   -0.0025622 -0.21588   -0.54569   -0.039184  -0.76807
 -0.042425  -0.77981   -0.34752   -0.79868    0.60912   -0.36916
 -0.27082   -0.21596   -0.23921   -0.45657   -0.72513   -0.07795
  0.55874   -0.24733    0.070023   0.27317   -0.02261    0.47785
 -0.16652    0.34162    0.37845   -0.02944   -0.59701   -0.49507
  1.

Embedded vector 2571 [-0.46835    0.70448    0.28054    0.78727   -0.085869  -0.20079
  0.13266   -0.23019   -0.19455    0.5963     0.11      -0.79626
  0.080527  -0.35998    0.70625    1.1541    -0.19584    1.2139
  0.7855    -0.48456    0.15364    0.16442    0.064267  -0.47492
  0.67388   -0.33961    0.81477    0.46296   -0.55125   -0.20545
  0.58912   -0.13096   -1.4764    -0.3066     0.86716   -0.062808
 -0.32913    0.4994     0.43517   -0.23186    0.53149   -0.3394
 -0.27513    0.16531   -0.54861    0.84449   -0.32845    0.16107
 -0.025535   0.82801   -0.12147   -0.67306    0.47766    0.60095
 -0.030988  -1.2335     0.98165    0.23801    0.98319   -0.044114
 -0.67188    0.36631   -0.58193   -0.090687   0.52437    0.80685
  0.7972     0.37681   -0.25478   -0.059398   0.38467    0.0037173
  0.58755   -0.1697     0.6769    -0.38753    0.18227   -0.35825
  0.28123   -0.036958   0.052868   0.13458   -0.21264    0.051492
 -0.72164    0.76069    0.19711   -0.2711     0.031477  -0.61006
 

In [49]:
embedded_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.38251001,  0.14821   ,  0.60601002, ...,  0.058921  ,
         0.091112  ,  0.47283   ],
       [ 0.19915999, -0.049702  ,  0.24579   , ..., -0.068109  ,
         0.017651  ,  0.06455   ],
       ...,
       [ 0.26668999, -0.35774001,  0.66610003, ..., -1.22249997,
         0.12735   ,  0.28698   ],
       [-0.67141002, -0.51441002,  0.55374998, ..., -0.72223997,
        -0.71516001,  0.25769001],
       [-0.77494001, -0.057527  , -0.23492   , ..., -0.50869   ,
        -0.75295001, -0.31467   ]])

In [52]:
print(embedded_matrix.shape)

(2721, 100)


# Building model

In [182]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SimpleRNN
from keras.layers import Dropout
from keras.layers.embeddings import Embedding

In [183]:
embedded_dim = 100

# SIMPLE RNN

In [222]:
model = Sequential()
model.add(Embedding(vocab_size,embedded_dim,input_length = max_length))
#model.add(Embedding(vocab_size,embedded_dim,weights = [embedded_matrix],input_length = max_length,trainable = False))
model.add(Dropout(0.3))
model.add(SimpleRNN(100,activation="tanh"))
model.add(Dropout(0.3))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [223]:
model.summary()

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, 55, 100)           272100    
_________________________________________________________________
dropout_60 (Dropout)         (None, 55, 100)           0         
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dropout_61 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 101       
Total params: 292,301
Trainable params: 292,301
Non-trainable params: 0
_________________________________________________________________


In [224]:
from keras.callbacks import ModelCheckpoint

In [225]:
checkpoint = ModelCheckpoint('C:/Users/TANNERU/Downloads/Checkpoints/imdb_checkpoint.h5', verbose=1, monitor='val_acc',save_best_only=True,save_weights_only =True, mode='auto')

In [226]:
inputs = np.array(padded_docs)
Y = np.array(Y)

In [230]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest = train_test_split(inputs,Y,test_size = 0.25,random_state = 42) 

In [231]:
print(Xtrain.shape)
print(Xtest.shape)
print(Ytrain.shape)
print(Ytest.shape)

(561, 55)
(187, 55)
(561,)
(187,)


In [232]:
glove_history = model.fit(Xtrain,Ytrain,validation_data = (Xtest,Ytest),epochs = 10,batch_size = 32,callbacks = [checkpoint])

Train on 561 samples, validate on 187 samples
Epoch 1/10

Epoch 00001: val_acc improved from 0.55080 to 0.84492, saving model to C:/Users/TANNERU/Downloads/Checkpoints/imdb_checkpoint.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.84492 to 0.87701, saving model to C:/Users/TANNERU/Downloads/Checkpoints/imdb_checkpoint.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.87701 to 0.90374, saving model to C:/Users/TANNERU/Downloads/Checkpoints/imdb_checkpoint.h5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.90374
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.90374
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.90374
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.90374
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.90374
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.90374
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.90374


# LSTM

In [289]:


model1 = Sequential()
model1.add(Embedding(vocab_size,embedded_dim,input_length = max_length))
#model.add(Embedding(vocab_size,embedded_dim,weights = [embedded_matrix],input_length = max_length,trainable = False))
#model.add(Dropout(0.3))
#1st layer
model1.add(LSTM(50,activation = 'tanh',kernel_initializer="glorot_uniform",return_sequences = True))
#model.add(Dropout(0.3))
#2nd layer
model1.add(LSTM(50,activation = 'tanh',kernel_initializer="glorot_uniform",return_sequences = True))
#model.add(Dropout(0.3))
#3rd layer
model1.add(LSTM(50,activation = 'tanh',kernel_initializer="glorot_uniform",return_sequences = False))
#model.add(Dropout(0.3))
model1.add(Dense(1,activation = 'sigmoid'))
model1.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [290]:
history = model1.fit(Xtrain,Ytrain,validation_data = (Xtest,Ytest),epochs = 10,batch_size = 32)

Train on 561 samples, validate on 187 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# GRU

In [200]:
from keras.layers import GRU

In [292]:
model2 = Sequential()
model2.add(Embedding(vocab_size,embedded_dim,input_length = max_length))
#model2.add(Embedding(vocab_size,embedded_dim,weights = [embedded_matrix],input_length = max_length,trainable = False))
#model2.add(Dropout(0.3))
#1st layer 
#model2.add(GRU(50,activation = 'tanh',kernel_initializer="glorot_uniform",return_sequences=True))
#model2.add(Dropout(0.3))
#2nd layer
#model2.add(GRU(50,activation="tanh",kernel_initializer="glorot_uniform",return_sequences=True))
#model2.add(Dropout(0.3))
#3rd layer
model2.add(GRU(100,activation = 'tanh',kernel_initializer="glorot_uniform",return_sequences=False))
#model2.add(Dropout(0.3))
model2.add(Dense(1,activation = 'sigmoid'))
model2.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [293]:
gru = model2.fit(Xtrain,Ytrain,validation_data = (Xtest,Ytest),epochs = 10,batch_size = 32)

Train on 561 samples, validate on 187 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
