In [2]:
import csv

import pandas as pd

# Load the labelled reviews from a tab-separated file.
# csv.QUOTE_NONE (== 3, the value that used to be hard-coded here) tells the
# parser to treat double quotes as ordinary characters, so the quote marks
# embedded in the review text are kept instead of being read as field quoting.
df = pd.read_csv("labeledTrainData.tsv", sep='\t', quoting=csv.QUOTE_NONE)

print(df.shape)

df.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# Preprocessing: strip HTML tags and quote characters from a review, then lowercase it
import re

def clean_review(text):
    """Return a normalised copy of a raw review string.

    Steps, in order:
      1. every HTML tag (``<...>``) is replaced by a single space;
      2. backslash-escaped quotes (``\\"``) are removed;
      3. any remaining double quotes are removed;
      4. the result is lower-cased.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased review.
    """
    cleaned = re.sub('<[^<]+?>', ' ', text)

    # Escaped quotes must go first so that their backslash is dropped along
    # with the quote; the bare-quote pass then removes whatever is left.
    for unwanted in ('\\"', '"'):
        cleaned = cleaned.replace(unwanted, '')

    return cleaned.lower()

In [4]:
# Run clean_review over every raw review and keep the result in a new column,
# leaving the original 'review' column untouched.
df['cleaned_review']= df['review'].apply(clean_review)

In [5]:
# Preview the first rows to compare 'review' with 'cleaned_review'.
df.head()

Unnamed: 0,id,sentiment,review,cleaned_review
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",with all this stuff going down at the moment w...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",the classic war of the worlds by timothy hines...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",the film starts with a manager (nicholas bell)...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",it must be assumed that those who praised this...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy and wondrously unpretentious 8...


In [6]:
# Count missing values per column.
df.isnull().sum()

id                0
sentiment         0
review            0
cleaned_review    0
dtype: int64

In [9]:
import numpy as np
import matplotlib.pyplot as plt

# Class balance: number of reviews for each sentiment label.
df['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
# Show the first cleaned review in full to eyeball the preprocessing result.
df['cleaned_review'].values[0]

"with all this stuff going down at the moment with mj i've started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mj's feeling towards the press and also the obvious message of drugs are bad m'kay.  visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.  the actual feature film bit when it finally starts is only on for 2

In [11]:
# Targets are the sentiment labels; inputs are the cleaned review strings.
# Both are pulled out as plain NumPy arrays for scikit-learn / Keras.
y = df['sentiment'].values
sentences = df['cleaned_review'].values

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for testing; the fixed random_state makes the
# split reproducible across runs.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

In [13]:
# Confirm the sizes of the train and test splits.
print(sentences_train.shape)
print(sentences_test.shape)

(18750,)
(6250,)


In [14]:
# The label array should have one entry per training review.
y_train.shape

(18750,)

In [16]:
# Word embedding with Keras: tokenize the reviews and pad them to a fixed length

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length that every encoded review is padded (or cut) to.
maxlen = 100

# Build the vocabulary from the training reviews only, so nothing from the
# test split leaks into the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Index 0 is reserved (it is the padding value), hence the extra slot.
vocab_size = 1 + len(tokenizer.word_index)

# Encode each split as lists of integer word indices.
X_train, X_test = [
    tokenizer.texts_to_sequences(split)
    for split in (sentences_train, sentences_test)
]

# Zero-pad at the end ('post') so every row has exactly `maxlen` entries.
X_train_padding, X_test_padding = [
    pad_sequences(encoded, padding='post', maxlen=maxlen)
    for encoded in (X_train, X_test)
]

X_train_padding

array([[   2, 4346,   55, ...,  900, 1791,    8],
       [  67, 2121,    1, ...,    8,  121,  124],
       [  98,  613,    4, ...,   28,    1,  130],
       ...,
       [ 595,   60,    4, ...,    0,    0,    0],
       [ 259,    1,  818, ...,  702,  458,  160],
       [   5, 1471,   33, ..., 2811,    5,  560]])

In [17]:
# Cleaned text of the first two training reviews, before tokenization
print(sentences_train[:2])

["what noise fails to do is get us to understand its character. tim robbins plays an obsessive new yorker who can't deal with the obtrusive noises of the city any longer, particularly car alarms. it's an odd idea for a film, which has about as much creative credibility as death wish. it is clever at points; particularly a scene in which our hero is trying to read through hagel, i'm too stupid to be understanding this. he reads and rereads a paragraph in confusion, we read it and don't get it either.just then a car alarm goes off. throughout the movie is constant interference of alarms and city noises. though, all in all it does little to help us understand our hero, who allows this all to ruin his marriage and gets distracted with side plots instead of digging deeper-into his persona.   the film-making itself is too oblivious to notice its own sound problems, shoddy editing, and visible boom mikes. no, noise isn't all-bad. william hurt is at least colorful. at least the ending doesn't 

In [18]:
# The same two reviews converted to sequences of integer word indices
print(X_train[:2])

[[47, 3336, 991, 5, 76, 6, 74, 175, 5, 380, 91, 105, 1662, 287, 31, 154, 35, 187, 882, 14, 1, 4, 1, 524, 98, 1219, 551, 490, 42, 31, 1014, 325, 15, 3, 18, 61, 44, 40, 13, 72, 1502, 2963, 13, 346, 633, 8, 6, 1091, 29, 767, 551, 3, 133, 7, 61, 262, 620, 6, 266, 5, 329, 139, 143, 94, 369, 5, 26, 1981, 10, 25, 4683, 2, 3, 7, 2964, 70, 329, 8, 2, 88, 74, 8, 347, 39, 90, 3, 490, 267, 121, 468, 1, 16, 6, 1768, 4, 2, 524, 148, 28, 7, 28, 8, 123, 113, 5, 340, 175, 380, 262, 620, 35, 2048, 10, 28, 5, 2570, 22, 1344, 2, 217, 14, 495, 1894, 304, 4, 2718, 80, 22, 3337, 1, 18, 226, 409, 6, 94, 5, 1462, 91, 202, 473, 701, 778, 2, 4346, 55, 3336, 215, 28, 75, 1056, 1470, 6, 29, 221, 3229, 29, 221, 1, 279, 150, 813, 1008, 448, 8, 2979, 337, 3, 211, 27, 21, 750, 237, 194, 4, 29, 221, 9, 1805, 148, 28, 7, 28, 234, 193, 8, 426, 24, 1067, 1, 60, 3273, 19, 2, 457, 5, 26, 407, 13, 501, 13, 8, 634, 10, 6, 155, 27, 4, 144, 97, 11, 21, 235, 63, 29, 3, 18, 1382, 17, 237, 506, 74, 1597, 52, 15, 4684, 783, 8, 41, 

In [19]:
# The same two reviews after pad_sequences: fixed length, zero-padded at the end
print(X_train_padding[:2])

[[   2 4346   55 3336  215   28   75 1056 1470    6   29  221 3229   29
   221    1  279  150  813 1008  448    8 2979  337    3  211   27   21
   750  237  194    4   29  221    9 1805  148   28    7   28  234  193
     8  426   24 1067    1   60 3273   19    2  457    5   26  407   13
   501   13    8  634   10    6  155   27    4  144   97   11   21  235
    63   29    3   18 1382   17  237  506   74 1597   52   15 4684  783
     8   41   19  278   43  330   62    5  291  565    7    1 1140  900
  1791    8]
 [  67 2121    1  114   12  642 1110  258   34   35   62  285   22  650
     7   10   16    2    1  555  135   69   15    3  359  349   16   51
    67  216    9  100   10   16  963    3  313   71    8   44 1990    8
     6    3   16   61   66   26  106   30    1  219  225  272   20    1
     4  367  231    8 1401   45  240 2541  135    3   16   40  115    2
     1  701   11   66 3804   50   21  137  419  127  225  416    1   16
     6   51   72   36    8   36   17    9  161  100

In [20]:
# Word embeddings from pre-trained GloVe vectors.
# Width of each embedding vector stored in the matrix.
embedding_dim = 50

# word -> integer index mapping learned by the tokenizer; index 0 is reserved,
# so the matrix needs one more row than there are words.
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

print("Vocabulary Size:",vocab_size)

# Spot-check: the integer index assigned to one known word.
word_index['hello']

Vocabulary Size: 78089


4874

In [21]:
# Create the embedding matrix of shape (vocab_size, embedding_dim): one row per
# word index and one column per embedding dimension, initialised to zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

In [22]:
# Sanity check: expect (vocab_size, embedding_dim).
embedding_matrix.shape

(78089, 50)

In [23]:
# First ten rows of the freshly allocated matrix (np.zeros, so all zeros).
embedding_matrix[:10]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [25]:
# Copy pre-trained GloVe vectors into embedding_matrix for every word that the
# tokenizer knows. Rows of words that do not occur in the file keep the value
# they already had.
#
# Each line of the file is "<word> <v1> <v2> ..." separated by whitespace.
#
# NOTE(review): the file name suggests 100-dimensional vectors, but only the
# first `embedding_dim` components of each vector are kept. Confirm that this
# truncation is intended; a GloVe file whose width matches `embedding_dim`
# may be the better fit.
#
# The file is opened in a `with` block so the handle is closed even if parsing
# fails part-way through (previously it was never closed).
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline): nothing to unpack.
            continue

        word, *vector = fields
        if word in word_index:
            idx = word_index[word]
            # Slice first so only the components that are kept get converted.
            embedding_matrix[idx] = np.asarray(vector[:embedding_dim], dtype=np.float32)

In [26]:
# Filling in rows must not change the matrix shape.
embedding_matrix.shape

(78089, 50)

In [27]:
# First ten rows after loading the GloVe vectors; row 0 belongs to the
# reserved index 0, which is not a key of word_index, so it is never assigned.
embedding_matrix[:10]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
         0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
         0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
        -0.20757   ,  0.33395001, -0.33848   , -0.

In [24]:
# Shape of the ten-row preview slice.
embedding_matrix[:10].shape

(10, 50)