In [1]:
data = []
with open('train.ft.txt') as f:
    for line in f:
        data.append(line)

In [2]:
labelled_data = []
for line in data:
    labelled_data.append(line.split(' ',1))

In [3]:
x = []
y = []
for label,review in labelled_data:
    x.append(review)
    y.append(label)

In [4]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')

In [5]:
import re
def cleanData(sentence):
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    sentence = re.sub(r'\n',r' ',sentence)
    sentence = re.sub(r'\.',r'',sentence)
    sentence = " ".join([word for word in sentence.split() if word not in stopWords])    
    return sentence

In [6]:
data = []
for sentence in x:
    data.append(cleanData(sentence))

In [7]:
import logging
import matplotlib
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline

In [8]:
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [9]:
Y_data = []
for a in y:
    if a == 'negative':
        Y_data.append(0)
    else:
        Y_data.append(1)
y = Y_data

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data,y, test_size = 0.5, random_state = 42)

In [32]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1 

In [33]:
from keras.preprocessing.sequence import pad_sequences
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [34]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1 
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [35]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('./glove.6B/glove.6B.50d.txt',
                                           tokenizer.word_index, embedding_dim)

In [36]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.11402573894802728

In [37]:
from keras.layers.core import Dropout
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(LSTM(units=embedding_dim))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 50)           70935300  
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 70,956,021
Trainable params: 20,721
Non-trainable params: 70,935,300
_________________________________________________________________


In [38]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=1, min_delta=0.0001)

In [39]:
history = model.fit(X_train, Y_train,
                    epochs=100,
                    verbose=1,
                    validation_split=0.5,
                    batch_size=64,
                    callbacks=[es])

Train on 900000 samples, validate on 900000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping


In [40]:
text = ['i am not convinced for buying this product']
score = model.predict(pad_sequences(tokenizer.texts_to_sequences(text), padding='post', maxlen=maxlen))
if score>0.5:
    print("positive %f" %score)
else:
    print("negative %f" %score)

negative 0.056718


In [42]:
text = ['this is the best product in the market']
score = model.predict(pad_sequences(tokenizer.texts_to_sequences(text), padding='post', maxlen=maxlen))
if score>0.5:
    print("positive %f" %score)
else:
    print("negative %f" %score)

positive 0.785539


In [43]:
text = ['this movie is rubbish']
score = model.predict(pad_sequences(tokenizer.texts_to_sequences(text), padding='post', maxlen=maxlen))
if score>0.5:
    print("positive %f" %score)
else:
    print("negative %f" %score)

negative 0.004176


In [44]:
text = ['productive 3 hours']
score = model.predict(pad_sequences(tokenizer.texts_to_sequences(text), padding='post', maxlen=maxlen))
if score>0.5:
    print("positive %f" %score)
else:
    print("negative %f" %score)

positive 0.570631


In [48]:
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model1.add(LSTM(units=embedding_dim))
model1.add(Dropout(0.2))
model1.add(Dense(25, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 50)           70935300  
_________________________________________________________________
lstm_6 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 26        
Total params: 70,956,801
Trainable params: 21,501
Non-trainable params: 70,935,300
_________________________________________________________________


In [49]:
es1 = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=1, min_delta=0.00001)

In [50]:
history1 = model1.fit(X_train, Y_train,
                    epochs=100,
                    verbose=1,
                    validation_split=0.5,
                    batch_size=64,
                    callbacks=[es1])

Train on 900000 samples, validate on 900000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping


In [None]:
text = ['i am not convinced for buying this product']
score = model.predict(pad_sequences(tokenizer.texts_to_sequences(text), padding='post', maxlen=maxlen))
if score>0.7:
    print("positive %f" %(score)*100)
else:
    if score>0.4:
        print("neutral")
    else:
        print("positive %f" %(1-score))