In [1]:
# Read the raw training file into a list with one entry per line
# (each entry keeps its trailing newline).
with open('train.txt') as source_file:
    data = source_file.readlines()

In [2]:
# Split each raw line at its first space only, producing
# [text_before_first_space, rest_of_line] for every line.
labelled_data = [raw_line.split(' ', 1) for raw_line in data]

In [3]:
# Separate the [label, review] pairs into two parallel lists:
# x holds the review texts, y holds the matching labels.
x = [review for _label, review in labelled_data]
y = [label for label, _review in labelled_data]

In [4]:
from nltk.corpus import stopwords
# Stored as a set: cleanData tests every word of every review for membership,
# which is a hash lookup on a set but a linear scan on a list.
stopWords = set(stopwords.words('english'))

In [5]:
import re
def cleanData(sentence):
    """Normalise one review into a lower-cased, space-separated string.

    The input is converted with ``str()`` and lower-cased. Every character
    other than ASCII letters, digits and whitespace is deleted (periods
    included), and any word found in the module-level ``stopWords``
    collection is dropped.

    Args:
        sentence: The raw text; non-string values are stringified first.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = str(sentence).lower()
    # First pass keeps letters, digits, whitespace and periods only.
    text = re.sub(r'[^A-Za-z0-9\s.]', r'', lowered)
    # Newlines become spaces; periods are deleted outright (not replaced by
    # a space), so "u.s.a." collapses to "usa".
    text = re.sub(r'\n', r' ', text)
    text = re.sub(r'\.', r'', text)
    kept_words = filter(lambda token: token not in stopWords, text.split())
    return " ".join(kept_words)

In [6]:
# Run every review through cleanData. Note that this rebinds `data`: the raw
# file lines loaded earlier are replaced by the cleaned review texts.
data = list(map(cleanData, x))

In [7]:
import logging
import matplotlib
# Configure logging at INFO level with records formatted as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline

In [8]:
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [9]:
# Encode labels as integers: 'negative' -> 0, any other label -> 1.
# The encoding is derived from the untouched labels in `labelled_data` rather
# than from `y` itself, so re-running this cell gives the same result; mapping
# the already-encoded `y` a second time would turn every label into 1.
Y_data = [0 if label == 'negative' else 1 for label, _review in labelled_data]
y = Y_data

In [10]:
from sklearn.model_selection import train_test_split
# Split the cleaned reviews and their labels in half (test_size=0.5);
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    y,
    test_size=0.5,
    random_state=42,
)

In [11]:
from keras.preprocessing.text import Tokenizer

# The tokenizer is fitted on the training reviews only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace both text splits with their integer-sequence encodings.
X_train, X_test = [tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)]

# One more than the number of entries in the fitted word index.
vocab_size = 1 + len(tokenizer.word_index)

In [12]:
from keras.preprocessing.sequence import pad_sequences
maxlen = 100
# Pad both encoded splits with the same settings: padding added at the end
# ('post') and maxlen shared with the model's input_length.
X_train, X_test = [
    pad_sequences(encoded, padding='post', maxlen=maxlen)
    for encoded in (X_train, X_test)
]

In [13]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pre-trained word vectors.

    Each non-blank line of the file is expected to hold a word followed by
    its whitespace-separated vector components. For every word present in
    ``word_index``, the first ``embedding_dim`` components are written to the
    row given by that word's index; all other rows stay zero.

    Args:
        filepath: Path to the vectors file, read as UTF-8.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of vector components to keep per word.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    # One extra row so that an index equal to len(word_index) is a valid row
    # (assumes indices start at 1, leaving row 0 unused — TODO confirm for
    # word indexes that do not come from the Keras Tokenizer).
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # decode the non-ASCII tokens found in pre-trained vector files.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline): nothing to unpack.
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [14]:
embedding_dim = 50
# Build the embedding matrix for the tokenizer's word index from the
# 50-dimensional GloVe file.
embedding_matrix = create_embedding_matrix(
    filepath='./glove.6B/glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [15]:
# Fraction of embedding rows that contain at least one non-zero component,
# i.e. the share of rows that received a pre-trained vector.
nonzero_elements = np.count_nonzero(embedding_matrix.any(axis=1))
nonzero_elements / vocab_size

0.46066081166607425

In [102]:
# Architecture: frozen pre-trained embeddings -> LSTM -> 10-unit ReLU layer
# -> single sigmoid unit for the binary label.
model = Sequential([
    Embedding(vocab_size, embedding_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False),  # embedding weights are not updated in training
    LSTM(units=embedding_dim),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 50)           5767150   
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_7 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 5,787,871
Trainable params: 20,721
Non-trainable params: 5,767,150
_________________________________________________________________


In [104]:
from keras.callbacks import EarlyStopping
# Stop training once validation accuracy has gone 5 epochs (patience) without
# a qualifying improvement. min_delta is expressed in accuracy units (0-1):
# 0.001 lets a 0.1-percentage-point gain count as progress. A value of 0.1
# would demand a 10-point jump, which almost no epoch can deliver, so the
# callback would stop training while the model was still improving.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5, min_delta=0.001)

In [105]:
# Train for at most 10 epochs in batches of 64. X_test/Y_test are passed as
# the validation data, so the early-stopping callback `es` is driven by the
# same split that is used for prediction and evaluation further down.
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, Y_test),
    callbacks=[es],
    verbose=1,
)

Train on 50000 samples, validate on 50000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 00009: early stopping


In [106]:
# Score every padded test review; the model's final layer is a single sigmoid
# unit (output shape (None, 1)), so each row holds one value.
prediction = model.predict(X_test)

In [107]:
# The model output has shape (n_samples, 1). Flatten it once into plain Python
# floats instead of calling float() on each 1-element row, a conversion that
# NumPy >= 1.25 deprecates.
scores = np.asarray(prediction, dtype=float).ravel().tolist()

# p: scores of reviews whose true label is 1; n: scores of all other reviews.
p = [score for score, label in zip(scores, Y_test) if label == 1]
n = [score for score, label in zip(scores, Y_test) if label != 1]

# A positive is correct above 0.5, a negative is correct below 0.5
# (a score of exactly 0.5 counts for neither).
correct_pos = sum(1 for score in p if score > 0.5)
correct_neg = sum(1 for score in n if score < 0.5)

In [108]:
# Number of test reviews with true label 1, then the number of the rest.
print(len(p), len(n), sep='\n')

25820
24180


In [109]:
# Correctly classified positives, then correctly classified negatives.
print(correct_pos, correct_neg, sep='\n')

21999
20504


In [110]:
text = ['very informative and gives a good foundation of the subject']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

positive 0.977859


In [111]:
# Evaluate the trained model on the test split and keep the result in `accr`
# (presumably [loss, accuracy] given the metrics passed to compile — confirm).
accr = model.evaluate(X_test,Y_test)



In [114]:
text = ['this is the worst product i have purchased']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

negative 0.010526


In [115]:
text = ['this can be useful']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

positive 0.892258


In [116]:
text = ['i regret buying this product']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

negative 0.012135


In [120]:
text = ['very poor quality']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

negative 0.020627


In [123]:
text = ['the appearance is good']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

positive 0.623092


In [121]:
# Persist the trained model to an HDF5 file in the working directory.
model.save('my_model.h5')

In [124]:
import pickle

# Save the fitted tokenizer next to the model so the same word index can be
# reused to encode new text later. Only unpickle this file from a trusted
# source: loading a pickle can execute arbitrary code.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [128]:
text = ['i am not convinced for buying this product']
# Apply the same cleanData step the training reviews went through, so the
# model sees input prepared the same way as during training.
sequences = tokenizer.texts_to_sequences([cleanData(t) for t in text])
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
# The model output has shape (1, 1) for a single text; extract the scalar
# explicitly rather than relying on implicit array-to-scalar conversion.
score = float(model.predict(padded)[0][0])
if score > 0.5:
    print("positive %f" % score)
else:
    print("negative %f" % score)

negative 0.107059
