In [None]:
import bz2
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
#Load normalized data


def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # pickle.load can execute arbitrary code; only point this at files you trust.
    with open(path, 'rb') as source:
        return pickle.load(source)


# One pickle per split, read from /content/drive/MyDrive into the names used downstream.
train_texts = _read_pickle('/content/drive/MyDrive/train_texts')
train_labels = _read_pickle('/content/drive/MyDrive/train_labels')
val_texts = _read_pickle('/content/drive/MyDrive/val_texts')
val_labels = _read_pickle('/content/drive/MyDrive/val_labels')


In [None]:
#Define parameters

# vocab_size is later passed to the Embedding layer as its input size.
# oov_tok is not referenced again in the visible cells; the first tokenizer
# vocabulary entry displayed further down is the same '<OOV>' string.
vocab_size, oov_tok = 50_000, "<OOV>"


In [None]:
#Load tokenizer

# Restore the tokenizer object stored in tokenizer50k.pickle on Drive.
# pickle.load can run arbitrary code, so only load files from a trusted source.
with open('/content/drive/MyDrive/tokenizer50k.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Disabled exploration: the statements below sit inside a bare triple-quoted
# string, so none of them run and vocab_size keeps the value assigned in the
# parameters cell (if enabled, this would overwrite it with len(word_index)).
# NOTE(review): the number stored under this cell presumably comes from an
# earlier run when the code was live -- confirm, or clear the stale output.
'''
word_index = tokenizer.word_index
vocab_size=len(word_index)
vocab_size
'''

789951

In [None]:
# Disabled exploration: everything below is inside a bare triple-quoted string
# and therefore never executes. If enabled, it would overwrite vocab_size with
# len(word_index) + 1 and display the last key of word_index.
# NOTE(review): the number stored under this cell presumably comes from an
# earlier run when the code was live -- confirm, or clear the stale output.
'''
word_index = tokenizer.word_index
vocab_size=len(word_index)
vocab_size+=1
vocab_size
[list(word_index)[-1]]
'''

789952

In [None]:
# Show the first vocabulary entry (wrapped in a list, as before) without
# copying the entire word index into a temporary list just to read element 0.
[next(iter(tokenizer.word_index))]

['<OOV>']

In [None]:
# Sanity check: encode one unusual word and display the resulting id sequence
# (the stored output for this cell shows [[1]]).
probe_texts = ['omkarsawant']
tokenizer.texts_to_sequences(probe_texts)

[[1]]

In [None]:
#Tokenize train and val sentences

# Both names are rebound from the loaded texts to token-id sequences, so this
# cell should not be run a second time without reloading the splits first.
encode = tokenizer.texts_to_sequences
train_texts = encode(train_texts)
val_texts = encode(val_texts)

In [None]:
#Define parameters

trunc_type = 'post'
# Length of the longest tokenized training example; used as the common
# sequence length for every split and for the model input.
max_length = max(map(len, train_texts))
max_length

257

In [None]:
#Pad train and val sentences to get consistent length

# Shared settings for both splits. The padding side is not passed, so
# pad_sequences falls back to its own default for that.
pad_options = dict(maxlen=max_length, truncating=trunc_type)
train_texts = pad_sequences(train_texts, **pad_options)
val_texts = pad_sequences(val_texts, **pad_options)

In [None]:
#Define parameters

embedding_dim = 50
print(max_length,vocab_size, embedding_dim)

# Guard: the Embedding layer below only has rows for ids 0 .. vocab_size - 1.
# The stored outputs show the tokenizer knows far more words than vocab_size,
# so verify that the encoded data really stays inside that range before
# building the model, instead of relying on the tokenizer's own word cap.
largest_token_id = max(np.max(train_texts), np.max(val_texts))
assert largest_token_id < vocab_size, (
    f"token id {largest_token_id} does not fit an Embedding with "
    f"vocab_size={vocab_size}; raise vocab_size or cap the tokenizer vocabulary"
)


#Define model

lstm_model = tf.keras.Sequential([
    # Token ids -> dense vectors of size embedding_dim, for sequences of max_length ids.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.Dropout(0.3),
    # The LSTM reduces each sequence to one 128-dimensional vector.
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dropout(0.3),
    # Single sigmoid unit: one score in [0, 1] per example, matching the binary cross-entropy loss.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
lstm_model.summary()


257 50000 50
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 257, 50)           2500000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 257, 50)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 2,591,777
Trainable params: 2,591,777
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Fit model

# One pass over the training split in batches of 128; the validation split is
# handed to Keras so it is scored alongside training.
validation_split = (val_texts, val_labels)
history = lstm_model.fit(
    x=train_texts,
    y=train_labels,
    epochs=1,
    batch_size=128,
    validation_data=validation_split,
)



In [None]:
#Save model to drive

# Write the trained model to a single file in the Drive folder.
model_path = '/content/drive/MyDrive/lstm_model.hdf5'
lstm_model.save(model_path)

In [None]:
#Test a user defined sentence

# Encode and pad the sentence exactly like the training data, then score it.
sample = pad_sequences(
    tokenizer.texts_to_sequences(['vary gud']),
    maxlen=max_length,
    truncating=trunc_type,
)

lstm_model.predict(sample)

array([[0.62229043]], dtype=float32)

In [None]:
#Evaluate on test data

# Read the held-out texts and labels from their pickles on Drive.
# pickle.load can run arbitrary code, so only load files from a trusted source.
with open('/content/drive/MyDrive/test_texts', 'rb') as texts_file:
    test_texts = pickle.load(texts_file)

with open('/content/drive/MyDrive/test_labels', 'rb') as labels_file:
    test_labels = pickle.load(labels_file)

In [None]:
#Tokenize test sentences

# Encode with the same tokenizer used for the train and validation splits.
# test_texts is rebound in place, so do not re-run this cell without reloading it.
encode_texts = tokenizer.texts_to_sequences
test_texts = encode_texts(test_texts)

In [None]:
#Pad test sentences to get consistent length

# Same maxlen and truncation settings as the train and validation splits, so
# the test inputs have the sequence length the model was built for.
test_texts = pad_sequences(
    test_texts,
    truncating=trunc_type,
    maxlen=max_length,
)

In [None]:
#Evaluate

# Displays the loss followed by the accuracy metric configured in compile().
lstm_model.evaluate(x=test_texts, y=test_labels, batch_size=128)



[0.1372213065624237, 0.9493324756622314]

In [None]:
# Model outputs for every padded test example (one row per example).
test_predictions = lstm_model.predict(x=test_texts)

In [None]:
# Keep only the first column so each example is represented by a single score.
# Done as a NumPy slice instead of a per-row Python loop, and guarded on ndim so
# re-running the cell on already-flattened predictions is a no-op rather than
# an indexing error.
test_predictions = np.asarray(test_predictions)
if test_predictions.ndim > 1:
    test_predictions = test_predictions[:, 0]

In [None]:
# Turn scores into hard labels: anything below 0.5 becomes 0, everything else 1.
# The condition is written as "< 0.5 -> 0" (not ">= 0.5 -> 1") so that values
# which fail the comparison are labelled exactly as the element-wise loop did.
# confusion_matrix comes from the notebook's top import cell.
test_predictions = np.where(np.asarray(test_predictions) < 0.5, 0, 1)
cm = confusion_matrix(test_labels, test_predictions)
print(cm)

[[189737  10263]
 [ 10004 189996]]


In [None]:
# Precision of the thresholded test predictions, binary averaging.
# precision_score is imported in the notebook's top import cell.
precision_score(test_labels, test_predictions, average='binary')

0.9487513669797613

In [None]:
# Recall of the thresholded test predictions, binary averaging.
# recall_score is imported in the notebook's top import cell.
recall_score(test_labels, test_predictions, average='binary')

0.94998

In [None]:
# F1 score of the thresholded test predictions.
# f1_score is already imported in the notebook's top import cell, so the
# redundant in-cell import was dropped.
f1_score(test_labels, test_predictions)

0.9493652859773296