<a href="https://colab.research.google.com/github/Vikas-KM/tensorflow-learning/blob/master/sentiment_analysis_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Dense, LSTM, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import glorot_uniform

import re
from sklearn import model_selection

from tqdm import tqdm

In [2]:
! tar -xvf /content/drive/MyDrive/amazon_review_polarity_csv.tar.gz

amazon_review_polarity_csv/
amazon_review_polarity_csv/test.csv
amazon_review_polarity_csv/train.csv
amazon_review_polarity_csv/readme.txt


In [3]:
with open('/content/amazon_review_polarity_csv/train.csv') as f:
  text = f.readlines()

In [4]:
df = pd.read_csv('/content/amazon_review_polarity_csv/train.csv')
df.head(10)

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...
5,1,Buyer beware,"This is a self-published book, and if you want..."
6,2,Glorious story,I loved Whisper of the wicked saints. The stor...
7,2,A FIVE STAR BOOK,I just finished reading Whisper of the Wicked ...
8,2,Whispers of the Wicked Saints,This was a easy to read book that made me want...
9,1,The Worst!,A complete waste of time. Typographical errors...


In [5]:
df.shape

(3599999, 3)

In [6]:
words = []
labels = []

for i in tqdm(text):
  i = i.split()
  labels.append(1) if i[0]=='__label__2' else labels.append(0)
  words.append(' '.join(i[1:]))

100%|██████████| 3600000/3600000 [00:22<00:00, 161182.55it/s]


In [26]:
X_train = pd.DataFrame()

X_train['consumer_review'] = words
X_train['polarity_review'] = labels

X_train.head(10)

Unnamed: 0,consumer_review,polarity_review
0,"even for the non-gamer"",""This sound track was ...",0
1,"best soundtrack ever to anything."",""I'm readin...",0
2,"soundtrack is my favorite music of all time, h...",0
3,"Soundtrack"",""I truly like this soundtrack and ...",0
4,"Pull Your Jaw Off The Floor After Hearing it"",...",0
5,"absolute masterpiece"",""I am quite sure any of ...",0
6,"beware"",""This is a self-published book, and if...",0
7,"story"",""I loved Whisper of the wicked saints. ...",0
8,"FIVE STAR BOOK"",""I just finished reading Whisp...",0
9,"of the Wicked Saints"",""This was a easy to read...",0


#### Here we are taking only 30% of the data, System overload and for faster execution

In [27]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train['consumer_review'], X_train['polarity_review'], test_size=0.85, random_state=42)

In [28]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
               "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
               'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
               'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
               'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
               'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
               'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
               'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
               'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
               'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
               'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
               'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
               't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
               'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn',
               "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
               'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
               'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
               "won't", 'wouldn', "wouldn't"]

In [29]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can't", "can not", phrase)

    # in the above line both will work
    # phrase = re.sub(r"won't", "will not", phrase) and phrase = re.sub(r"won\'t", "will not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [32]:
def data_clean(in_text):
  in_text = in_text.lower()
  in_text = decontracted(in_text)
  out_text = re.sub('[^a-zA-Z]+',' ', in_text) # remove punctuations
  out_text = re.sub(r'\s+[a-zA-Z]\s+',' ', out_text) # remove single characters
  out_text = ' '.join(e.lower() for e in out_text.split() if e not in stop_words)  
  return out_text.strip()

In [33]:
sentences = []
for reviews in tqdm(list(X_train)):
  sentences.append(data_clean(reviews))

100%|██████████| 540000/540000 [01:23<00:00, 6474.05it/s]


In [34]:
X = pd.DataFrame()
X['consumer_reviews'] = sentences
X['polarity_label'] = list(y_train)
X.head()

Unnamed: 0,consumer_reviews,polarity_label
0,done homework quest chess mastery purchased bo...,0
1,found book fairly ordinary liked stories seeme...,0
2,inferior product used chaise occasions already...,0
3,not support purchased ce compass current top o...,0
4,compared legos lego favorite toymaker really c...,0


In [35]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X['consumer_reviews'], X['polarity_label'], test_size=0.3, random_state=42)

In [36]:
type(X_train)

pandas.core.series.Series

In [37]:
# convert to array
X_train = np.array(X_train.values.tolist())
X_test = np.array(X_test.values.tolist())
y_train = np.array(y_train.values.tolist())
y_test = np.array(y_test.values.tolist())

In [38]:
# Apply Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
total_size = len(word_index)+1
print(total_size)

229661


In [39]:
print(word_index)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [40]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [41]:
X_train = pad_sequences(X_train, maxlen=100, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=100, padding='post', truncating='post')

In [42]:
# create Model
model = tf.keras.models.Sequential(
    [
     tf.keras.layers.Embedding(total_size, 20, input_length=100),
     tf.keras.layers.LSTM(32, dropout=0.2, recurrent_dropout=0.2),
     tf.keras.layers.Dense(1, activation='sigmoid')
    ]
)



In [43]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01),
    loss = tf.keras.losses.binary_crossentropy,
    metrics = ['accuracy']
)

In [44]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           4593220   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6784      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 4,600,037
Trainable params: 4,600,037
Non-trainable params: 0
_________________________________________________________________
None


In [45]:
class MyCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epochs, logs={}):
    if(logs.get('accuracy')> 0.9):
      print('Achieved  DESIRED ACCURACY')
      self.model.stop_training = True

callbacks = MyCallback() 

In [48]:
model.fit(
    X_train,
    y_train,
    batch_size = 128,
    epochs = 5,
    verbose = 1,
    validation_data = (X_test, y_test),
    callbacks = [callbacks]
)

Epoch 1/5
Achieved  DESIRED ACCURACY


<tensorflow.python.keras.callbacks.History at 0x7f9fcd46d4a8>

In [49]:
# save the model
model.save('model.h5')

In [50]:
model = tf.keras.models.load_model('model.h5')



In [51]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           4593220   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6784      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 4,600,037
Trainable params: 4,600,037
Non-trainable params: 0
_________________________________________________________________
None
