In [1]:
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from string import punctuation
import re
import tensorflow as tf
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Flatten, Bidirectional

In [2]:
df=pd.read_csv('./IMDB_dataset/IMDB dataset.csv')
stop = set(stopwords.words('english'))
punctuation = list(punctuation)
stop.update(punctuation)

def remove_stopwords(text, stopwords_set):
    output = []
    for i in text.split():
        word = i.strip().lower()
        if word not in stopwords_set and word.isalpha():
            output.append(word)
    return " ".join(output)
    
def process_data(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = remove_stopwords(text, stop)
    return text

df['review']=df['review'].apply(process_data)

df.sentiment.replace("positive" , 1 , inplace = True)
df.sentiment.replace("negative" , 0 , inplace = True)

train, test= train_test_split(df, test_size=0.2, random_state=42)

x_train, y_train = train['review'], train['sentiment']
x_test, y_test = test['review'], test['sentiment']
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,
                                             test_size=0.2,random_state=10)

  soup = BeautifulSoup(text, "html.parser")


In [3]:
tokenizer = Tokenizer(oov_token="OOV")

In [4]:
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

In [5]:
sequences = tokenizer.texts_to_sequences(x_train)

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
max_len = 225
trunc_type = 'post'
oov_tok = 'OOV'
padding_type = 'post'
vocab_size = len(word_index) + 1

In [7]:
train_padded = pad_sequences(sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)

test_sentences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sentences, maxlen=max_len, padding=padding_type,truncating=trunc_type)

val_sentences = tokenizer.texts_to_sequences(x_val)
val_padded = pad_sequences(val_sentences, maxlen=max_len, padding=padding_type,truncating=trunc_type)

In [8]:
embedding_dim = 200

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))

model.add(Dense(1, activation='sigmoid'))

In [9]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 225, 200)          13985200  
                                                                 
 dropout (Dropout)           (None, 225, 200)          0         
                                                                 
 lstm (LSTM)                 (None, 225, 64)           67840     
                                                                 
 dense (Dense)               (None, 225, 1)            65        
                                                                 
Total params: 14,053,105
Trainable params: 14,053,105
Non-trainable params: 0
_________________________________________________________________


In [11]:
num_epochs = 4
#batch_size = 8
history = model.fit(train_padded, y_train, epochs=num_epochs,validation_data=(val_padded, y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [12]:
model.evaluate(test_padded, y_test)



[0.5464154481887817, 0.7822799682617188]

In [9]:
embedding_dim = 600
batch_size = 16

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 4
history = model.fit(train_padded, y_train, epochs=num_epochs,batch_size=batch_size, validation_data=(val_padded, y_val))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 225, 600)          41955600  
                                                                 
 dropout_1 (Dropout)         (None, 225, 600)          0         
                                                                 
 lstm_1 (LSTM)               (None, 225, 64)           170240    
                                                                 
 flatten_1 (Flatten)         (None, 14400)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 14401     
                                                                 
Total params: 42,140,241
Trainable params: 42,140,241
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [10]:
model.evaluate(test_padded, y_test)



[0.7856318950653076, 0.853600025177002]

In [8]:
embedding_dim = 500
batch_size = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(128, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 4
history = model.fit(train_padded, y_train, epochs=num_epochs,batch_size=batch_size, validation_data=(val_padded, y_val))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 225, 500)          34963000  
                                                                 
 dropout (Dropout)           (None, 225, 500)          0         
                                                                 
 lstm (LSTM)                 (None, 225, 128)          322048    
                                                                 
 flatten (Flatten)           (None, 28800)             0         
                                                                 
 dense (Dense)               (None, 1)                 28801     
                                                                 
Total params: 35,313,849
Trainable params: 35,313,849
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [9]:
model.evaluate(test_padded, y_test)



[0.8662416934967041, 0.8531000018119812]

In [10]:
embedding_dim = 500
batch_size = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 4
history = model.fit(train_padded, y_train, epochs=num_epochs,batch_size=batch_size, validation_data=(val_padded, y_val))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 225, 500)          34963000  
                                                                 
 dropout_1 (Dropout)         (None, 225, 500)          0         
                                                                 
 lstm_1 (LSTM)               (None, 225, 256)          775168    
                                                                 
 flatten_1 (Flatten)         (None, 57600)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 57601     
                                                                 
Total params: 35,795,769
Trainable params: 35,795,769
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [11]:
model.evaluate(test_padded, y_test)



[0.8571547865867615, 0.8432999849319458]

In [8]:
embedding_dim = 800
batch_size = 8

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 4
history = model.fit(train_padded, y_train, epochs=num_epochs,batch_size=batch_size, validation_data=(val_padded, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 225, 800)          55940800  
                                                                 
 dropout (Dropout)           (None, 225, 800)          0         
                                                                 
 lstm (LSTM)                 (None, 225, 64)           221440    
                                                                 
 flatten (Flatten)           (None, 14400)             0         
                                                                 
 dense (Dense)               (None, 1)                 14401     
                                                                 
Total params: 56,176,641
Trainable params: 56,176,641
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [9]:
model.evaluate(test_padded, y_test)



[0.9377745389938354, 0.8429999947547913]

In [9]:
embedding_dim = 800
batch_size = 32
num_epochs = 1


model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
history = model.fit(train_padded, y_train, epochs=num_epochs,batch_size=batch_size, validation_data=(val_padded, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 225, 800)          55940800  
                                                                 
 dropout (Dropout)           (None, 225, 800)          0         
                                                                 
 lstm (LSTM)                 (None, 225, 64)           221440    
                                                                 
 flatten (Flatten)           (None, 14400)             0         
                                                                 
 dense (Dense)               (None, 1)                 14401     
                                                                 
Total params: 56,176,641
Trainable params: 56,176,641
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.evaluate(test_padded, y_test)



[0.30613943934440613, 0.8664000034332275]

In [12]:
model.save('LSTM_86.6.h5')

In [8]:
embedding_dim = 800
batch_size = 32
num_epochs = 1


model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 225, 800)          55940800  
                                                                 
 dropout (Dropout)           (None, 225, 800)          0         
                                                                 
 lstm (LSTM)                 (None, 225, 64)           221440    
                                                                 
 flatten (Flatten)           (None, 14400)             0         
                                                                 
 dense (Dense)               (None, 2)                 28802     
                                                                 
Total params: 56,191,042
Trainable params: 56,191,042
Non-trainable params: 0
_________________________________________________________________


In [9]:
from keras.utils import to_categorical

# Convert target values to one-hot encoding
y_train_encoded = to_categorical(y_train, num_classes=2)
y_val_encoded = to_categorical(y_val, num_classes=2)

# Update the model.fit() call with the new target values
history = model.fit(train_padded, y_train_encoded, epochs=num_epochs, batch_size=batch_size, validation_data=(val_padded, y_val_encoded))



In [11]:
y_test_encoded = to_categorical(y_test, num_classes=2)

model.evaluate(test_padded, y_test_encoded)



[0.3088577389717102, 0.8730000257492065]

In [31]:
print(test_padded.shape)
print(test_padded[0].shape)


(10000, 225)
(225,)


In [41]:
max_len = 225

test_input = np.reshape(train_padded[3], (1, max_len))

In [39]:
output = model(test_input)

In [40]:
print(output)

tf.Tensor([[0.02311346 0.9768866 ]], shape=(1, 2), dtype=float32)


In [42]:
predictions = np.argmax(output, axis=1)

In [44]:
print(predictions)

[1]


In [47]:
model.save('LSTM_87.3.h5')

In [67]:
review = "Interesting movie."
review = tokenizer.texts_to_sequences([review])
review = pad_sequences(review, maxlen=max_len, padding=padding_type, truncating=trunc_type)
review

array([[119,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [68]:
review = np.reshape(review, (1, max_len))
review

array([[119,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [69]:
output = model(review)
print(output)
predictions = np.argmax(output, axis=1)
print(predictions)

tf.Tensor([[0.4734102 0.5265899]], shape=(1, 2), dtype=float32)
[1]
