# IMDB Sentiment Analysis

### Load Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Embedding, Bidirectional

In [None]:
# Read the review CSV into a DataFrame, report its (rows, columns) shape and
# preview the first rows.
dataset_path = 'IMDB Dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)
df.head()

In [None]:
# Summary statistics for the columns of the loaded DataFrame.
df.describe()

In [None]:
# Number of rows per distinct value of the 'sentiment' column (class balance check).
df['sentiment'].value_counts()

### Preprocess

In [None]:
# Characters removed outright: punctuation, quotes, parentheses and brackets.
# Raw strings keep regex escapes such as \[ and \s from being parsed as
# (invalid) Python string escapes.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Spans replaced by one space: any run of HTML line breaks (<br>, <br/>,
# <br />), hyphens and slashes. Matching a run of one or more breaks also
# covers a lone <br />, which would otherwise survive as a stray "br" token.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/?>)+|(\-)|(\/)")


def clean_review(text):
    """Return *text* lower-cased, with punctuation stripped and breaks spaced out."""
    without_punctuation = REPLACE_NO_SPACE.sub("", text.lower())
    return REPLACE_WITH_SPACE.sub(" ", without_punctuation)


# Single pass over the column instead of two separate list comprehensions.
df['review'] = df['review'].map(clean_review)

In [None]:
# Replace the sentiment labels with the integer codes assigned by LabelEncoder.
# The fitted encoder is kept in `label` so codes can be mapped back later.
# NOTE(review): presumably two classes (e.g. negative/positive) -- confirm
# against the value_counts() output.
label = LabelEncoder()
encoded_sentiment = label.fit_transform(df['sentiment'])
df['sentiment'] = encoded_sentiment

In [None]:
# --- Hyperparameters for text vectorisation -------------------------------
vocab_size = 10000        # keep only the most frequent words in the tokenizer
embedding_dim = 16        # size of each learned word vector
max_length = 100          # every review is padded/truncated to this many tokens
trunc_type = 'post'       # drop tokens from the end of over-long reviews
padding_type = 'post'     # append zeros to the end of short reviews
oov_tok = "<OOV>"         # placeholder token for words outside the vocabulary
training_size = 40000     # first N rows train the model, the rest evaluate it

# Positional train/test split (no shuffling is performed here).
training_sentences = df['review'][:training_size]
training_labels = df['sentiment'][:training_size]
testing_sentences = df['review'][training_size:]
testing_labels = df['sentiment'][training_size:]

# Fit the vocabulary on the training reviews only so no test-set words leak in.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The test set must be padded/truncated exactly like the training set.
# pad_sequences defaults to 'pre' for both, which would feed the model
# differently laid-out sequences at evaluation time.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Plain NumPy arrays for model.fit.
X_train = np.array(training_padded)
y_train = np.array(training_labels)
X_test = np.array(testing_padded)
y_test = np.array(testing_labels)

### Model Training + Evaluation

In [None]:
# Binary sentiment classifier: token embedding -> two stacked bidirectional
# LSTMs (the first returns the full sequence so the second can consume it)
# -> dense ReLU layer -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Train for 15 epochs, scoring the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    validation_data=(X_test, y_test),
)

In [None]:
# Plot per-epoch training/validation accuracy and loss from the fit history.
metrics_by_epoch = history.history

# With metrics=['accuracy'] current Keras stores the metric under 'accuracy';
# older releases used 'acc'. Use whichever key is present to avoid a KeyError.
acc_key = 'accuracy' if 'accuracy' in metrics_by_epoch else 'acc'
acc = metrics_by_epoch[acc_key]
val_acc = metrics_by_epoch['val_' + acc_key]
loss = metrics_by_epoch['loss']
val_loss = metrics_by_epoch['val_loss']

epochs = range(len(acc))

# The legend text must be passed as label=...; as a fourth positional argument
# it is parsed as additional plot data rather than as a label.
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.legend()
plt.title('Training and validation accuracy')

# Start a second figure so the loss curves do not share axes with accuracy.
plt.figure()
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.legend()
plt.title('Training and validation loss')

In [None]:
# Score one ad-hoc review with the trained model.
text = 'XXX'

# Apply the same cleaning as the training reviews: lower-case, strip
# punctuation, then replace breaks/hyphens/slashes with spaces.
cleaned_text = REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", text.lower()))

# texts_to_sequences expects a list of documents. A bare string would be
# iterated character by character, yielding one "document" per character.
tokenized_text = tokenizer.texts_to_sequences([cleaned_text])
tokenized_text = pad_sequences(tokenized_text, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Output is the sigmoid activation of the final layer, one row per input text.
model.predict(tokenized_text)

In [None]:
# Reference links that inspired this notebook. The bare string below is an
# expression statement used as a block comment; it does not affect the data
# or the model.
'''
Inspiration
1. https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184
2. https://www.coursera.org/learn/natural-language-processing-tensorflow?specialization=tensorflow-in-practice
'''