## Import libraries

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf

import re
from tqdm import tqdm

## Load data

In [2]:
dataset = pd.read_csv('dataset/IMDB-dataset.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
(dataset['review'].str.split().str.len() < 400).sum()

43483

## Preprocess data

In [7]:
dataset['sentiment'] = dataset['sentiment'].map({'positive': 1, 'negative': 0})

# Ratio of positive and negative reviews
dataset['sentiment'].value_counts(normalize=True)

Series([], Name: sentiment, dtype: float64)

### Split data

In [8]:
X_train_texts = dataset['review'][:49000]
y_train = dataset['sentiment'][:49000]

X_test_texts = dataset['review'][49000:]
y_test = dataset['sentiment'][49000:]

### Load embeddings

In [9]:
vocab_size = 400001
embedding_dim = 100

word_to_index = {}
index_to_word = {}
word_to_vec_map = {}
with open('embeddings/glove.6B.100d.txt', 'r', encoding='utf-8') as f:    
    for i, line in enumerate(f):
        values = line.split()
        word = values[0]
        
        coefs = np.asarray(values[1:], dtype='float32')
        
        word_to_index[word] = i
        index_to_word[i] = word
        word_to_vec_map[word] = coefs

In [10]:
word_to_index['the'], index_to_word[0], word_to_vec_map['the'].shape

(0, 'the', (100,))

#### Embedding matrix

In [11]:
# Initialize the embedding matrix with zeros
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill in the embedding matrix
for word, index in word_to_index.items():
    if word in word_to_vec_map:
        embedding_matrix[index] = word_to_vec_map[word]

### Tokenize

In [12]:
MAX_SENTENCE_LENGTH = 2500

def tokenize(sentences, max_sentence_length=MAX_SENTENCE_LENGTH):
    tokenized_sentences = np.zeros((len(sentences), max_sentence_length))
    
    for i, sentence in enumerate(sentences):
        words = sentence.lower().split()
        
        for j, word in enumerate(words):    
            if j == max_sentence_length:
                break
            
            # Remove punctuation from the word
            word = re.sub(r'[^\w\s]', '', word)
                        
            if word in word_to_index:
                tokenized_sentences[i, j] = word_to_index[word]
            else:
                tokenized_sentences[i, j] = 400000 # Vector for unknown words
        
    return tokenized_sentences

In [13]:
X_train_sequences = tokenize(X_train_texts)
X_test_sequences = tokenize(X_test_texts)

# Model

## Baseline
This model is a baseline for the other models. It is a simple neural network that consists of an embedding layer, a global average pooling layer, and a dense layer with sigmoid activation. **The embedding layer is trained from scratch.** Where the other models will use pre-trained embeddings, because training time would be too long.

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=MAX_SENTENCE_LENGTH),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [11]:
model.fit(X_train_sequences, y_train, epochs=20, batch_size=16, validation_data=(X_test_sequences, y_test), verbose=1)

### Results
- Small training set (1000 examples)
- Training unitl reaches max test accuracy (20 to 60 epochs)
- Batch size of 16
- Adam optimizer

| Model | Accuracy | Validation Accuracy |
|-------|----------|---------------------|
| Baseline | 1.0000 | 0.8260 |
| LSTM 32 | 0.4970 | 0.5320 |
| Bi-LSTM 8 | 0.9300 | 0.7460 |
| 2 x Bi-LSTM 8 | 0.9350 | 0.7360 |
| Bi-LSTM 16 | 0.9840 | 0.7860 |
| Bi-LSTM 32 | 0.9990  | 0.7840 |
| Bi-LSTM 128 | 0.9930 | 0.7320 |
| 2 x Bi-LSTM 128 | 0.9990 | 0.7320 |

### Observations
- Simple baseline model performs surprisingly well although it's overfitting
- LSTM layers with > 32 units introduce overfitting **on a small dataset**. Training on a entire dataset should reduce overfitting. So we will try more units.
- LSTM layers with < 16 units introduce biast - can't learn function well enough
- Bidirectional LSTM layers give much better results
- Adding second layer doesn't improve performance, only increases training time
- Baseline model quickly reaches its best performance - it can't learn more complex function

### Conclusions
- Use single bidirectional LSTM layers with 16 > units
- Train on entire dataset to reduce overfitting

## Simple Bi-LSTM

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=MAX_SENTENCE_LENGTH, weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [28]:
model.fit(X_train_sequences, y_train, epochs=10, batch_size=32, validation_data=(X_test_sequences, y_test), verbose=1)

### Evaluatation

In [33]:
train_loss, train_acc = model.evaluate(X_train_sequences, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test_sequences, y_test, verbose=0)

print(f'Train accuracy: {train_acc*100:.2f}%')
print(f'Test accuracy: {test_acc*100:.2f}%')

Train accuracy: 95.00%
Train accuracy: 90.30%


### Summary
- Final model got  **95.0%** accuracy on training set and **90.3%** accuracy on a test set.
- Great! By training on entire dataset there is only little overfitting even with 64 units.
- It's very hard to get over 90% test accuracy using this simple architecture. Units above 64 introduce overfitting. Adding more layers and layer normalization doesn't improve performance.
- Models with 64 > units can get 99% training but only 90% test accuracy.
- After 10 epochs there is only training accuracy improvement, test accuracy stays the same or even decreases.

### Prediction

In [28]:
review = [
    "The movie was great. I enjoyed every second of it although actors play wasn't perfect but plot made up for it. I would recommend it to others.",
    "I loved this movie. It was so funny and entertaining.",
    "This movie was terrible. The plot was boring and the acting was awful.",
    "Funny movie but not an amazing one. Best to watch with your friends.",
    "Not a good movie. I wasn't really satisfied with the actors play. I wouldn't recommend it to others.",
    "A good movie. I was satisfied with the actors play. I would recommend it to others."
]

review_sequence = tokenize(review)

prediction = model.predict(review_sequence).flatten() > 0.5

# Map True to Positive and False to Negative
np.where(prediction, "Positive", "Negative").tolist()



['Positive', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive']