# Problem 1 - Sentiment Analysis using recurrent models

## 1.1

In [None]:
import pandas as pd
import numpy as np
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tensorflow.keras.utils import to_categorical

In [None]:
# Load only the review text and its sentiment label from the CSV file.
df = pd.read_csv(
    "IMDB Dataset.csv",
    usecols=["review", "sentiment"],
    encoding='latin-1',
)
## 1 - positive, 0 - negative (anything that is not "positive" becomes 0)
df["sentiment"] = df["sentiment"].eq("positive").astype("int")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Nominal 15% hold-out sizes, truncated to whole rows.
# NOTE(review): neither value is referenced again in the visible cells; the
# split further down works from percentages instead -- confirm before removing.
val_size = int(df.shape[0] * 0.15)
test_size = int(df.shape[0] * 0.15)


def train_val_test_split(df=None, train_percent=0.7, test_percent=0.15, val_percent=0.15):
  """
  Shuffle ``df`` and cut it into contiguous train, test and validation parts.

  The rows are shuffled with ``df.sample(frac=1)`` and then sliced in order:
  the first ``train_percent`` of the rows form the training set, the next
  ``test_percent`` the test set, and every remaining row the validation set.
  The three parts are disjoint and together contain every row of ``df``.

  :param df: DataFrame to split
  :param train_percent: fraction of rows for the training set
  :param test_percent: fraction of rows for the test set
  :param val_percent: kept for interface compatibility; the validation set
      always receives all rows left after the train and test slices
  :return: tuple ``(train_df, test_df, val_df)`` -- note the order
  """
  df = df.sample(frac=1)
  n_rows = len(df)
  train_end = int(n_rows * train_percent)
  test_end = int(n_rows * (train_percent + test_percent))
  # Slice ends are exclusive, so each part starts exactly where the previous
  # one stopped; starting one row later would silently discard that row.
  train_df = df[:train_end]
  test_df = df[train_end:test_end]
  val_df = df[test_end:]
  return train_df, test_df, val_df

# Shuffle and split 70/15/15; the function returns (train, test, val) in that order.
train_df, test_df, val_df = train_val_test_split(df, 0.7, 0.15, 0.15)
# Column 0 holds the review text and column 1 the 0/1 sentiment label
# (column order as shown by the head() output above).
train_labels, train_texts = train_df.values[:,1], train_df.values[:,0]
val_labels, val_texts = val_df.values[:,1], val_df.values[:,0]
test_labels, test_texts = test_df.values[:,1], test_df.values[:,0]
# Report the size of each split.
print(len(train_df), len(test_df), len(val_df))
# NOTE(review): the third value printed here is len(val_df) again, not a
# label/text length -- confirm that is what was meant.
print(len(train_texts), len(train_labels), len(val_df))

35000 7499 7499
35000 35000 7499


In [None]:
# Peek at the first five raw training reviews.
train_texts[:5]

array(["i'm not going to ramble on about it but i'm just going to make it brief. basically for those who don't know how prue actually died........... the first time round the demonic assassin comes hit piper and prue with an energy ball they fly through the wall blood everywhere. phoebe the third sister comes down the stairs, says the spell which send him away but not vanquished.(NEEDS THE POWER OF THREE)leo comes heals them both and so on. they get exposed along the line and the only way the can be saved is for a demon named tempus to turn back time. the only way he can do that is is phoebe stays in the underworld. she agrees, tempus turns back time. it now around 7:00 in the morning again. demon comes strucks piper and prue with energy ball. they fly through wall again. but this time phoebe isn't there to say the spell to fend demon off. demon kills doctor. doctor flies through window. he is dead. demon goes in a whirl wing type thing and glass on the doors shatter which is a great e

### Tokenization

In [None]:
def process_tokens(text):
    """
    function to clean one raw text before tokenization

    Lower-cases ``text`` and deletes the characters , . : ( ) - as well as
    every digit. Deleted characters are dropped without a replacement, so
    two words separated only by such a character are fused together.

    :param text: raw review string
    :return: the cleaned string
    """
    unwanted = ",.:)-("
    # The digit test must be applied per character; testing the whole string
    # only removed digits from texts consisting of nothing but digits.
    return ''.join(
        ch for ch in text.lower()
        if ch not in unwanted and not ch.isdigit()
    )

def preprocessing(data):
    """
    convert an iterable of raw texts into a list of token lists

    Every text is cleaned with process_tokens and then split by a spaCy
    Tokenizer built on the vocab of an English() pipeline; each token is
    stored as a plain string.
    """
    tokenize = Tokenizer(English().vocab)
    return [
        [str(tok) for tok in tokenize(process_tokens(raw_text))]
        for raw_text in data
    ]

# Tokenize the three splits with the same preprocessing routine, in the
# order train, validation, test.
train_data, val_data, test_data = (
    preprocessing(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
# Show the token list produced for the first training review.
print(train_data[0])

["i'm", 'not', 'going', 'to', 'ramble', 'on', 'about', 'it', 'but', "i'm", 'just', 'going', 'to', 'make', 'it', 'brief', 'basically', 'for', 'those', 'who', "don't", 'know', 'how', 'prue', 'actually', 'died', 'the', 'first', 'time', 'round', 'the', 'demonic', 'assassin', 'comes', 'hit', 'piper', 'and', 'prue', 'with', 'an', 'energy', 'ball', 'they', 'fly', 'through', 'the', 'wall', 'blood', 'everywhere', 'phoebe', 'the', 'third', 'sister', 'comes', 'down', 'the', 'stairs', 'says', 'the', 'spell', 'which', 'send', 'him', 'away', 'but', 'not', 'vanquishedneeds', 'the', 'power', 'of', 'threeleo', 'comes', 'heals', 'them', 'both', 'and', 'so', 'on', 'they', 'get', 'exposed', 'along', 'the', 'line', 'and', 'the', 'only', 'way', 'the', 'can', 'be', 'saved', 'is', 'for', 'a', 'demon', 'named', 'tempus', 'to', 'turn', 'back', 'time', 'the', 'only', 'way', 'he', 'can', 'do', 'that', 'is', 'is', 'phoebe', 'stays', 'in', 'the', 'underworld', 'she', 'agrees', 'tempus', 'turns', 'back', 'time', 'it

### Bag of Words

In [None]:
import numpy as np
import itertools

## Creating a vectorizer to vectorize text and create matrix of features
## Bag of words technique
class Vectorizer():
    """
    Bag-of-words count vectorizer restricted to the most frequent tokens.

    fit() learns the vocabulary from tokenized sentences; transform() turns
    tokenized sentences into a dense matrix of per-token counts.
    """

    def __init__(self, max_features):
        # Upper bound on the number of vocabulary entries kept by fit().
        self.max_features = max_features
        # Both attributes stay None until fit() has been called.
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        """
        Count every token in ``dataset`` (an iterable of token lists) and keep
        the ``max_features`` most frequent ones as the vocabulary.

        Sets ``vocab_list`` (tokens by descending frequency) and
        ``token_to_index`` (token -> column index).
        """
        frequencies = {}
        for sentence in dataset:
            for token in sentence:
                frequencies[token] = frequencies.get(token, 0) + 1

        # sorted() is stable, so equally frequent tokens keep first-seen order.
        ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
        keep = min(len(ranked), self.max_features)
        self.vocab_list = [token for token, _ in itertools.islice(ranked, keep)]
        self.token_to_index = {
            token: column for column, token in enumerate(self.vocab_list)
        }

    def transform(self, dataset):
        """
        Return a (len(dataset), len(vocab_list)) float matrix whose entry
        [i, j] is how often vocabulary token j occurs in sentence i.
        Tokens outside the vocabulary are ignored.
        """
        n_rows, n_cols = len(dataset), len(self.vocab_list)
        data_matrix = np.zeros((n_rows, n_cols))
        lookup = self.token_to_index
        for row, sentence in enumerate(dataset):
            for token in sentence:
                column = lookup.get(token)
                if column is not None:
                    data_matrix[row, column] += 1
        return data_matrix

## max features - top k words to consider only
max_features = 2000

vectorizer = Vectorizer(max_features=max_features)
## Learn the vocabulary from the training split only
vectorizer.fit(train_data)

## Turn every split into a count matrix with one column per vocabulary token
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

## Labels as numpy arrays, aligned row-for-row with the matrices above
y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

## Vocabulary tokens, ordered by descending training frequency
vocab = vectorizer.vocab_list

In [None]:
## each review is one row of token counts: column j holds how many times
## the j-th vocabulary token occurs in that review
X_train[:5]

array([[19.,  9.,  5., ...,  0.,  0.,  0.],
       [38., 15., 15., ...,  0.,  0.,  0.],
       [ 5.,  3.,  3., ...,  0.,  0.,  0.],
       [31.,  8.,  5., ...,  0.,  0.,  0.],
       [ 1.,  3.,  4., ...,  0.,  0.,  0.]])

In [None]:
# Cast the label arrays to integers (they were sliced out of
# DataFrame.values, which presumably yields object dtype -- TODO confirm).
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

In [None]:
# One-hot encode the 0/1 labels into two columns, matching the two-unit
# softmax output used by the models below.
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

In [None]:
# Insert a time axis of length 1: (n_reviews, vocab) -> (n_reviews, 1, vocab),
# the 3-D layout the recurrent layers below are declared with.
X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

# Labels are forced to two columns (one per class).
y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


## 1.2

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# Drop any previous model reference before building a new one.
model = None
model = Sequential()
# Each review is fed as a sequence of length 1 whose single step is the
# max_features-wide count vector, so the recurrent layer runs for one step.
model.add(SimpleRNN(256, input_shape=(1, max_features)))
# Two-way softmax over the one-hot sentiment classes.
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
# The summary table is followed by a literal "None" in the recorded output,
# i.e. the value print() received from summary().
print(model.summary())
# Train for 10 epochs, tracking loss/accuracy on the validation split as well.
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 256)               577792    
                                                                 
 dense_4 (Dense)             (None, 2)                 514       
                                                                 
Total params: 578306 (2.21 MB)
Trainable params: 578306 (2.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [None]:
# Evaluate on the held-out test split; the two values are the loss and the
# accuracy metric the model was compiled with.
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.556309163570404
Test accuracy: 0.868382453918457


We can see that over the training epochs of the RNN model the training loss decreases significantly, almost reaching 0, while the validation loss increases, which most likely means that the model is overfitting. Further, the training accuracy rises to almost 100% while the validation accuracy does not improve during training, which again points to overfitting. Finally, the test loss is about 0.56 and the test accuracy is about 0.87, which is relatively high.

## 1.3

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
# Drop any previous model reference before building a new one.
model = None
model = Sequential()
# Same setup as the SimpleRNN cell, with an LSTM layer instead: the input is
# a length-1 sequence holding the max_features-wide count vector.
model.add(LSTM(256, input_shape=(1, max_features)))
# Two-way softmax over the one-hot sentiment classes.
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
# Train for 10 epochs, tracking loss/accuracy on the validation split as well.
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 256)               2311168   
                                                                 
 dense_5 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2311682 (8.82 MB)
Trainable params: 2311682 (8.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [None]:
# Evaluate the LSTM on the held-out test split (loss, then accuracy).
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.5641476511955261
Test accuracy: 0.8642485737800598


We can see that over the training epochs of the LSTM model the training loss decreases significantly, almost reaching 0, while the validation loss increases, which most likely means that the model is overfitting. Further, the training accuracy rises to almost 100% while the validation accuracy actually decreases by 1 percentage point, which again points to overfitting. Finally, the test loss is about 0.56 and the test accuracy is about 0.86, which is relatively high and very similar to that of the RNN model.

## 1.4

In [None]:
# getting data ready
# Rebuilds the count matrices and label arrays from scratch with the already
# fitted vectorizer -- the same steps as in section 1.1.
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

In [None]:
# Cast the freshly rebuilt label arrays to integers.
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

In [None]:
# One-hot encode the 0/1 labels into two columns.
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

In [None]:
# Insert a time axis of length 1: (n_reviews, vocab) -> (n_reviews, 1, vocab).
X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

# Labels are forced to two columns (one per class).
y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


In [None]:
from tensorflow.keras.layers import GRU

# Drop any previous model reference before building a new one.
model = None
model = Sequential()
# GRU variant of the recurrent classifier; the input is again a length-1
# sequence holding the max_features-wide count vector.
model.add(GRU(256, input_shape=(1, max_features)))
# Two-way softmax over the one-hot sentiment classes.
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
# Train for 10 epochs, tracking loss/accuracy on the validation split as well.
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

# Evaluate on the held-out test split (loss, then accuracy).
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 (None, 256)               1734144   
                                                                 
 dense_6 (Dense)             (None, 2)                 514       
                                                                 
Total params: 1734658 (6.62 MB)
Trainable params: 1734658 (6.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.6334048509597778
Test accuracy: 0.8703827261924744


We can see that over the training epochs of the GRU model the training loss decreases significantly, almost reaching 0, while the validation loss increases, which most likely means that the model is overfitting. Further, the training accuracy rises to almost 100% while the validation accuracy actually decreases slightly (only around 0.04), which again points to overfitting. Finally, the test loss is about 0.63 and the test accuracy is about 0.87, which is relatively high and slightly higher (both loss and accuracy are slightly higher) than in our previous 2 models.

In [None]:
# check predictions
# (backend import kept: cells outside this view may still rely on the name)
from tensorflow.keras.backend import argmax

# Class probabilities for every test review, one row per review.
y_pred = model.predict(X_test)
# Both y_pred and the one-hot y_test are NumPy arrays, so the class index is
# taken for all rows at once with np.argmax instead of building a backend
# tensor per row and converting it back with .numpy().
predicted_classes = np.argmax(y_pred, axis=1)
actual_classes = np.argmax(y_test, axis=1)
for i in range(5):
  print(f'Label predicted: {predicted_classes[i]}, Actual label: {actual_classes[i]}')
  print(f'text: {test_texts[i]}')

Label predicted: 0, Actual label: 0
text: Its a truly awful movie with a laughable storyline.some awful acting.and a script that Ed Wood might be ashamed of.Wagner is laughable in this. He plays his role like number two in Austin Powers.Easily the worst of the Airport movies.1 out of 10
Label predicted: 1, Actual label: 0
text: The fight scenes were great. Loved the old and newer cylons and how they painted the ones on their side. It was the ending that I hated. I was disappointed that it was earth but 150k years back. But to travel all that way just to start over? Are you kidding me? 38k people that fought for their very existence and once they get to paradise, they abandon technology? No way. Sure they were eating paper and rationing food, but that is over. They can live like humans again. They only have one good doctor. What are they going to do when someone has a tooth ache never mind giving birth... yea right. No one would have made that choice.
Label predicted: 0, Actual label: 0

## 1.5

In [None]:
from tensorflow.keras.layers import Bidirectional

# Part 5: Define a BiLSTM model and train it on the dataset

# Drop any previous model reference before building a new one.
model = None
model = Sequential()

# Add a Bidirectional LSTM layer
# The input is a length-1 sequence holding the max_features-wide count vector.
# NOTE(review): the recorded summary below lists this layer with output
# (None, 256) and 2,180,096 parameters, which matches 128 units per
# direction; with LSTM(256) as written one would expect 512 outputs under the
# default concatenating merge -- confirm the output is not from an earlier
# version of this cell.
model.add(Bidirectional(LSTM(256), input_shape=(1, max_features)))

# Add the output layer
model.add(Dense(2, activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(X_train, y_train, batch_size=256,
                    validation_data=(X_val, y_val), epochs=10)

# Print history keys to verify training
print(history.history.keys())

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_3 (Bidirecti  (None, 256)               2180096   
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2180610 (8.32 MB)
Trainable params: 2180610 (8.32 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [None]:
# Evaluate the model on the held-out test split; the two values are the
# loss and the accuracy metric the model was compiled with.
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.5914520621299744
Test accuracy: 0.8702493906021118


We can see that over the training epochs of the BiLSTM model the training loss decreases significantly, almost reaching 0, while the validation loss increases, which most likely means that the model is overfitting. Further, the training accuracy rises to almost 100% while the validation accuracy actually decreases slightly (only around 0.05), which again points to overfitting. Finally, the test loss is about 0.59 and the test accuracy is about 0.87, which is relatively high and slightly higher (both loss and accuracy are slightly higher) than in our first 2 models, RNN and LSTM, and very similar to our previous model (GRU).

## 1.6

**Model 1, RNN model:**  <br>
Test loss: 0.556309163570404 <br>
Test accuracy: 0.868382453918457
<br><br>
**Model 2, LSTM model:**<br>
Test loss: 0.5641476511955261 <br>
Test accuracy: 0.8642485737800598
<br><br>
**Model 3, GRU model:**<br>
Test loss: 0.6334048509597778<br>
Test accuracy: 0.8703827261924744
<br><br>
**Model 4, BiLSTM model:** <br>
Test loss: 0.5914520621299744 <br>
Test accuracy: 0.8702493906021118

Looking solely at accuracy, we can see that the GRU model has the highest accuracy with a value of 0.8704. The BiLSTM model is very close, with an accuracy of 0.8702, followed by the RNN model (0.8684), and finally the LSTM model (0.8642). It is important to note that while GRU might be the best model looking solely at accuracy, BiLSTM is extremely close (about 0.0001 less) and has a slightly lower loss (by about 0.04), so we might want to consider it rather than GRU. Further, we can see that while RNN does not have the highest accuracy (0.002 less than the highest), it does have the lowest loss. Finally, all the models are very close in performance, looking at both loss and accuracy, so we might also want to consider which model is the simplest / cheapest (least computationally expensive). We might want to consider whether RNN is the best model for us to use, as it is the simplest, has the lowest loss (which might mean it generalizes best compared to the others) and does not have significantly lower accuracy than BiLSTM and GRU.


