In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import CountVectorizer

tf.random.set_seed(42)

## IMDB movie reviews

## Retrieving and preparing the Data

We will work with the IMDb movie reviews data.

In [2]:
# Read in the IMDB Dataset into "data". Do not set an index column

# YOUR CODE HERE
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Replace all "negative" and "positive" sentiment values with o and 1 respectively.
# You can use a simple logical operator instead of label encodeing. 

# YOUR CODE HERE
data["sentiment"] = np.where(data["sentiment"]== "negative", 0, 1)

In [5]:
# Get the dependent data and assign to y
# YOUR CODE HERE
y = data["sentiment"]
X = data["review"]
print(y[0:10])

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    1
Name: sentiment, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

# Split the X data (data['review']) and y data into X_train, X_test, y_train, and y_test
# With a test size of 0.2 and a random_state of 42

# YOUR CODE HERE
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [7]:
print(f"""
Train samples: {X_test.shape[0]}
Test samples: {y_test.shape[0]}
"""
)


Train samples: 10000
Test samples: 10000



In [8]:
y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

Inspect the frequence of each sentiment in the traning dataset (it is balanced!)

In [9]:
# Calculate the training data's frequency and assign the output to "frequency"

# YOUR CODE HERE
frequency = y_train.value_counts(normalize = True)

print(frequency)

0    0.500975
1    0.499025
Name: sentiment, dtype: float64


In [10]:
# Let's turn the target into a dummy vector

# YOUR CODE HERE
y_train = pd.get_dummies(y_train).to_numpy()
y_test = pd.get_dummies(y_test).to_numpy()


In [11]:
y_train.shape

(40000, 2)

## Unigram Multi-hot Encoding Baseline

Next, let us see the performance of a neural net that is trained from the scratch using multi-hot encoding. 

In [12]:
# Set the maximum number of tokens to 2412. 
# Also set up our Text Vectorization layer using multi-hot encoding

# YOUR CODE HERE
max_tokens = 2412 
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot")

In [13]:
# The vocabulary that will be indexed is given by the text corpus on our train dataset
# YOUR CODE HERE
text_vectorization.adapt(X_train)

In [14]:
# We vectorize our input
# YOUR CODE HERE
X_train = text_vectorization(X_train)
X_test = text_vectorization(X_test)

In [15]:
# Now create your model. start with 32 dense relu layers, a dropout layer of 0.5, and a final softmax layer

# YOUR CODE HERE
inputs = keras.Input(shape=(max_tokens, ))
x = keras.layers.Dense(32, activation="relu")(inputs)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs, outputs)

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2412)]            0         
_________________________________________________________________
dense (Dense)                (None, 32)                77216     
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 77,282
Trainable params: 77,282
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Compile your model

# YOUR CODE HERE
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [17]:
# YOUR CODE HERE
model.fit(x=X_train, y=y_train,
          epochs=10,
          batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f17204ba828>

In [18]:
# Evaluate your model. You should be able to get your model to 85% at this point
# YOUR CODE HERE
model.evaluate(X_test,y_test)




[0.34028443694114685, 0.8680999875068665]

## Extend Baseline Model

Let's create more complex models to increase the accuracy on our test sample. Try combining different models by changing:
- Number of hidden units
- Adding another hidden layer.
- Changing the number of epochs.
- Using bigrams instead of unigrams.

To guide your search for the best parameters, note how the accuracy changes on both train and test data.

In [19]:
# Begin your model here

# YOUR CODE HERE

hidden_units = [64,128]
no_epochs = [15,30]
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot",
    ngrams = 2)
y_train_new = pd.get_dummies(y_train).to_numpy()
y_test_new = pd.get_dummies(y_test).to_numpy()
text_vectorization.adapt(X_train)
X_train_vec = text_vectorization(X_train)
X_test_vec = text_vectorization(X_test)
best_params = {"Hidden Units":128 , "Epochs": 15}
for h_units in hidden_units:
    for epochs in no_epochs:
        inputs = keras.Input(shape=(max_tokens, ))
        x = keras.layers.Dense(h_units, activation="relu")(inputs)
        x = keras.layers.Dropout(0.5)(x)
        x = keras.layers.Dense(16)(x)
        x = keras.layers.Dropout(0.5)(x)
        outputs = keras.layers.Dense(2, activation="softmax")(x)
        model = keras.Model(inputs, outputs)
        model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
        h = model.fit(x=X_train_vec, y=y_train_new,
          epochs=epochs,
          batch_size=32)
        #print(f"Training accuracy for {h_units} hidden unit and {epochs} epochs is: {h.accuracy}") 
        test_accuracy = model.evaluate(X_test_vec, y_test_new)
        print(f"Testing accuracy for {h_units} hidden unit and {epochs} epochs is:{test_accuracy[1]}")


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Testing accuracy for 64 hidden unit and 15 epochs is:0.8665000200271606
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Testing accuracy for 64 hidden unit and 30 epochs is:0.8665000200271606
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Testing accuracy for 128 hidden unit and 15 epochs is:0.873199999332428
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epo