## Import the necessary Libraries

In [1]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l- \ done
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=e515ca795e00d0acf127e12eaa1985e13a85e16493dcdcd055d3508709c6aabb
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
[0m

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import numpy as np
import wget
import tarfile
from zipfile import ZipFile
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

from sklearn.metrics import classification_report

## Read the data

In [3]:
%%time
for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
    #print(os.path.join(dirname, filenames))
    print(dirname)

/kaggle/input
/kaggle/input/imdb-review
/kaggle/input/imdb-review/aclImdb
/kaggle/input/imdb-review/aclImdb/test
/kaggle/input/imdb-review/aclImdb/test/pos
/kaggle/input/imdb-review/aclImdb/test/neg
/kaggle/input/imdb-review/aclImdb/train
/kaggle/input/imdb-review/aclImdb/train/pos
/kaggle/input/imdb-review/aclImdb/train/neg
/kaggle/input/imdb-review/aclImdb/train/unsup
CPU times: user 1.75 s, sys: 3.46 s, total: 5.21 s
Wall time: 1min 40s


## GloVe
[Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)

In [4]:
%%time
if not os.path.exists('Data/glove.6B'):
    os.mkdir(os.getcwd()+'/Data/')
        
    url='http://nlp.stanford.edu/data/glove.6B.zip'  
    wget.download(url,'Data')  
  
    temp='Data/glove.6B.zip' 
    file = ZipFile(temp)  
    file.extractall('Data/glove.6B') 
    file.close()

CPU times: user 24.6 s, sys: 7.54 s, total: 32.1 s
Wall time: 3min 2s


In [5]:
GLOVE_DIR = 'Data/glove.6B'
TRAIN_DATA_DIR = '/kaggle/input/imdb-review/aclImdb/train'
TEST_DATA_DIR = '/kaggle/input/imdb-review/aclImdb/test'

In [6]:
#Within these, I only have a pos/ and a neg/ folder containing text files 
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.2

In [7]:
#Function to load the data from the dataset into the notebook. Will be called twice - for train and test.
def get_data(data_dir):
    texts = []  # list of text samples
    labels_index = {'pos':1, 'neg':0}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    for name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, name)
        if os.path.isdir(path):
            if name=='pos' or name=='neg':
                label_id = labels_index[name]
                for fname in sorted(os.listdir(path)):
                        fpath = os.path.join(path, fname)
                        text = open(fpath,encoding='utf8').read()
                        texts.append(text)
                        labels.append(label_id)
    return texts, labels

In [8]:
%%time
train_texts, train_labels = get_data(TRAIN_DATA_DIR)
test_texts, test_labels = get_data(TEST_DATA_DIR)
labels_index = {'pos':1, 'neg':0} 

#Just to see how the data looks like. 
print('*'*30,'Training Data','*'*30)
print('Size >>>',len(train_texts),len(train_labels))
print('Review >>>',train_texts[0])
print('Label >>>',train_labels[0])

print('*'*30,'Testing Data','*'*30)
print('Size >>>',len(test_texts),len(test_labels))
print('Review >>>',test_texts[24999])
print('Label >>>',test_labels[24999])

****************************** Training Data ******************************
Size >>> 25000 25000
Review >>> Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.
Label >>> 0
****************************** Testing Data ******************************
Size >>> 25000 25000
Review >>> I've seen this story before but my kids haven't. Boy with troubled past joins military, faces his past, falls in love

## Preprocessing

In [9]:
%%time
#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer 
#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. 
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) 
tokenizer.fit_on_texts(train_texts) 
train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes 
test_sequences = tokenizer.texts_to_sequences(test_texts) 
word_index = tokenizer.word_index 
print('Found %s unique tokens.' % len(word_index))

Found 88582 unique tokens.
CPU times: user 14.8 s, sys: 0 ns, total: 14.8 s
Wall time: 14.8 s


In [10]:
%%time
#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier
#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
training_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
testing_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
training_labels = to_categorical(np.asarray(train_labels))
testing_labels = to_categorical(np.asarray(test_labels))

print('Training Text :',training_data.shape,'Label :', training_labels.shape)
print('Testing Text :',testing_data.shape,'Label :', testing_labels.shape)

Training Text : (25000, 1000) Label : (25000, 2)
Testing Text : (25000, 1000) Label : (25000, 2)
CPU times: user 847 ms, sys: 27.2 ms, total: 874 ms
Wall time: 879 ms


### Create a validation set 
Create a validation set from testing data because I don't want to decrease the data for training

In [11]:
# Shuffle the Training Data
indices = np.arange(training_data.shape[0])        # Indices from 0 to 25000
np.random.shuffle(indices)                    # Shuffle the indices  

# Arrange the Training data and Label according to shuffled indices
X = training_data[indices]       
y = training_labels[indices]

# Shuffle the Testing Data
indices = np.arange(testing_data.shape[0])        # Indices from 0 to 25000
np.random.shuffle(indices)                    # Shuffle the indices  

# Arrange the Training data and Label according to shuffled indices
Test = testing_data[indices]       
Test_labels = testing_labels[indices]

In [12]:
# split the training data into a training set and a validation set
VALIDATION_SPLIT = 0.4

num_validation_samples = int(VALIDATION_SPLIT * Test.shape[0])
x_test = Test[:-num_validation_samples]
y_test = Test_labels[:-num_validation_samples]
x_val = Test[-num_validation_samples:]
y_val = Test_labels[-num_validation_samples:]

print('Training :',X.shape, y.shape)
print('Validation :',x_val.shape, y_val.shape)
print('Testing :',x_test.shape, y_test.shape)

Training : (25000, 1000) (25000, 2)
Validation : (10000, 1000) (10000, 2)
Testing : (15000, 1000) (15000, 2)


## Embedding

In [13]:
%%time
print('GloVe embedding index.')

# first, build index mapping words in the embeddings set
# to their embedding vector
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))
print('\n GloVe Embedding Index for word good\n',embeddings_index["good"])

GloVe embedding index.
Found 400000 word vectors in Glove embeddings.

 GloVe Embedding Index for word good
 [-0.030769   0.11993    0.53909   -0.43696   -0.73937   -0.15345
  0.081126  -0.38559   -0.68797   -0.41632   -0.13183   -0.24922
  0.441      0.085919   0.20871   -0.063582   0.062228  -0.051234
 -0.13398    1.1418     0.036526   0.49029   -0.24567   -0.412
  0.12349    0.41336   -0.48397   -0.54243   -0.27787   -0.26015
 -0.38485    0.78656    0.1023    -0.20712    0.40751    0.32026
 -0.51052    0.48362   -0.0099498 -0.38685    0.034975  -0.167
  0.4237    -0.54164   -0.30323   -0.36983    0.082836  -0.52538
 -0.064531  -1.398     -0.14873   -0.35327   -0.1118     1.0912
  0.095864  -2.8129     0.45238    0.46213    1.6012    -0.20837
 -0.27377    0.71197   -1.0754    -0.046974   0.67479   -0.065839
  0.75824    0.39405    0.15507   -0.64719    0.32796   -0.031748
  0.52899   -0.43886    0.67405    0.42136   -0.11981   -0.21777
 -0.29756   -0.1351     0.59898    0.46529   -0.

### Prepare Embedding Matrix - rows are the words from `word_index`, columns are the `embeddings` of that word from `glove`.

In [14]:
%%time
# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
print('Total Unique words :', len(word_index))
print('MAX_NUM_WORDS :',MAX_NUM_WORDS)

num_words = min(MAX_NUM_WORDS, len(word_index)) + 1

print('num_words :',num_words)
print('EMBEDDING DIMENSION :',EMBEDDING_DIM)

# Embedding Matrix
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        

print('Shape :',embedding_matrix.shape)
embedding_matrix[-1]

Total Unique words : 88582
MAX_NUM_WORDS : 20000
num_words : 20001
EMBEDDING DIMENSION : 100
Shape : (20001, 100)
CPU times: user 57.3 ms, sys: 6.35 ms, total: 63.6 ms
Wall time: 63.5 ms


array([ 6.24240004e-02,  4.34909999e-01,  3.00370008e-01,  4.44570005e-01,
       -9.42979977e-02, -3.17759991e-01,  2.72410005e-01, -3.62569988e-01,
        2.22739995e-01, -4.52920012e-02, -3.80400002e-01, -3.78809988e-01,
        1.15240000e-01, -3.04749995e-01,  2.74960011e-01, -1.02519996e-01,
        2.68949997e-02,  3.90190005e-01,  1.63230002e-01, -3.04529995e-01,
       -2.49699995e-01, -1.89339995e-01, -3.94400001e-01,  1.98559999e-01,
        2.46959999e-01, -1.63849995e-01,  6.17460012e-01,  8.77669975e-02,
       -5.97299993e-01, -6.85209990e-04, -2.46030003e-01,  6.88920021e-01,
       -5.49839973e-01, -4.70319986e-01, -2.33870000e-01, -2.55470008e-01,
       -4.17580009e-01, -6.52879998e-02,  4.72970009e-01, -7.10850000e-01,
       -6.06859982e-01,  3.24770004e-01, -5.09440005e-01,  4.55060005e-01,
       -9.41169977e-01,  3.30590010e-02,  3.65139991e-01,  6.98379993e-01,
       -4.91439998e-01,  1.73319995e-01, -2.79179990e-01,  1.15189999e-02,
       -2.01749995e-01,  

In [15]:
# load these pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print("Preparing of embedding matrix is done")

Preparing of embedding matrix is done


## 1D CNN Model with pre-trained embedding

In [16]:
print('Define a 1D CNN model.')

cnnmodel = Sequential()
cnnmodel.add(embedding_layer)
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dense(len(labels_index), activation='softmax'))

cnnmodel.compile(loss='categorical_crossentropy',
                 optimizer='rmsprop',
                 metrics=['acc'])

cnnmodel.summary()

Define a 1D CNN model.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 100)         2000100   
_________________________________________________________________
conv1d (Conv1D)              (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 199, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
global_max_pooling1d (Global (Non

2022-11-18 11:26:14.986346: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [17]:
%%time
#Train the model. Tune to validation set. 

cnnmodel.fit(X, y,
             batch_size=128,
             epochs=1, 
             validation_data=(x_val, y_val))
#Evaluate on test set:
score, acc = cnnmodel.evaluate(x_test, y_test)
print('Test accuracy with CNN:', acc)

2022-11-18 11:26:15.432426: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Test accuracy with CNN: 0.7801333069801331
CPU times: user 9min 39s, sys: 2.27 s, total: 9min 41s
Wall time: 2min 39s


## 1D CNN model with training our own embedding

In [18]:
print("Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings")
cnnmodel1 = Sequential()
cnnmodel1.add(Embedding(MAX_NUM_WORDS, 128))
cnnmodel1.add(Conv1D(128, 5, activation='relu'))
cnnmodel1.add(MaxPooling1D(5))
cnnmodel1.add(Conv1D(128, 5, activation='relu'))
cnnmodel1.add(MaxPooling1D(5))
cnnmodel1.add(Conv1D(128, 5, activation='relu'))
cnnmodel1.add(GlobalMaxPooling1D())
cnnmodel1.add(Dense(128, activation='relu'))
cnnmodel1.add(Dense(len(labels_index), activation='softmax'))

cnnmodel1.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

cnnmodel1.summary()

Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 128)         82048     
_________

In [19]:
%%time
#Train the model. Tune to validation set. 
cnnmodel1.fit(X, y,
             batch_size=128,
             epochs=1, 
             validation_data=(x_val, y_val))
#Evaluate on test set:
score, acc = cnnmodel1.evaluate(x_test, y_test)
print('Test accuracy with CNN:', acc)

Test accuracy with CNN: 0.8708666563034058
CPU times: user 14min 28s, sys: 2.82 s, total: 14min 31s
Wall time: 3min 55s


## LSTM Model using pre-trained Embedding Layer

In [20]:
print("Defining and training an LSTM model, using pre-trained embedding layer")

rnnmodel = Sequential()
rnnmodel.add(embedding_layer)
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(2, activation='sigmoid'))

rnnmodel.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])

rnnmodel.summary()

Defining and training an LSTM model, using pre-trained embedding layer
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 100)         2000100   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 2,117,606
Trainable params: 117,506
Non-trainable params: 2,000,100
_________________________________________________________________


In [21]:
%%time
print('Training the RNN')

rnnmodel.fit(X, y,
             batch_size=32,
             epochs=1,
             validation_data=(x_val, y_val))

score, acc = rnnmodel.evaluate(x_test, y_test,
                               batch_size=32)
print('Test accuracy with RNN:', acc)

Training the RNN
Test accuracy with RNN: 0.8025333285331726
CPU times: user 1h 25min 36s, sys: 21min 43s, total: 1h 47min 19s
Wall time: 1h 12min 29s


## LSTM Model with training own embedding

In [22]:
print("Defining and training an LSTM model, training embedding layer on the fly")

#model
rnnmodel2 = Sequential()
rnnmodel2.add(Embedding(MAX_NUM_WORDS, 128))
rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel2.add(Dense(2, activation='sigmoid'))

rnnmodel2.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

rnnmodel2.summary()

Defining and training an LSTM model, training embedding layer on the fly
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
Total params: 2,691,842
Trainable params: 2,691,842
Non-trainable params: 0
_________________________________________________________________


In [23]:
%%time
print('Training the RNN')

rnnmodel2.fit(X, y,
             batch_size=32,
             epochs=2,
             validation_data=(x_val, y_val))

score, acc = rnnmodel2.evaluate(x_test, y_test,
                                batch_size=32)
print('Test accuracy with RNN:', acc)

Training the RNN
Epoch 1/2
Epoch 2/2
Test accuracy with RNN: 0.819599986076355
CPU times: user 3h 5min 20s, sys: 44min 23s, total: 3h 49min 43s
Wall time: 2h 32min 45s


## Classification Report

In [24]:
y_pred = rnnmodel2.predict(x_test)

pred = [np.argmax(element) for element in y_pred]
print(classification_report(pred, y_test[:,1], target_names = ['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.75      0.87      0.81      6428
    Positive       0.89      0.78      0.83      8572

    accuracy                           0.82     15000
   macro avg       0.82      0.83      0.82     15000
weighted avg       0.83      0.82      0.82     15000

