In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project files on the shared drive can be read with ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import nltk, re
import seaborn as sns
import string
import os, gc
import random
import keras

# Register tqdm's pandas integration.
tqdm.pandas()
# Shared-drive folder that every dataset path below is built from.
base_dir = "/content/drive/Shareddrives/602Project/Assigns"
# Show the folder contents as the cell output.
os.listdir(base_dir)

['dataset.zip',
 'imdb_movies_raw.csv.gz',
 'imdb_reviews_raw.csv.gz',
 'word2vec.gensim',
 'lda.gensim.state',
 'lda.gensim',
 'lda.gensim.expElogbeta.npy',
 'lda.gensim.id2word',
 'imdb_movies_train.csv.gz',
 'imdb_reviews_train.csv.gz',
 'imdb_movies_val.csv.gz',
 'imdb_movies_test.csv.gz',
 'imdb_reviews_val.csv.gz',
 'imdb_reviews_test.csv.gz']

In [None]:
def _load_reviews(split):
    """Read the gzipped reviews CSV for one split ('train', 'val' or 'test')."""
    return pd.read_csv(f"{base_dir}/imdb_reviews_{split}.csv.gz", compression="gzip")


# Load the three pre-made splits in a fixed order: train, validation, test.
rev_train, rev_val, rev_test = (_load_reviews(split) for split in ("train", "val", "test"))

# Quick sanity check on the sizes and the columns that were read.
print("Reviews data:")
print(rev_train.shape, rev_train.columns)
print(rev_val.shape, rev_test.shape)

Reviews data:
(507092, 11) Index(['username', 'rating', 'helpful', 'total', 'date', 'title', 'review',
       'year', 'name', 'emojis', 'review_words'],
      dtype='object')
(142432, 11) (142018, 11)


This notebook explores neural network and deep learning models (CNN, GRU, and LSTM) built on pretrained GloVe word embeddings.

In [None]:
def get_target_var(x):
    """Map a numeric rating to a coarse sentiment label.

    Parameters
    ----------
    x : int or float
        Review rating.

    Returns
    -------
    str, or the input value
        "Negative" for ratings below 5, "Neutral" for ratings from 5 up to
        (but not including) 8, and "Positive" for ratings of 8 or more.
        A value for which every comparison is False (e.g. NaN) is returned
        unchanged.
    """
    if x < 5:
        return "Negative"

    if x < 8:
        # The bound is `< 8` rather than `<= 7` so that a fractional rating
        # such as 7.5 still receives a label instead of falling through and
        # being returned as a raw number.
        return "Neutral"

    if x >= 8:
        return "Positive"

    # Only reached when no comparison holds (e.g. NaN): pass the value through.
    return x


def reformat_data(rev_df):
    """Clean one reviews frame and attach the sentiment label.

    Rows whose 'review_words' is missing are dropped and the index is rebuilt.
    The pipe-delimited 'review_words' text is then turned into a
    space-separated string, and a 'sentiment' column is derived from 'rating'
    with `get_target_var`. A new frame is returned.
    """
    return (
        rev_df
        .dropna(subset=['review_words'])
        .reset_index(drop=True)
        .assign(
            # Split on the literal '|' and glue the pieces back with spaces.
            review_words=lambda df: df['review_words'].str.split(r"\|").str.join(" "),
            sentiment=lambda df: df['rating'].apply(get_target_var),
        )
    )


# Run the same clean-up on every split, rebinding the three frame names.
rev_train, rev_val, rev_test = map(reformat_data, (rev_train, rev_val, rev_test))

# Cell output: the shape of each cleaned split.
tuple(split.shape for split in (rev_train, rev_val, rev_test))

((507092, 12), (142432, 12), (142018, 12))

In [None]:
# Integer class id for each sentiment label produced by get_target_var.
label_map = {'Negative': 0,
  'Neutral': 1,
  'Positive': 2}


def _encode_sentiment(rev_df):
    """Return the 'sentiment' column of `rev_df` as an int64 array of class ids.

    `Series.map` is used instead of `Series.replace`: it performs a plain
    dictionary lookup and does not depend on pandas downcasting an object
    column to integers after the replacement.

    Raises
    ------
    ValueError
        If any sentiment value is not a key of `label_map`, so that bad
        labels fail here instead of reaching the model as NaN or raw values.
    """
    encoded = rev_df['sentiment'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = rev_df.loc[unmapped, 'sentiment'].unique().tolist()
        raise ValueError(f"Sentiment values missing from label_map: {unknown}")
    return encoded.to_numpy(dtype="int64")


y_train = _encode_sentiment(rev_train)
y_val = _encode_sentiment(rev_val)
y_test = _encode_sentiment(rev_test)

Reference: tf.keras.preprocessing.text.Tokenizer: TensorFlow Core v2.8.0. TensorFlow. (n.d.). Retrieved May 11, 2022, from https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer 

In [None]:
from keras.preprocessing.text import Tokenizer
# Upper bound on the tokenizer vocabulary (11**5 = 161051).
MAX_WORDS = 11**5
tok = Tokenizer(num_words=MAX_WORDS)
# The vocabulary is fitted on the training split only, using the raw 'review' column.
tok.fit_on_texts(rev_train['review'])
# One more than the word cap; used further down as the number of rows of the
# embedding matrix and as the Embedding layers' input dimension.
# NOTE(review): the printed label says "Unique Words", but this value is derived
# from the configured cap, not from the number of distinct words in the corpus.
vocab_size = tok.num_words + 1  
print("Number of Unique Words (vocab):", vocab_size)

Number of Unique Words (vocab): 161052


In [None]:
from keras.preprocessing.sequence import pad_sequences

def preprocess_text(t, max_len=512):
    """Convert texts to fixed-length integer sequences.

    `t` is passed to the module-level tokenizer `tok`; the resulting
    sequences are handed to `pad_sequences` with padding='pre' and
    maxlen=`max_len`, and that result is returned.
    """
    token_ids = tok.texts_to_sequences(t)
    padded = pad_sequences(token_ids, maxlen=max_len, padding='pre')
    return padded

# Sequence length shared by every split and by the Embedding layers.
MAXLEN = 512

# Encode the raw 'review' text of each split with the fitted tokenizer.
X_train, X_val, X_test = (
    preprocess_text(split['review'], MAXLEN)
    for split in (rev_train, rev_val, rev_test)
)

# Cell output: the shape of each encoded matrix.
tuple(matrix.shape for matrix in (X_train, X_val, X_test))

((507092, 512), (142432, 512), (142018, 512))

Reference: Pennington, J. (n.d.). Glove: Global vectors for word representation. Retrieved May 11, 2022, from https://nlp.stanford.edu/projects/glove/ 

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip
!ls

--2022-05-08 21:57:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-05-08 21:57:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-08 21:57:26--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

Reference: Team, K. (n.d.). Keras documentation: Using pre-trained word embeddings. Keras. Retrieved May 11, 2022, from https://keras.io/examples/nlp/pretrained_word_embeddings/ 

In [None]:

# Dimensionality of the pretrained vectors. The file name is derived from it
# so the two cannot drift apart.
# NOTE(review): assumes the extracted archive contains a file for this dimension.
embed_dim = 50
glove_fp = f"glove.6B.{embed_dim}d.txt"

# word -> vector lookup parsed from the GloVe text file. Each line is split
# into the word and the remaining whitespace-separated coefficients.
embed_index = {}
# Explicit encoding: without it, open() uses the platform default, which can
# fail on the non-ASCII tokens in the file when the locale is not UTF-8.
with open(glove_fp, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embed_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f"{len(embed_index)} word vectors found.")


# Row i holds the pretrained vector of the word with tokenizer index i.
# Rows stay all-zero for words that have no pretrained vector.
embed_weights = np.zeros((vocab_size, embed_dim))
for word, i in tok.word_index.items():
    # Stop at the first index beyond the vocabulary cap, before doing the lookup.
    # NOTE(review): `break` presumes word_index is iterated in increasing
    # index order; confirm for the tokenizer version in use.
    if i > MAX_WORDS:
        break
    embed_vector = embed_index.get(word)
    if embed_vector is not None:
        embed_weights[i] = embed_vector


print("Embedding matrix with pretrained vectors ready:", embed_weights.shape)
# Initializer shared by the Embedding layer of every model built below.
embed_init = keras.initializers.Constant(embed_weights)

400000 word vectors found.
Embedding matrix with pretrained vectors ready: (161052, 50)


Reference: Team, K. (n.d.). Keras Documentation: The sequential model. Keras. Retrieved May 11, 2022, from https://keras.io/guides/sequential_model/ 

In [None]:

from keras.models import Sequential
from keras import layers

def build_cnn_model():
    """Build and compile the 1-D convolutional text classifier.

    Layer stack: Embedding (initialised from the module-level `embed_init`)
    -> Conv1D(64 filters, kernel size 3, ReLU) -> GlobalMaxPooling1D
    -> Dense(16, ReLU) -> Dense(3, softmax).

    Returns the compiled `Sequential` model (rmsprop optimizer, sparse
    categorical cross-entropy loss, accuracy metric).
    """
    embedding = layers.Embedding(vocab_size, embed_dim, input_length=MAXLEN,
                                 embeddings_initializer=embed_init)
    text_cnn = Sequential([
        embedding,
        layers.Conv1D(64, kernel_size=3, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(16, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])
    text_cnn.compile(loss='sparse_categorical_crossentropy',
                     optimizer='rmsprop',
                     metrics=['accuracy'])
    return text_cnn

def build_gru_model():
    """Build and compile the GRU text classifier.

    Layer stack: Embedding (initialised from the module-level `embed_init`)
    -> GRU(64) -> Dropout(0.2) -> Dense(16, ReLU) -> Dense(3, softmax).

    Returns the compiled `Sequential` model (rmsprop optimizer, sparse
    categorical cross-entropy loss, accuracy metric).
    """
    embedding = layers.Embedding(vocab_size, embed_dim, input_length=MAXLEN,
                                 embeddings_initializer=embed_init)
    gru = Sequential([
        embedding,
        layers.GRU(64),
        layers.Dropout(0.2),
        layers.Dense(16, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])
    gru.compile(loss='sparse_categorical_crossentropy',
                optimizer='rmsprop',
                metrics=['accuracy'])
    return gru


def build_lstm_model():
    """Build and compile the LSTM text classifier.

    Layer stack: Embedding (initialised from the module-level `embed_init`)
    -> LSTM(64) -> Dropout(0.4) -> Dense(16, ReLU) -> Dense(3, softmax).

    Returns the compiled `Sequential` model (rmsprop optimizer, sparse
    categorical cross-entropy loss, accuracy metric).
    """
    embedding = layers.Embedding(vocab_size, embed_dim, input_length=MAXLEN,
                                 embeddings_initializer=embed_init)
    lstm = Sequential([
        embedding,
        layers.LSTM(64),
        layers.Dropout(0.4),
        layers.Dense(16, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])
    lstm.compile(loss='sparse_categorical_crossentropy',
                 optimizer='rmsprop',
                 metrics=['accuracy'])
    return lstm

In [None]:
# Display name -> builder, in the order the models are constructed and trained.
model_builders = (
    ('CNN_Model', build_cnn_model),
    ('GRU_Model', build_gru_model),
    ('LSTM_Model', build_lstm_model),
)

# One freshly compiled model per architecture, keyed by its display name.
all_clf = {name: build() for name, build in model_builders}

In [None]:
# Fit history and test accuracy per model, keyed by the names in all_clf.
model_hist = {}
test_acc = {}
for key, model in all_clf.items():
    print("-"*100)
    print(f"Training {key}...")
    # The models are compiled with metrics=['accuracy'], so the validation
    # metric is logged as 'val_accuracy' (the key the results cell reads from
    # the history). Monitoring 'val_acc' matches no logged metric, which leaves
    # early stopping and best-only checkpointing inactive.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode="max",
                                               patience=3,
                                               # When stopping fires, go back to the best
                                               # epoch instead of keeping the last one.
                                               restore_best_weights=True)
    # Keep on disk only the epoch with the highest validation accuracy.
    mcp = keras.callbacks.ModelCheckpoint(f"{key}.h5", monitor='val_accuracy', mode="max",
                                          save_best_only=True)
    hist = model.fit(X_train, y_train, epochs=10, batch_size=64,
                     validation_data=(X_val, y_val), callbacks=[early_stop, mcp])
    # evaluate() returns the loss followed by the compiled metrics (accuracy here).
    tloss, tacc = model.evaluate(X_test, y_test)
    model_hist[key] = hist
    test_acc[key] = tacc

----------------------------------------------------------------------------------------------------
Training CNN_Model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
----------------------------------------------------------------------------------------------------
Training GRU_Model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
----------------------------------------------------------------------------------------------------
Training LSTM_Model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Per-model summary: mean validation accuracy over the recorded epochs,
# followed by the accuracy measured on the test split.
for model_name, fit_history in model_hist.items():
    print("-"*50)
    print(model_name)
    val_acc_per_epoch = fit_history.history['val_accuracy']
    avg_val_acc = np.mean(val_acc_per_epoch)
    print("Average Valiation Accuracy:", round(avg_val_acc,3))
    print("Accuracy on Test data:", round(test_acc[model_name],3))

--------------------------------------------------
CNN_Model
Average Valiation Accuracy: 0.789
Accuracy on Test data: 0.775
--------------------------------------------------
GRU_Model
Average Valiation Accuracy: 0.812
Accuracy on Test data: 0.804
--------------------------------------------------
LSTM_Model
Average Valiation Accuracy: 0.811
Accuracy on Test data: 0.812
