# Setup

Import required libraries including Tensorflow

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.regularizers import L1L2
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import random
import os
from numpy.random import seed

random.seed(42)
os.environ['PYTHONASHSEED'] = '42' 
seed(42)
tf.random.set_seed(42)

Check for Colab's GPU

In [2]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


Check connected GPU type

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jun  7 14:14:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    32W / 250W |    375MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Mount storage from Google Drive

In [4]:
from google.colab import drive
drive.mount('p2')

Mounted at p2


# Dataset

In [5]:
df = pd.read_csv('/content/p2/MyDrive/p2/data/preprocessed_500k_imba.csv', dtype={'text': 'str', 'processed_text': 'str', 'stars': float})
df.fillna('', inplace=True)
df.head()

Unnamed: 0,text,stars,processed_text
0,Three words: Damn good pastries.\n\nA few mor...,4.0,three word damn good pastry word probably best...
1,Easily one of the worst Red Robin locations. T...,0.0,easily one worst red robin location food delic...
2,Maybe I am just spoiled with good Mexican food...,1.0,maybe spoiled good mexican food growing san di...
3,This Wildflower is always kept clean and the e...,4.0,wildflower always kept clean employee nice pot...
4,Favorite bibimbap in the valley! They also hav...,4.0,favorite bibimbap valley also korean fixing sm...


# Preprocessing

One-hot encoding of star labels

In [6]:
y = tf.keras.utils.to_categorical(df["stars"].values, num_classes=5)
y

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

Split dataset in stratified manner into train, validation and test set with proportion of 6:2:2

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df['processed_text'], 
    y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train, 
    y_train, 
    test_size=0.25, 
    stratify=y_train, 
    random_state=42
)

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(300000,)
(100000,)
(100000,)


Check the processed text and class

In [8]:
for i in range(5):
    print(x_train.iloc[i])
    print(y_train[i])
    
for i in range(5):
    print(x_val.iloc[i])
    print(y_val[i])

basically everything menu !had one thing !enjoy crab puppy best twist original hush puppy good vibe way around
[0. 0. 0. 0. 1.]
awesome middle eastern take hot sauce incredible love sandwich rock dish
[0. 0. 0. 0. 1.]
!rude completed order 10 minute lady drive arguing customer store literally yelled another mic !hear proceeded talk bad customer front 8 customer store !be back crew seems toxic wonder would act gm food beverage would expect far concerned update got new staff management gotten much better seems manager working hard customer service night day different glad see people striving better
[0. 0. 0. 1. 0.]
favorite dish tom kha shrimp soup order rice go along soup large enough serving meal pineapple fried rice chock full goody raisin cashew pineapple prefer shrimp meat addition tried chicken satay thai restaurant hand favorite archi would offer curry puff would !need go anywhere else best thai tea lunch special provide enough food although would like able choose soup come soup d

Tokenize the text with max vocabulary of 100,000

In [9]:
MAX_VOCAB_SIZE = 100000

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(np.concatenate((x_train, x_val, x_test), axis=0))

x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

for i in range(3):
  print(x_train[i])

[759, 75, 21, 532, 9, 60, 237, 346, 2959, 23, 1821, 881, 3296, 2959, 3, 621, 67, 97]
[133, 743, 2130, 83, 108, 38, 691, 31, 81, 853, 44]
[502, 4670, 15, 215, 73, 497, 450, 4805, 120, 474, 585, 3673, 123, 7307, 856, 1935, 764, 99, 120, 325, 602, 120, 474, 196, 11, 2292, 431, 13885, 1421, 7, 1998, 3925, 1, 1249, 7, 374, 257, 2170, 1225, 20, 106, 42, 767, 991, 47, 59, 431, 220, 529, 252, 120, 6, 65, 80, 158, 524, 129, 54, 15787, 59]


Show the top 20 most frequent tokens

In [10]:
i = 0

for word in tokenizer.word_index:
    print(f"{word} : {tokenizer.word_index[word]}")
    i += 1
    if i >= 20:
        break

food : 1
place : 2
good : 3
great : 4
time : 5
service : 6
would : 7
like : 8
one : 9
get : 10
back : 11
go : 12
really : 13
restaurant : 14
order : 15
ordered : 16
u : 17
also : 18
chicken : 19
got : 20


The actual token size in the corpus

In [11]:
vocab_size

143134

Pad the tokens to fixed length of 100

In [12]:
max_length = 100

x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=max_length)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, padding='post', maxlen=max_length)
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, padding='post', maxlen=max_length)

x_train[:5, :]

array([[  759,    75,    21,   532,     9,    60,   237,   346,  2959,
           23,  1821,   881,  3296,  2959,     3,   621,    67,    97,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [  133,   743,  2130,    83,   108,    38,   691,    31,    81,
          853,    44,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

#### Word2Vec Embeddings

Populate initial word embedding weights based on pre-trained **Word2Vec Google News**

In [13]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors

In [14]:
word_vectors = KeyedVectors.load_word2vec_format('/content/p2/MyDrive/p2/data/GoogleNews-vectors-negative300.bin.gz', binary=True)

Check embeddings for example word

In [None]:
word_vectors["food"]

array([-0.18164062,  0.16503906, -0.16601562,  0.35742188, -0.09228516,
        0.20117188, -0.0546875 , -0.26171875, -0.17285156, -0.08056641,
        0.14648438, -0.24609375,  0.18652344,  0.10253906, -0.3203125 ,
        0.16699219, -0.0032196 , -0.06640625,  0.06591797, -0.109375  ,
        0.13964844, -0.05029297,  0.25390625,  0.0859375 ,  0.02026367,
        0.05517578, -0.08447266,  0.07324219,  0.15429688, -0.13867188,
       -0.25195312, -0.15136719,  0.07958984,  0.00848389, -0.24902344,
        0.05224609,  0.04394531, -0.19726562, -0.2109375 ,  0.01477051,
       -0.23632812, -0.14355469,  0.17773438,  0.26757812, -0.08789062,
       -0.07910156, -0.16113281,  0.23632812, -0.07177734,  0.08837891,
        0.07177734, -0.11962891, -0.09228516, -0.12060547, -0.00448608,
       -0.21875   , -0.05712891, -0.04418945,  0.07226562, -0.05883789,
       -0.12597656,  0.03125   , -0.24609375,  0.19140625,  0.14941406,
       -0.19335938, -0.1875    , -0.05126953,  0.03369141, -0.21

In [15]:
embedding_dim = 100

def create_embedding_matrix(word_vectors, word_index, embedding_dim):
    embedding_matrix = np.zeros((MAX_VOCAB_SIZE, embedding_dim))

    for word in word_vectors.wv.vocab:
      if word.lower() in word_index and (word_index[word.lower()] < MAX_VOCAB_SIZE):
        idx = word_index[word.lower()]
        embedding_matrix[idx] = np.array(
            word_vectors[word], dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_matrix = create_embedding_matrix(
    word_vectors,
    tokenizer.word_index,
    embedding_dim
)

print(embedding_matrix.shape)
embedding_matrix[:5]

  


(100000, 100)


array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0

Calculate the percentage of vocabs found in Word2Vec and given initial weights

In [16]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / MAX_VOCAB_SIZE

0.64722

# Modeling

### With Word2Vec initial embeddings

Build CNN-LSTM model

In [22]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=True,
        mask_zero=True
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(filters=100, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.LSTM(
        100, 
        recurrent_dropout=0.2,
        return_sequences=True
    ),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(100, activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.Dense(5, activation='softmax')
])



Show summary of model

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          10000000  
                                                                 
 dropout (Dropout)           (None, 100, 100)          0         
                                                                 
 conv1d (Conv1D)             (None, 100, 100)          30100     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 100)          0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 50, 100)           80400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                        

Compile model with loss function and metrics

In [23]:
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(loss=loss,
              optimizer=tf.keras.optimizers.Adam(0.00083),
              metrics=metrics)

Train model

In [24]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=2)
checkpoint_filepath = '/content/p2/MyDrive/p2/cnn-lstm_word2vec/checkpoint'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

with tf.device('/device:GPU:0'):
  history = model.fit(
      x_train,
      y_train,
      epochs=8,
      validation_data=(x_val, y_val),
      callbacks=[es, model_checkpoint_callback]
  )

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 5: early stopping


Evaluate model with val set

In [25]:
model.load_weights(checkpoint_filepath)

y_pred = model.predict(x_val, verbose=1)
y_pred = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_val, axis=1), y_pred, digits=4))
confusion_matrix(np.argmax(y_val, axis=1), y_pred)

              precision    recall  f1-score   support

           0     0.7257    0.7646    0.7446     11805
           1     0.5019    0.4124    0.4528      9287
           2     0.5086    0.5161    0.5123     13362
           3     0.5479    0.5982    0.5719     26144
           4     0.7922    0.7604    0.7760     39402

    accuracy                         0.6535    100000
   macro avg     0.6152    0.6104    0.6115    100000
weighted avg     0.6556    0.6535    0.6537    100000



array([[ 9026,  1835,   595,   180,   169],
       [ 2434,  3830,  2502,   413,   108],
       [  610,  1709,  6896,  3682,   465],
       [  158,   206,  3021, 15640,  7119],
       [  210,    51,   546,  8632, 29963]])

In [26]:
y_pred = model.predict(x_test, verbose=1)
y_pred = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_test, axis=1), y_pred, digits=4))
confusion_matrix(np.argmax(y_test, axis=1), y_pred)

              precision    recall  f1-score   support

           0     0.7268    0.7708    0.7481     11805
           1     0.4882    0.3926    0.4352      9287
           2     0.5070    0.5153    0.5112     13362
           3     0.5421    0.5982    0.5688     26145
           4     0.7902    0.7537    0.7716     39401

    accuracy                         0.6497    100000
   macro avg     0.6109    0.6061    0.6070    100000
weighted avg     0.6520    0.6497    0.6498    100000



array([[ 9099,  1846,   552,   149,   159],
       [ 2434,  3646,  2627,   458,   122],
       [  609,  1716,  6886,  3667,   484],
       [  177,   216,  2993, 15641,  7118],
       [  200,    45,   523,  8935, 29698]])

### Without initial weights

In [27]:
model_2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=embedding_dim,
        input_length=max_length,
        trainable=True,
        mask_zero=True
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(filters=100, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.LSTM(
        100, 
        recurrent_dropout=0.2,
        return_sequences=True
    ),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(100, activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.Dense(5, activation='softmax')
])



In [28]:
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model_2.compile(loss=loss,
              optimizer=tf.keras.optimizers.Adam(0.00083),
              metrics=metrics)

In [29]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=2)
checkpoint_filepath = '/content/p2/MyDrive/p2/cnn-lstm_no-embeddings/checkpoint'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

with tf.device('/device:GPU:0'):
  history = model_2.fit(
      x_train,
      y_train,
      epochs=8,
      validation_data=(x_val, y_val),
      callbacks=[es, model_checkpoint_callback]
  )

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 5: early stopping


Evaluate using val & test set

In [30]:
model_2.load_weights(checkpoint_filepath)

y_pred = model_2.predict(x_val, verbose=1)
y_pred = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_val, axis=1), y_pred, digits=4))
confusion_matrix(np.argmax(y_val, axis=1), y_pred)

              precision    recall  f1-score   support

           0     0.6983    0.7970    0.7444     11805
           1     0.4996    0.3788    0.4309      9287
           2     0.5080    0.5035    0.5057     13362
           3     0.5515    0.5810    0.5659     26144
           4     0.7832    0.7693    0.7762     39402

    accuracy                         0.6515    100000
   macro avg     0.6081    0.6059    0.6046    100000
weighted avg     0.6495    0.6515    0.6492    100000



array([[ 9409,  1521,   535,   149,   191],
       [ 2829,  3518,  2398,   409,   133],
       [  786,  1718,  6728,  3585,   545],
       [  204,   221,  3010, 15190,  7519],
       [  246,    64,   574,  8208, 30310]])

In [31]:
y_pred = model_2.predict(x_test, verbose=1)
y_pred = np.argmax(y_pred, axis=1)

print(classification_report(np.argmax(y_test, axis=1), y_pred, digits=4))
confusion_matrix(np.argmax(y_test, axis=1), y_pred)

              precision    recall  f1-score   support

           0     0.7007    0.8041    0.7488     11805
           1     0.4870    0.3635    0.4163      9287
           2     0.5018    0.4990    0.5004     13362
           3     0.5484    0.5794    0.5635     26145
           4     0.7830    0.7674    0.7751     39401

    accuracy                         0.6492    100000
   macro avg     0.6042    0.6027    0.6008    100000
weighted avg     0.6469    0.6492    0.6466    100000



array([[ 9492,  1499,   525,   118,   171],
       [ 2817,  3376,  2532,   423,   139],
       [  779,  1757,  6667,  3614,   545],
       [  235,   245,  2991, 15148,  7526],
       [  223,    55,   571,  8317, 30235]])