## Setup

Install the TensorFlow libraries needed for deep learning

In [None]:
# tensorflow-text is pinned to the 2.8 series; -U upgrades any preinstalled copy, -q keeps pip quiet.
!pip install -q -U "tensorflow-text==2.8.*"
# tf-models-official is pinned to 2.7.0; it supplies the `official.nlp.optimization` module imported below.
!pip install -q tf-models-official==2.7.0

[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
[K     |████████████████████████████████| 1.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 596 kB 27.1 MB/s 
[K     |████████████████████████████████| 99 kB 8.8 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 49.5 MB/s 
[K     |████████████████████████████████| 352 kB 53.0 MB/s 
[K     |████████████████████████████████| 237 kB 51.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 51.3 MB/s 
[K     |████████████████████████████████| 92 kB 11.0 MB/s 
[K     |████████████████████████████████| 48.3 MB 114 kB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


Import TensorFlow and the other required libraries

In [None]:
import pandas as pd
import numpy as np
# `plt` must be the pyplot interface: the bare `matplotlib` package does not
# expose plot()/figure()/show(), so `import matplotlib as plt` breaks any plotting call.
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
# Not referenced by name in this notebook; presumably imported so that the custom
# ops required by the TF Hub preprocessing model are registered -- confirm before removing.
import tensorflow_text as text
from official.nlp import optimization # to create AdamW optimizer

# Only let TensorFlow log messages at ERROR level through.
tf.get_logger().setLevel('ERROR')

Check for Colab's GPU

In [None]:
# Ask TensorFlow which GPU it can see; an empty/other name means no usable GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Stop here: the training cells pin their work to '/device:GPU:0'.
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


Check connected GPU type

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Jun  9 18:48:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    33W / 250W |    375MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Mount storage from Google Drive

In [None]:
from google.colab import drive

# Mount at an absolute path. Every later cell reads and writes under
# '/content/p2/...', so the mount point must not depend on the current
# working directory happening to be '/content'.
drive.mount('/content/p2')

Mounted at p2


## Dataset

Load dataset

In [None]:
# Read the review dump from the mounted Drive folder and preview the first rows.
reviews_csv_path = '/content/p2/MyDrive/p2/data/reviews_500k_imba.csv'
df = pd.read_csv(reviews_csv_path)
df.head()

Unnamed: 0,business_id,stars,text,categories
0,3uC7Lbc3RgUDTWQlBu4PqQ,5.0,Three words: Damn good pastries.\n\nA few mor...,"['Desserts', 'Food', 'French', 'Sandwiches', '..."
1,c-NXKTJ0jrrusTPxJAUwvA,1.0,Easily one of the worst Red Robin locations. T...,"['American (Traditional)', 'Restaurants', 'Bur..."
2,j3csEfGzkwnXATdRoZDT-A,2.0,Maybe I am just spoiled with good Mexican food...,"['Mexican', 'Restaurants']"
3,Q0EZmATxDphzRMszNV2LVg,5.0,This Wildflower is always kept clean and the e...,"['Food', 'American (New)', 'Restaurants', 'Bre..."
4,25c15dEPrBrWr4tR1r6sTg,5.0,Favorite bibimbap in the valley! They also hav...,"['Korean', 'Japanese', 'Restaurants']"


Inspect distribution of star labels

In [None]:
# Number of reviews per star rating, most frequent first.
df['stars'].value_counts()

5.0    197007
4.0    130723
3.0     66808
1.0     59025
2.0     46437
Name: stars, dtype: int64

## Preprocess

Remove unused columns

In [None]:
# Keep only the review text and its star rating.
# .copy() makes the result an independent frame rather than a slice of the
# original, so later in-place label edits neither raise SettingWithCopyWarning
# nor depend on whether pandas returned a view.
df = df[['text', 'stars']].copy()
df

Unnamed: 0,text,stars
0,Three words: Damn good pastries.\n\nA few mor...,5.0
1,Easily one of the worst Red Robin locations. T...,1.0
2,Maybe I am just spoiled with good Mexican food...,2.0
3,This Wildflower is always kept clean and the e...,5.0
4,Favorite bibimbap in the valley! They also hav...,5.0
...,...,...
499995,"New Nak Won is amazing!\n\nFirst off, super aw...",5.0
499996,I came here for lunch last Sunday. We ordered...,3.0
499997,We just tried Rkidds for the first time tonigh...,4.0
499998,"Yesterday I was served Kobe hot dogs, chipotle...",5.0


Subtract 1 so that the star labels range from 0 to 4, in preparation for one-hot encoding

In [None]:
# Shift the ratings from 1-5 down to 0-4 so they can be used as class indices.
# NOTE: this cell is not idempotent -- running it again shifts the labels once more.
df.loc[:, 'stars'] = df['stars'] - 1
df

Unnamed: 0,text,stars
0,Three words: Damn good pastries.\n\nA few mor...,4.0
1,Easily one of the worst Red Robin locations. T...,0.0
2,Maybe I am just spoiled with good Mexican food...,1.0
3,This Wildflower is always kept clean and the e...,4.0
4,Favorite bibimbap in the valley! They also hav...,4.0
...,...,...
499995,"New Nak Won is amazing!\n\nFirst off, super aw...",4.0
499996,I came here for lunch last Sunday. We ordered...,2.0
499997,We just tried Rkidds for the first time tonigh...,3.0
499998,"Yesterday I was served Kobe hot dogs, chipotle...",4.0


One-hot encoding of star labels

In [None]:
# Turn the 0-4 class indices into 5-wide one-hot rows.
star_labels = df["stars"].to_numpy()
y = tf.keras.utils.to_categorical(star_labels, num_classes=5)
y

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

## Modeling

Split the dataset, stratified by label, into train, validation and test sets in a 6:2:2 ratio

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all reviews as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, stratify=y, random_state=42)

# Second cut: 25% of the remaining 80% (i.e. 20% overall) becomes the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, stratify=y_train, random_state=42)

# Report the size of each split.
for split in (x_train, x_val, x_test):
    print(split.shape)

(300000,)
(100000,)
(100000,)


Inspect distribution of star labels in train set

In [None]:
# Column sums of the one-hot matrix = number of training reviews per class.
np.sum(y_train, axis=0)

array([ 35415.,  27863.,  40084.,  78434., 118204.], dtype=float32)

Choose BERT model to train

In [None]:
# Talking Heads chosen
bert_model_name = 'talking-heads_base'

# TF Hub handle of the encoder, keyed by model name.
map_name_to_handle = {'talking-heads_base': 'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1'}

# TF Hub handle of the matching text-preprocessing model, keyed by model name.
map_model_to_preprocess = {'talking-heads_base': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'}

# Resolve both handles for the chosen model.
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
tfhub_handle_encoder = map_name_to_handle[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


Load the selected BERT model and its preprocessing model

In [None]:
# Stand-alone Keras layers wrapping the two TF Hub models selected above.
# NOTE(review): build_classifier_model() creates its own layers from the same
# handles, and these two objects are not referenced again in the visible cells.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

Build the BERT Model pipeline

In [None]:
def build_classifier_model(num_classes=5, dropout_rate=0.1):
    """Build the end-to-end classifier: raw review strings in, class probabilities out.

    The model chains the TF Hub preprocessing layer, the trainable BERT encoder,
    a dropout layer and a softmax classification head on the pooled output.

    Args:
        num_classes: Width of the softmax output layer. Defaults to 5, one
            unit per star rating (labels 0-4).
        dropout_rate: Dropout rate applied to the pooled encoder output before
            the classification head. Defaults to 0.1.

    Returns:
        An uncompiled `tf.keras.Model` mapping a batch of strings to a
        `(batch, num_classes)` tensor of probabilities.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name="preprocessing")
    encoder_inputs = preprocessing_layer(text_input)

    # trainable=True: the encoder weights are fine-tuned, not frozen.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)

    return tf.keras.Model(text_input, net)


classifier_model = build_classifier_model()

Compile the Model with Loss, Metrics and Optimizer functions

In [None]:
# NOTE(review): the training cells below run 2 + 2 + 1 + 1 = 6 epochs in total;
# confirm that 8 is the intended length of the learning-rate schedule.
epochs = 8
batch_size = 32  # must match the batch_size passed to classifier_model.fit

# The optimizer takes one step per *batch*, so the number of steps in an epoch
# is the number of batches (rounded up for the last partial batch), not the
# number of training samples. Using the sample count would stretch the schedule
# by a factor of batch_size and keep the whole run inside the warm-up phase.
steps_per_epoch = (x_train.shape[0] + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of the training steps.
num_warmup_steps = int(0.1*num_train_steps)

# Labels are one-hot encoded, hence the categorical (not sparse) loss and accuracy.
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
optimizer = optimization.create_optimizer(init_lr=3e-05,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer,
                        loss=loss,
                        metrics=metrics)

Define callbacks for saving the Model state and output

In [None]:
# All artifacts of this run go into one Drive folder named after the model.
output_dir = '/content/p2/MyDrive/p2/' + bert_model_name
checkpoint_filepath = output_dir + '/checkpoint'
history_filepath = output_dir + '/history.csv'

# Create the folder up front (no-op if it already exists) so the CSV log can be
# opened at the start of training even on a fresh Drive.
tf.io.gfile.makedirs(output_dir)

# Keep only the weights of the epoch with the highest validation accuracy.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Write the per-epoch metrics to history.csv.
model_history_callback = tf.keras.callbacks.CSVLogger(history_filepath)

Train the Model

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

with tf.device('/device:GPU:0'):
  # Fresh CSV logger (no append): this first run starts history.csv from scratch.
  csv_logger = tf.keras.callbacks.CSVLogger(history_filepath)
  history = classifier_model.fit(
      x=x_train,
      y=y_train,
      batch_size=32,
      epochs=2,
      validation_data=(x_val, y_val),
      callbacks=[model_checkpoint_callback, csv_logger])

Training model with https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1
Epoch 1/2
Epoch 2/2


(Cont.) Train the Model

In [None]:
with tf.device('/device:GPU:0'):
  # Resume from the best weights saved by the checkpoint callback so far.
  classifier_model.load_weights(checkpoint_filepath)

  # Two more epochs. initial_epoch=2 with epochs=4 (the index of the final
  # epoch) continues the epoch numbering after the first run instead of
  # restarting at 0, so the rows appended to history.csv keep unique indices.
  history = classifier_model.fit(x_train,
                                y_train,
                                validation_data=(x_val, y_val),
                                initial_epoch=2,
                                epochs=4,
                                batch_size=32,
                                callbacks=[model_checkpoint_callback,
                                           tf.keras.callbacks.CSVLogger(history_filepath, append=True)])

Epoch 1/2
Epoch 2/2


In [None]:
with tf.device('/device:GPU:0'):
  # Resume from the best weights saved by the checkpoint callback so far.
  classifier_model.load_weights(checkpoint_filepath)

  # One more epoch (the fifth overall). initial_epoch=4 with epochs=5 keeps the
  # epoch index in the appended history.csv rows continuous instead of
  # restarting at 0.
  history = classifier_model.fit(x_train,
                                y_train,
                                validation_data=(x_val, y_val),
                                initial_epoch=4,
                                epochs=5,
                                batch_size=32,
                                callbacks=[model_checkpoint_callback,
                                           tf.keras.callbacks.CSVLogger(history_filepath, append=True)])



In [None]:
with tf.device('/device:GPU:0'):
  # Resume from the best weights saved by the checkpoint callback so far.
  classifier_model.load_weights(checkpoint_filepath)

  # One more epoch (the sixth overall). initial_epoch=5 with epochs=6 keeps the
  # epoch index in the appended history.csv rows continuous instead of
  # restarting at 0.
  history = classifier_model.fit(x_train,
                                y_train,
                                validation_data=(x_val, y_val),
                                initial_epoch=5,
                                epochs=6,
                                batch_size=32,
                                callbacks=[model_checkpoint_callback,
                                           tf.keras.callbacks.CSVLogger(history_filepath, append=True)])



## Evaluate

Evaluate the trained model on the validation set, then on the test set

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate with the best checkpoint rather than the weights of the last epoch.
classifier_model.load_weights(checkpoint_filepath)

# Collapse one-hot targets and predicted probabilities to class indices.
y_val_true = np.argmax(y_val, axis=1)
y_pred = np.argmax(classifier_model.predict(x_val, verbose=1), axis=1)

print(classification_report(y_val_true, y_pred, digits=4))
confusion_matrix(y_val_true, y_pred)

              precision    recall  f1-score   support

           0     0.7891    0.7458    0.7668     11805
           1     0.5332    0.4735    0.5016      9287
           2     0.5710    0.5849    0.5779     13362
           3     0.6043    0.5905    0.5973     26144
           4     0.7927    0.8322    0.8120     39402

    accuracy                         0.6924    100000
   macro avg     0.6581    0.6454    0.6511    100000
weighted avg     0.6893    0.6924    0.6904    100000



array([[ 8804,  2244,   536,   134,    87],
       [ 1844,  4397,  2630,   353,    63],
       [  352,  1405,  7816,  3410,   379],
       [   84,   153,  2424, 15437,  8046],
       [   73,    47,   282,  6211, 32789]])

In [None]:
# Same report as above, this time on the held-out test set.
y_test_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(classifier_model.predict(x_test, verbose=1), axis=1)

print(classification_report(y_test_true, y_pred, digits=4))
confusion_matrix(y_test_true, y_pred)

              precision    recall  f1-score   support

           0     0.7910    0.7532    0.7716     11805
           1     0.5272    0.4627    0.4928      9287
           2     0.5687    0.5840    0.5763     13362
           3     0.6062    0.5932    0.5996     26145
           4     0.7934    0.8316    0.8120     39401

    accuracy                         0.6927    100000
   macro avg     0.6573    0.6449    0.6505    100000
weighted avg     0.6894    0.6927    0.6906    100000



array([[ 8892,  2202,   487,   133,    91],
       [ 1809,  4297,  2769,   349,    63],
       [  373,  1436,  7804,  3346,   403],
       [   85,   165,  2410, 15509,  7976],
       [   83,    51,   253,  6249, 32765]])