**Installing Transformer for BERT Model and mount google drive for dataset**

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.7MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 24.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 64.3MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=3c30cec419d292e40ca

In [4]:
from google.colab import drive
drive.mount('/content/drive')
projectFolder = "./drive/My Drive/Colab Notebooks/AmazonReview"

Mounted at /content/drive


**LOAD DATASET AND PRE-PROCESS**

In [5]:
import pandas as pd
import os
import numpy as np

#Load training set, csv file is converted from train.json
# pandas dataframe is easier to work with than json format in Tensorflow frame work
train = pd.read_csv(os.path.join(projectFolder, "train_converted.csv"))

# combine several input together, category+summary + user ID + item Id + review text string together. Separate by space.
train['reviewText']=train['category'] +' ' + train['summary']+ " " + train['reviewerID'] + ' '+train['itemID'] +" "+ train['reviewText'] 

#remove empty review samples
train['reviewText'].replace('', np.nan, inplace=True)
train.dropna(subset=['reviewText'], inplace=True)
train.head(10)

Unnamed: 0,overall,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,price,itemID,reviewHash
0,4.0,"08 24, 2010",u04428712,Pop Amazing that I Actually Bought This...More...,Amazing that I Actually Bought This...More Ama...,1282608000,Pop,$35.93,p70761125,85559980
1,5.0,"10 31, 2009",u06946603,Alternative Rock Excellent album u06946603 p85...,Excellent album,1256947200,Alternative Rock,$11.28,p85427891,41699565
2,4.0,"10 13, 2015",u92735614,"Pop Love the Music, Hate the Light Show u92735...","Love the Music, Hate the Light Show",1444694400,Pop,$89.86,p82172532,24751194
3,5.0,"06 28, 2017",u35112935,Pop Great u35112935 p15255251 Finally got it ....,Great,1498608000,Pop,$11.89,p15255251,22820631
4,4.0,"10 12, 2015",u07141505,Jazz Love these guys. u07141505 p82618188 Look...,Love these guys.,1444608000,Jazz,$15.24,p82618188,53377470
5,5.0,"09 7, 2015",u07624734,Pop Five Stars u07624734 p78489708 o.k.,Five Stars,1441584000,Pop,$14.99,p78489708,23609516
6,5.0,"06 25, 2016",u64810771,Jazz Bought this album while sitting at a Bone...,Bought this album while sitting at a Boney Jam...,1466812800,Jazz,$8.73,p58524163,11806672
7,5.0,"03 3, 2016",u88679770,Pop Five Stars u88679770 p92272123 great,Five Stars,1456963200,Pop,$16.98,p92272123,76307426
8,5.0,"12 4, 2013",u77782870,Pop GET IT u77782870 p11658191 Kelly sounds gr...,GET IT,1386115200,Pop,$6.98,p11658191,15086745
9,5.0,"12 19, 2013",u96436250,Classical TWO GREAT CONCERTOS u96436250 p80741...,TWO GREAT CONCERTOS,1387411200,Classical,$28.69,p80741971,23701038


**TOKENIZE IF TOKENS NOT ALREADY SAVED, OTHERWISE SKIP THIS STEP AND MOVE TO THE NEXT**

In [None]:
# BERT Max Seq. Length
SEQ_LEN = 128

# Tokenize the input data
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

# Apply Tokenizer
input_ids = []
attention_masks = []
labels = []

# NEED A MORE EFFICIENT WAY THAN THIS
for (idx, row) in train.iterrows():
  if row.reviewText != "":
    sample = tokenizer(row.reviewText, max_length=SEQ_LEN, padding='max_length', truncation=True)
    input_ids.append(sample['input_ids'])
    attention_masks.append(sample['attention_mask'])
    labels.append(row.overall)

len(input_ids), len(attention_masks), len(labels)

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

input_ids, attention_masks, labels

(array([[  101,  7312, 16035, ...,   119,  2456,   102],
        [  101, 13069,  2977, ...,   117,  1105,   102],
        [  101,  7312,  2185, ...,  1147,   107,   102],
        ...,
        [  101, 10018, 22161, ...,  5098,  4035,   102],
        [  101, 13069,  2977, ...,     0,     0,     0],
        [  101,  7312, 26707, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([4., 5., 4., ..., 3., 5., 5.]))

In [None]:
# Save the bert tokens in google drive
import pickle

pickle_inp_path= os.path.join(projectFolder, "tokenized/bert_inp5.pkl")
pickle_mask_path= os.path.join(projectFolder, "tokenized/bert_mask5.pkl")
pickle_label_path= os.path.join(projectFolder, "tokenized/bert_label5.pkl")

pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((labels),open(pickle_label_path,'wb'))

**Load previously saved pickle file if the input text feature hasn't changed**

In [6]:
# Load pickle files previously saved
import pickle

print('Preparing the pickle file.....')
pickle_inp_path= os.path.join(projectFolder, "tokenized/bert_inp5.pkl")
pickle_mask_path= os.path.join(projectFolder, "tokenized/bert_mask5.pkl")
pickle_label_path= os.path.join(projectFolder, "tokenized/bert_label5.pkl")

print('Loading the saved pickle files..')

input_ids = pickle.load(open(pickle_inp_path, 'rb'))
attention_masks = pickle.load(open(pickle_mask_path, 'rb'))
labels = pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Preparing the pickle file.....
Loading the saved pickle files..
Input shape (199998, 128) Attention mask shape (199998, 128) Input label shape (199998,)


**SPLIT DATA AND TRAIN**

In [7]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y, train_mask, test_mask = train_test_split(input_ids, labels, attention_masks, test_size=0.1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape, train_mask.shape, test_mask.shape)

(179998, 128) (20000, 128) (179998,) (20000,) (179998, 128) (20000, 128)


In [8]:
# Load Model
import tensorflow as tf
import keras

# Num classes
NUM_CLASSES = 1

from transformers import TFBertForSequenceClassification
bert_model = TFBertForSequenceClassification.from_pretrained('bert-large-cased',num_labels=NUM_CLASSES)

log_dir = os.path.join(projectFolder, 'tensorboard_data/tb_bert')
model_save_path = os.path.join(projectFolder, 'model/model.h5')

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path, save_weights_only=True,
                                                monitor='val_loss', mode='min', save_best_only=True), 
             keras.callbacks.TensorBoard(log_dir=log_dir)]
loss = tf.keras.losses.MeanSquaredError()
metric = tf.keras.metrics.MeanAbsoluteError('mae')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
print(bert_model.summary())

print('\nBert Model',bert_model.summary())



bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1460062736.0, style=ProgressStyle(descr…




Some layers from the model checkpoint at bert-large-cased were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier', 'dropout_73']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  333579264 
_________________________________________________________________
dropout_73 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1025      
Total params: 333,580,289
Trainable params: 333,580,289
Non-trainable params: 0
_________________________________________________________________
None
Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  333579264 
_________________________________________________________________
dropout_73 (Dropout)         mult

In [9]:
#Train BERT Model, 2 epochs is enough as the MSE pleateau in the middle of 2nd epoch.
history=bert_model.fit([train_x, train_mask], train_y, batch_size=16, epochs=2, verbose= 1, validation_data=([test_x, test_mask], test_y), callbacks=callbacks)

Epoch 1/2
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/2


In [11]:
checkpoint_path = os.path.join(projectFolder, 'model/cp-{epoch:02d}.ckpt')
checkpoint_dir = os.path.dirname(checkpoint_path)

In [13]:
bert_model.save(os.path.join(projectFolder, 'model/'), save_format='tf')
bert_model.save_weights(checkpoint_path.format(epoch=2))

INFO:tensorflow:Assets written to: ./drive/My Drive/Colab Notebooks/AmazonReview/model/assets


In [25]:
# A trick to keep Google colab running without shutdown kernel due to inactivity
# Great for overnight training without losing result on the 2nd morning
while True:pass

KeyboardInterrupt: ignored

**Test Validation**

In [None]:
# LOAD trained BERT MODEL
import keras
from transformers import TFBertForSequenceClassification
reconstructed_model = TFBertForSequenceClassification.from_pretrained('bert-large-cased',num_labels=1)
reconstructed_model.load_weights(checkpoint_path.format(epoch=2))
reconstructed_model = keras.models.load_model(os.path.join(projectFolder, 'model/'), compile=True)
#reloaded_result = reconstructed_model([test_x, test_mask], training=False)
#reconstructed_model.summary()

In [29]:
#Compute prediction score on test set
print(test_y.shape)
test_prediction = bert_model.predict([test_x, test_mask], batch_size=16)
test_prediction_flat = np.array(test_prediction[0]).ravel()
print(test_prediction_flat.shape)

(20000,)
(20000,)


In [34]:
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

# Calculate MSE
print('Mean squared error on the test set is: %.5f'
    % mean_squared_error(test_y, test_prediction_flat))

Mean squared error on the test set is: 0.31471


The MSE result 0.315 on the unseen 10% test set looks promising comparing to the baseline, next predict on the given testset from the Kaggle competition for submission

SUBMISSION SET PREDICTION

In [14]:
#load submission text, converted from test.json
submission_set= pd.read_csv(os.path.join(projectFolder, "test_converted.csv"))

#Combine text in several columns

submission_set['reviewText']=submission_set['category'] +" " + submission_set['summary']+" " + submission_set['reviewerID'] + ' '+ submission_set['itemID'] +" "+ submission_set['reviewText']

#fill empty review text with string"ok". I assume consumer  who don't write any specific reviews are generally happy with the purchase.
ok = "ok"
#there were 4 empty review text in total
submission_set.fillna(value=ok, inplace=True)
#submission_set.head(20)
print("submission set has total samples of :", submission_set.shape[0])


submission set has total samples of : 10000


In [15]:
# Apply BERT Tokenizer to submission set data

# BERT Max Seq. Length
SEQ_LEN = 128

# Tokenize the input data
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

input_ids2 = []
attention_masks2 = []

#assign ID in for loops
for (idx, row) in submission_set.iterrows():
  if row.reviewText != "":
    sample2 = tokenizer(row.reviewText, max_length=SEQ_LEN, padding='max_length', truncation=True)
    input_ids2.append(sample2['input_ids'])
    attention_masks2.append(sample2['attention_mask'])


len(input_ids2), len(attention_masks2)

input_ids2 = np.asarray(input_ids2)
attention_masks2 = np.array(attention_masks2)

input_ids2, attention_masks2

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




(array([[  101,  7312, 19420, ...,     0,     0,     0],
        [  101,  7312,  6082, ...,  1212,  1142,   102],
        [  101,  7312, 25764, ...,     0,     0,     0],
        ...,
        [  101,  5990, 27652, ...,     0,     0,     0],
        [  101, 13069,  2977, ...,     0,     0,     0],
        [  101,  7312,  4368, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]))

In [16]:
#Used trained BERT model to predict on submission set
submission= bert_model.predict([input_ids2, attention_masks2], batch_size=16)
print("classfier output: ", submission)

classfier output:  TFSequenceClassifierOutput(loss=None, logits=array([[5.0299253],
       [5.0323496],
       [4.926212 ],
       ...,
       [4.973816 ],
       [2.976825 ],
       [4.459152 ]], dtype=float32), hidden_states=None, attentions=None)


In [17]:
submission_array = np.array(submission[0]).ravel()
print(submission_array.size)
print(submission_array)
submission_array_2d= np.expand_dims(submission_array,axis=1)
print(submission_array_2d)

10000
[5.0299253 5.0323496 4.926212  ... 4.973816  2.976825  4.459152 ]
[[5.0299253]
 [5.0323496]
 [4.926212 ]
 ...
 [4.973816 ]
 [2.976825 ]
 [4.459152 ]]


In [None]:
#Cap all output to be within 5. 
for i in range(submission_array_2d.shape[0]): 
  if submission_array_2d[i,0]>5:
    submission_array_2d[i,0]=5
    print(submission_array_2d)

In [23]:
#load rating pairs.csv , append output to prediction column
rating_pairs= pd.read_csv(os.path.join(projectFolder, "rating_pairs.csv"))
rating_pairs['prediction']=submission_array_2d
rating_pairs.to_csv(os.path.join(projectFolder,'submission_reg_2epochs_colab_final.csv'))
rating_pairs.head(20)

Unnamed: 0,userID-itemID,prediction
0,u32476110-p76243483,5.0
1,u36732410-p92485419,5.0
2,u85385007-p40031588,4.926212
3,u30715529-p88719785,5.0
4,u95909892-p59188380,5.0
5,u35702954-p61764300,4.28299
6,u90942104-p81970157,4.936156
7,u06812921-p62266491,4.388901
8,u81370492-p59452155,4.860916
9,u54243860-p80964750,5.0


In [24]:
# output result in csv to submit on kaggle
rating_pairs.to_csv('submission_reg_2epochs_colab_final.csv')
from google.colab import files
files.download("submission_reg_2epochs_colab_final.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**The final submission result on Kaggle has a MSE score of 0.30875, ranking top5 on the leaderboard. The BERT model provides state of the art performance for the review preidcition task.**