In [1]:
import pandas as pd
import numpy as np
import gc
import pickle
import time
import random
import re
from collections import Counter
from pprint import pprint
from argparse import Namespace
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertConfig, BertForMaskedLM, BertTokenizer
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from transformers import pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F

# Load and prepare train data

In [2]:
# Load preprocessed Data
preprocessed_text = pickle.load(open("data/preprocessed_text.pkl", "rb"))

In [3]:
random.seed(42)
np.random.seed(42)

In [4]:
# Create a fixed train and test split
reviews_train, reviews_test, y_train, y_test = train_test_split(preprocessed_text.text, preprocessed_text.classes,
                                                                test_size = 0.2, random_state = 42,
                                                                stratify=preprocessed_text.classes)

In [5]:
# Divide the reviews into two partitions for training to avoid timeout

#reviews_train = reviews_train[0:24000]
reviews_train = reviews_train[24000:48000]

In [6]:
# Save the train set to a .txt file
np.savetxt(fname='data/train.txt', X=np.array(reviews_train.values.tolist()), fmt='%s')

In [7]:
gc.collect()

12

# Prepare tokenizer and BERT model

In [8]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, padding='max_length')

In [9]:
%%time

'''
transformers has a predefined class LineByLineTextDataset()
which reads your text line by line and converts them to tokens
'''

dataset= LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = 'data/train.txt',
    block_size = 512    # maximum sequence length
)

print('No. of lines: ', len(dataset)) # No of lines in your datset



No. of lines:  24000
CPU times: user 50.2 s, sys: 215 ms, total: 50.4 s
Wall time: 50.4 s


In [10]:
#Use the pre-trained weights instead of custom config like in the source

model = BertForMaskedLM.from_pretrained('output_pretrained/',
                                        output_attentions = True)      # Whether the model returns attentions weights.

print('No of parameters: ', model.num_parameters())


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

No of parameters:  109514298


# Train the model

In [11]:
batch_size = 32
epochs = 3

In [14]:
training_args = TrainingArguments(
    output_dir='output_pretrained/',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=1e-4,
    save_steps=10_000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [15]:
%%time

trainer.train()
trainer.save_model('output_pretrained/')

Step,Training Loss
500,4.5405
1000,4.5547
1500,4.4101
2000,4.3116


CPU times: user 3d 20h 44min 50s, sys: 3h 18min, total: 4d 2min 50s
Wall time: 14h 31min 54s


TrainOutput(global_step=2250, training_loss=4.432013997395833, metrics={'train_runtime': 52314.9608, 'train_samples_per_second': 1.376, 'train_steps_per_second': 0.043, 'total_flos': 7118898967309824.0, 'epoch': 3.0})

# Check performance compared to base model

In [6]:
#PoC
#Load both models and pipeline for comparison

#Base BERT 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True)
base_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

fill_mask2 = pipeline(
    "fill-mask",
    model=base_model,
    tokenizer=tokenizer
)

fill_mask2('This italian restaurant had the best [MASK] I have ever eaten.')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.5289820432662964,
  'sequence': 'this italian restaurant had the best food i have ever eaten.',
  'token': 2833,
  'token_str': 'f o o d'},
 {'score': 0.048788491636514664,
  'sequence': 'this italian restaurant had the best pizza i have ever eaten.',
  'token': 10733,
  'token_str': 'p i z z a'},
 {'score': 0.04705256223678589,
  'sequence': 'this italian restaurant had the best meal i have ever eaten.',
  'token': 7954,
  'token_str': 'm e a l'},
 {'score': 0.0386321023106575,
  'sequence': 'this italian restaurant had the best menu i have ever eaten.',
  'token': 12183,
  'token_str': 'm e n u'},
 {'score': 0.032607030123472214,
  'sequence': 'this italian restaurant had the best dinner i have ever eaten.',
  'token': 4596,
  'token_str': 'd i n n e r'}]

In [7]:
# Pretrained BERT

# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True)

# Load specific model
model_pretrained = BertForMaskedLM.from_pretrained('output_pretrained/')

fill_mask = pipeline(
    "fill-mask",
    model=model_pretrained,
    tokenizer=tokenizer
)

fill_mask('This italian restaurant had the best [MASK] I have ever eaten.')

# Service and Pizza gained much higher score after training

[{'score': 0.2478836625814438,
  'sequence': 'this italian restaurant had the best food i have ever eaten.',
  'token': 2833,
  'token_str': 'f o o d'},
 {'score': 0.09602867066860199,
  'sequence': 'this italian restaurant had the best service i have ever eaten.',
  'token': 2326,
  'token_str': 's e r v i c e'},
 {'score': 0.08484292775392532,
  'sequence': 'this italian restaurant had the best pizza i have ever eaten.',
  'token': 10733,
  'token_str': 'p i z z a'},
 {'score': 0.056329403072595596,
  'sequence': 'this italian restaurant had the best experience i have ever eaten.',
  'token': 3325,
  'token_str': 'e x p e r i e n c e'},
 {'score': 0.0265803299844265,
  'sequence': 'this italian restaurant had the best meal i have ever eaten.',
  'token': 7954,
  'token_str': 'm e a l'}]

In [95]:
# Check performance of pre-trained model to base model by randomly masking words in test set
replaced_words = []
sentences = []

for sent in reviews_test:
    words=sent.split(" ")
    n = np.random.randint(low=0, high=len(words))
    replaced_words.append(words[n])
    words[n]="[MASK]"
    masked_sentence = " ".join(words)
    sentences.append(masked_sentence)

In [105]:
df = pd.DataFrame(sentences, columns=['sentence'])
df['word'] = replaced_words
df.head()

Unnamed: 0,sentence,word
0,hip place said food someone whose tastes buds ...,atmosphere
1,ordered place ubereats decided give place try ...,things
2,hold phone someone say cent cupcakes hold need...,cupcakes
3,probably couple hundred times last years remem...,care
4,weekend getaway whistler friends stopped [MASK...,brunch


In [3]:
pred2 = [fill_mask2(sent, top_k=1)[0]['token_str'].replace(" ","") for sent in df.sentence.to_list()]
print('Done with base.')
acc2 = accuracy_score(df.word.to_list(), pred2)
print("The base model's accuracy on the test set is: {0}" .format(acc2))

pred = [fill_mask(sent, top_k=1)[0]['token_str'].replace(" ","") for sent in df.sentence.to_list()]
print('\nDone with pre-trained.')
acc = accuracy_score(df.word.to_list(), pred)
print("The pre-trained model's accuracy on the test set is: {0}" .format(acc))

#The base model's accuracy on the test set is: 0.0496
#The pre-trained model's accuracy on the test set is: 0.161

Done with base.
The base model's accuracy on the test set is: 0.0496

Done with pre-trained.
The pre-trained model's accuracy on the test set is: 0.161


# Retrieve attention values on test set

## from pre-trained model

In [109]:
# Retrieve attention values for each class of the test set and concatenate them into a dictionary

# Count the number of words in the longest review and save it to max_length
wordcountList = [len(re.sub("[^\w]", " ",  review).split()) for review in reviews_test]
max_length = max(wordcountList)

test_df = pd.concat([reviews_test, y_test], axis=1)
pretrained_dict = {}
for cond in ['positive', 'neutral', 'negative']:
    
    # slice the data 
    test_cond = test_df[test_df["classes"] == cond]
    
    # tokenize the text and retrieve the attentions
    inputs = tokenizer.encode_plus(test_cond.text.to_list(),         # Sentence to encode.
                                   add_special_tokens = True,        # Add '[CLS]' and '[SEP]'
                                   padding = 'longest',              # Pad & truncate all sentences.
                                   max_length = max_length,          # Define sequence length as longest sequence of test set
                                   truncation = True,                # truncate sample if too long
                                   pad_to_max_length = True,         # add padding tokens if shorter sequence
                                   return_attention_mask = True,     # Construct attn. masks.
                                   return_tensors = 'pt')            # return pytorch tensors
    outputs = model_pretrained(**inputs, output_attentions=True)
    attentions_pretrained_cond = outputs.attentions
    
    # add it to the dictionary
    pretrained_dict["{0}".format(cond)] = attentions_pretrained_cond

In [110]:
# Show shape of each attention layer tensor
for i in range(12):
    print("Layer",i+1,":",pretrained_dict['negative'][i].size())
# [batch_size, num_heads, sequence_length, sequence_length]

Layer 1 : torch.Size([1, 12, 148, 148])
Layer 2 : torch.Size([1, 12, 148, 148])
Layer 3 : torch.Size([1, 12, 148, 148])
Layer 4 : torch.Size([1, 12, 148, 148])
Layer 5 : torch.Size([1, 12, 148, 148])
Layer 6 : torch.Size([1, 12, 148, 148])
Layer 7 : torch.Size([1, 12, 148, 148])
Layer 8 : torch.Size([1, 12, 148, 148])
Layer 9 : torch.Size([1, 12, 148, 148])
Layer 10 : torch.Size([1, 12, 148, 148])
Layer 11 : torch.Size([1, 12, 148, 148])
Layer 12 : torch.Size([1, 12, 148, 148])


In [111]:
# Save attention values to pkl file
pickle.dump(pretrained_dict, open('attention_values/attentions_pretrained.pkl', 'wb'))

## from Bert base

In [44]:
# Retrieve attention values for each class of the test set and concatenate them into a dictionary

# Count the number of words in the longest review and save it to max_length
wordcountList = [len(re.sub("[^\w]", " ",  review).split()) for review in reviews_test]
max_length = max(wordcountList)

# Load specific model
Bertbase = BertModel.from_pretrained('bert-base-uncased')

test_df = pd.concat([reviews_test, y_test], axis=1)
base_dict = {}
for cond in ['positive', 'neutral', 'negative']:
    
    # slice the data 
    test_cond = test_df[test_df["classes"] == cond]
    
    # tokenize the text and retrieve the attentions
    inputs = tokenizer.encode_plus(test_cond.text.to_list(),         # Sentence to encode.
                                   add_special_tokens = True,        # Add '[CLS]' and '[SEP]'
                                   padding = 'longest',              # Pad & truncate all sentences.
                                   truncation = True,                # truncate sample if too long
                                   max_length = max_length,          # Define sequence length as longest sequence of test set
                                   pad_to_max_length = True,         # add padding tokens if shorter sequence
                                   return_attention_mask = True,     # Construct attn. masks.
                                   return_tensors = 'pt')            # return pytorch tensors
    outputs = Bertbase(**inputs, output_attentions=True)
    attentions_base_cond = outputs.attentions
    
    # add it to the dictionary
    base_dict["{0}".format(cond)] = attentions_base_cond

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
for i in range(12):
    print("Layer",i+1,":",base_dict['negative'][i].size())
# [batch_size, num_heads, sequence_length, sequence_length]

Layer 1 : torch.Size([1, 12, 482, 482])
Layer 2 : torch.Size([1, 12, 482, 482])
Layer 3 : torch.Size([1, 12, 482, 482])
Layer 4 : torch.Size([1, 12, 482, 482])
Layer 5 : torch.Size([1, 12, 482, 482])
Layer 6 : torch.Size([1, 12, 482, 482])
Layer 7 : torch.Size([1, 12, 482, 482])
Layer 8 : torch.Size([1, 12, 482, 482])
Layer 9 : torch.Size([1, 12, 482, 482])
Layer 10 : torch.Size([1, 12, 482, 482])
Layer 11 : torch.Size([1, 12, 482, 482])
Layer 12 : torch.Size([1, 12, 482, 482])


In [51]:
# Save attention values to pkl file
pickle.dump(base_dict, open('attention_values/attentions_Bertbase.pkl', 'wb'))