In [2]:
import pandas as pd
import numpy as np
import gc
import pickle
import time
import re
from collections import Counter
from argparse import Namespace
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertModel, BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup#, DataCollatorForLanguageModeling
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset, random_split
from torch.nn import functional as F
from tqdm import trange
import datetime
from sklearn.metrics import matthews_corrcoef
import umap
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Load and prepare training data

In [2]:
# Load preprocessed Data
preprocessed_text = pickle.load(open("data/preprocessed_text.pkl", "rb"))

In [24]:
preprocessed_text.head()

Unnamed: 0,stars,text,classes,text length
909689,2,love denny ihop sherry big cookie cutter break...,negative,343
338441,1,always one favorite sandwich shops visit today...,negative,328
637970,2,would nice movie theatre restrooms unisex anyo...,negative,96
630528,1,absolutely done ordering restaurant ever lived...,negative,475
238078,2,wait staff great management delightful came dr...,negative,217


In [3]:
encode_classes_dict = {
    'positive' : 2,
    'neutral' : 1,
    'negative' : 0 }

preprocessed_text['encoded'] = [encode_classes_dict[x] for x in preprocessed_text.classes]

In [4]:
# Create a fixed train and test split
reviews_train, reviews_test, y_train, y_test = train_test_split(preprocessed_text.text, preprocessed_text.encoded,
                                                                test_size = 0.2, random_state = 42,
                                                                stratify=preprocessed_text.encoded)

In [5]:
# Divide the data set into partitions to avoid a timeout

#reviews_train = reviews_train[0:12000]
#reviews_train = reviews_train[12000:24000]
#reviews_train = reviews_train[24000:36000]
reviews_train = reviews_train[36000:48000]

In [6]:
# Do the same with the labels
#y_train = y_train[0:12000]
#y_train = y_train[12000:24000]
#y_train = y_train[24000:36000]
y_train = y_train[36000:48000]

In [7]:
# Encode the labels from 1-5 to 0-4
text = reviews_train.to_list()
labels = y_train.to_list()

# Load the tokenizer and process the data

In [8]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for review in text:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        review,                        # Sentence to encode.
                        add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
                        padding = 'max_length',        # Pad & truncate all sentences.
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attn. masks.
                        return_tensors = 'pt')         # Return pytorch tensors.
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', text[0])
print('Token IDs: ', input_ids[0])

Original:  great late night happy hour menu late night menu pm weekdays albeit tad small menu oysters buck shuck best quality sometimes get avocado salad simple delicious
Token IDs: tensor([  101,  2307,  2397,  2305,  3407,  3178, 12183,  2397,  2305, 12183,
         7610, 19759, 12167, 18819,  2235, 12183, 21480,  2015, 10131, 18454,
         3600,  2190,  3737,  2823,  2131, 20704, 24755,  3527, 16521,  3722,
        12090,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,   

In [10]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

10,800 training samples
1,200 validation samples


In [11]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 16

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# Load the model and prepare for training

In [7]:
# Load BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = 3,
                                                      output_attentions = True)

#model = BertForSequenceClassification.from_pretrained('./output_finetuned/',
 #                                                     num_labels = 3,
  #                                                    output_attentions = True)     # Whether the model returns attentions weights.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [1]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

In [19]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, Devlin also had 2e-5
                  eps = 1e-6 # args.adam_epsilon  - set to default because 1e-8 took veeery long.
                )

In [20]:
gc.collect()

415

In [21]:
# Number of training epochs. The BERT authors recommend between 2 and 4. 
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,             # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Train the model

In [42]:
layers = []
device = torch.device("cpu")


# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training
  
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
  
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) 
        # Backward pass
        loss[0].backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
    
        # Update tracking variables
        tr_loss += loss[0].item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            # print(sum(list(model.parameters())[190:][:1][:1][:1][:1])
     
        # Move logits and labels to CPU
        logits = logits[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.588683336465447


Epoch:  33%|███▎      | 1/3 [7:07:28<14:14:57, 25648.68s/it]

Validation Accuracy: 0.7575
Train loss: 0.42923268855721863


Epoch:  67%|██████▋   | 2/3 [14:12:18<7:06:40, 25600.92s/it]

Validation Accuracy: 0.7391666666666666
Train loss: 0.2549794232569359


Epoch: 100%|██████████| 3/3 [21:17:14<00:00, 25544.87s/it]  

Validation Accuracy: 0.7475





In [3]:
gc.collect()

68198

# Evaluate the model on test set

In [8]:
# Tokenize all of the test reviews and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for review in reviews_test.to_list():
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        review,                        # Sentence to encode.
                        add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
                        padding = 'max_length',        # Pad & truncate all sentences.
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attn. masks.
                        return_tensors = 'pt')         # Return pytorch tensors.
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_test.to_list())

In [9]:
test_dataset = TensorDataset(input_ids, attention_masks, labels)

In [11]:
# Prepare data loader for the test data
batch_size = 16

test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

In [12]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
device = torch.device("cpu")

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
  
    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                        attention_mask=b_input_mask)
  
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
  
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 12,000 test sentences...
    DONE.


In [None]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [15]:
# Check on performance
acc = accuracy_score(flat_true_labels, flat_predictions)
report = classification_report(flat_true_labels, flat_predictions)
print("The model's accuracy on the test set is: {0} \n" .format(acc))
print(report)

# The base model performance on 12K test reviews: 0.33
# The fine-tuned model performance on 12K test reviews: 0.859

The model's accuracy on the test set is: 0.33258333333333334 

              precision    recall  f1-score   support

           0       0.27      0.01      0.02      4000
           1       0.33      0.99      0.50      4000
           2       0.00      0.00      0.00      4000

    accuracy                           0.33     12000
   macro avg       0.20      0.33      0.17     12000
weighted avg       0.20      0.33      0.17     12000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Save Model

In [21]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './output_finetuned/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./output_finetuned/


('./output_finetuned/tokenizer_config.json',
 './output_finetuned/special_tokens_map.json',
 './output_finetuned/vocab.txt',
 './output_finetuned/added_tokens.json')

# Retrieve attention values

## from fine-tuned Bert

In [17]:
# Retrieve attention values for each class of the test set and concatenate them into a dictionary
output_dir = './output_finetuned/'

# Count the number of words in the longest review and save it to max_length
wordcountList = [len(re.sub("[^\w]", " ",  review).split()) for review in reviews_test]
max_length = max(wordcountList)

# Load specific model
Bertfine = BertForSequenceClassification.from_pretrained(output_dir)

test_df = pd.concat([reviews_test, y_test], axis=1)
finetuned_dict = {}
for cond in [2, 1, 0]:
    
    # slice the data 
    test_cond = test_df[test_df["encoded"] == cond]
    
    # tokenize the text and retrieve the attentions
    inputs = tokenizer.encode_plus(test_cond.text.to_list(),         # Sentence to encode.
                                   add_special_tokens = True,        # Add '[CLS]' and '[SEP]'
                                   padding = 'longest',           # Pad & truncate all sentences.
                                   max_length = max_length,
                                   truncation = True,                # truncate sample if too long
                                   pad_to_max_length = True,         # add padding tokens if shorter sequence
                                   return_attention_mask = True,     # Construct attn. masks.
                                   return_tensors = 'pt')            # return pytorch tensors
    outputs = Bertfine(**inputs, output_attentions=True)
    attentions_fine_cond = outputs.attentions
    
    # add it to the dictionary
    finetuned_dict["{0}".format(cond)] = attentions_fine_cond

In [18]:
for i in range(12):
    print("Layer",i+1,":",finetuned_dict['0'][i].size())
# [batch_size, num_heads, sequence_length, sequence_length]

Layer 1 : torch.Size([1, 12, 148, 148])
Layer 2 : torch.Size([1, 12, 148, 148])
Layer 3 : torch.Size([1, 12, 148, 148])
Layer 4 : torch.Size([1, 12, 148, 148])
Layer 5 : torch.Size([1, 12, 148, 148])
Layer 6 : torch.Size([1, 12, 148, 148])
Layer 7 : torch.Size([1, 12, 148, 148])
Layer 8 : torch.Size([1, 12, 148, 148])
Layer 9 : torch.Size([1, 12, 148, 148])
Layer 10 : torch.Size([1, 12, 148, 148])
Layer 11 : torch.Size([1, 12, 148, 148])
Layer 12 : torch.Size([1, 12, 148, 148])


In [19]:
# Save attention values to pkl file
pickle.dump(finetuned_dict, open('attention_values/attentions_Bertfine.pkl', 'wb'))