## Sentiment Analysis with BERT fine-tuning
#### data source: Amazon Fine Food Reviews
#### Ricardo Flores

In [1]:
import pandas as pd
import numpy as np
import random
import re

from sklearn.model_selection import train_test_split
from sklearn.utils import resample


# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# plots
import seaborn as sns
import matplotlib.pyplot as plt

# BERT (transformers)
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# torch
import torch


In [2]:
# evaluation metrics reported at every evaluation pass
def compute_metrics(pred):
    """Score one evaluation pass.

    Parameters
    ----------
    pred : object exposing ``label_ids`` and ``predictions``
        ``predictions`` is reduced with ``argmax`` over its last axis to
        obtain the predicted class ids, which are compared to ``label_ids``.

    Returns
    -------
    dict
        ``'Precision'``, ``'Recall'``, ``'F1'`` and ``'Accuracy'``, each
        computed by the scikit-learn scorer of the same name.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # insertion order fixes the order of the keys in the returned dict
    scorers = {
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Accuracy': accuracy_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

In [3]:
# load the reviews (a 1% sample of the original data)
reviews_csv = './data/Reviews_1percent.csv'
df_review = pd.read_csv(reviews_csv)

In [4]:
# keep only the review id, the star rating and the review text
df_review = df_review.loc[:, ['Id', 'Score', 'Text']]
df_review

Unnamed: 0,Id,Score,Text
0,127276,5,There is something to be said about this candy...
1,395444,5,"In many languages around the world, ""chai"" sim..."
2,225935,1,"Review of Asian Taste Dried Mushroom, 5-Ounce ..."
3,562268,5,Great flavor. I have always ordered Blue Diamo...
4,491584,3,The product came with a crushed box but was OK...
...,...,...,...
5680,407146,4,For the amount of money spent on this product ...
5681,323379,5,I love this product. It's really a great way t...
5682,455703,5,Wow! This jerky is delicious! I have purchased...
5683,563011,5,The soft baked cookies are amazingly delicious...


In [5]:
# create binary sentiment labels from the star rating:
#   Score > 3  -> 1 (positive)
#   otherwise  -> 0 (negative; this includes the 3-star reviews)
# A vectorised comparison replaces the per-row Python loop; the explicit
# int64 cast keeps the column dtype identical on every platform.
df_review["Label"] = (df_review["Score"] > 3).astype("int64")
    

In [6]:
# hold out 30% of the reviews for evaluation, stratified on the label
data, y = df_review['Text'], df_review['Label']

X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    stratify=y,
    random_state=124,   # fixed seed so the split is reproducible
)

# report the size of each split
for _split, _texts, _labels in (
    ("Train data:", X_train, y_train),
    ("Test data:", X_test, y_test),
):
    print(_split, _texts.shape, _labels.shape)

Train data: (3979,) (3979,)
Test data: (1706,) (1706,)


In [7]:
# Upsampling: balance the training split by resampling the smaller class
train = pd.DataFrame({'text': X_train, 'label': y_train})

# size of each class in the training split
ones = int((train['label'] == 1).sum())
zeros = int((train['label'] == 0).sum())

# the larger class is the majority; on a tie, class 1 counts as the majority
majority, minority = (1, 0) if ones >= zeros else (0, 1)

# split the training frame by class
train_majority = train.loc[train['label'] == majority]
train_minority = train.loc[train['label'] == minority]

# draw minority rows with replacement until both classes have the same size
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=train_majority.shape[0],
    random_state=42,   # reproducible results
)

# majority rows first, followed by the upsampled minority rows
train = pd.concat([train_majority, train_minority_upsampled])

X_train, y_train = train['text'], train['label']

# report the size of each split (only the training split has changed)
for _split, _texts, _labels in (
    ("Train data:", X_train, y_train),
    ("Test data:", X_test, y_test),
):
    print(_split, _texts.shape, _labels.shape)

Train data: (6192,) (6192,)
Test data: (1706,) (1706,)


## Fine-tuning BERT (DistilBERT)

In [8]:
# name of the base uncased BERT checkpoint
# NOTE(review): the tokenizer and model cells below load 'distilbert-base-uncased'
# directly; in the code visible here `model_name` only appears in commented-out
# lines, so it does not decide which model is trained. Confirm and align the two.
# text classification models are listed at: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"

# max sequence length for each document/sentence sample; passed to the tokenizer as `max_length`
max_length = 256 # values noted by the author: 256, 512

In [9]:
# load the uncased DistilBERT tokenizer
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

In [10]:
# tokenize both splits with identical settings: truncate anything longer than
# `max_length` tokens and pad the shorter samples
# NOTE(review): per the tokenizer docs `padding=True` pads to the longest
# sample in the call rather than to `max_length` — confirm this is intended
tokenize_kwargs = dict(truncation=True, padding=True, max_length=max_length)

train_encodings = tokenizer(X_train.tolist(), **tokenize_kwargs)
valid_encodings = tokenizer(X_test.tolist(), **tokenize_kwargs)

In [11]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Map-style torch dataset pairing tokenizer encodings with class labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name to a per-sample sequence, so that
        ``encodings[key][i]`` holds the values of sample ``i``.
    labels : iterable
        One label per sample, in the same order as the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a list so samples are always looked up by
        # position. Indexing a pandas Series with an int is label-based, which
        # raises KeyError (or silently returns another row) once the index is
        # no longer 0..n-1, e.g. after a train/test split.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has one tensor per key of ``encodings`` plus a ``"labels"``
        tensor of shape ``(1,)`` holding the sample's label.
        """
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)



In [12]:
# wrap each tokenized split together with its labels in a torch Dataset
train_dataset = NewsGroupsDataset(encodings=train_encodings, labels=y_train.tolist())
valid_dataset = NewsGroupsDataset(encodings=valid_encodings, labels=y_test.tolist())

In [13]:
# Model: DistilBERT with a 2-class sequence-classification head.
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell no longer raises on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [14]:
# training configuration
training_args = TrainingArguments(
    # --- where results and logs are written ---
    output_dir='./results',
    logging_dir='./logs',
    # --- optimisation ---
    num_train_epochs=1,               # one pass over the training set
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,                 # warmup steps for the learning rate scheduler
    weight_decay=0.01,
    # --- logging / evaluation cadence ---
    logging_steps=500,                # log every 500 optimisation steps
    evaluation_strategy="steps",      # evaluate on a step schedule
    # --- model selection ---
    # reload the best checkpoint when training finishes; the default criterion
    # is the loss, `metric_for_best_model` selects another metric
    load_best_model_at_end=True,
)

In [15]:
# assemble the Trainer from the model, its configuration and both datasets
trainer = Trainer(
    args=training_args,                  # configuration built in the previous cell
    model=model,                         # the instantiated model to fine-tune
    compute_metrics=compute_metrics,     # precision / recall / F1 / accuracy callback
    train_dataset=train_dataset,         # upsampled training split
    eval_dataset=valid_dataset,          # held-out split used for evaluation
)

In [16]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result
trainer.train()

***** Running training *****
  Num examples = 6192
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1548


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.5606,0.441802,0.895415,0.941265,0.917768,0.868699
1000,0.4192,0.436394,0.947697,0.914157,0.930625,0.893904
1500,0.2568,0.460086,0.932442,0.945783,0.939065,0.904455


***** Running Evaluation *****
  Num examples = 1706
  Batch size = 4
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1706
  Batch size = 4
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1706
  Batch size = 4
Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-1000 (score: 0.4363941550254822).


TrainOutput(global_step=1548, training_loss=0.4090772180286181, metrics={'train_runtime': 140.5884, 'train_samples_per_second': 44.043, 'train_steps_per_second': 11.011, 'total_flos': 410119066238976.0, 'train_loss': 0.4090772180286181, 'epoch': 1.0})