<a href="https://colab.research.google.com/github/Xelvise/NLP-compilation-with-HuggingFace/blob/main/Fine_tuning_BERT_for_Yelp_review_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Fine Tuning HuggingFace bert-base-uncased for Yelp customer review prediction

[DATASET](https://www.kaggle.com/datasets/omkarsabnis/yelp-reviews-dataset)

In [None]:
# !nvidia-smi    # check GPU configuration
# !ls

In [None]:
!pip install -U transformers tokenizers datasets accelerate -q
from transformers import BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments

import torch
# from torch.utils.data import Dataset, DataLoader
# import torch.nn.functional as F
# from torch import nn, optim

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import rc

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

### Read the Dataset

In [None]:
# Location of the Yelp reviews CSV on the mounted Google Drive
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/yelp.csv'

# Only the review text and its star rating are needed for this task
df = pd.read_csv(DATA_PATH, usecols=['text', 'stars'])
df

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,3,First visit...Had lunch here today - used my G...
9996,4,Should be called house of deliciousness!\n\nI ...
9997,4,I recently visited Olive and Ivy for business ...
9998,2,My nephew just moved to Scottsdale recently so...


In [None]:
# List the distinct star ratings present in the dataset
df.stars.unique()

array([5, 4, 2, 3, 1])

The classes in `stars` need to be converted to **0-based** labels, because the loss function only accepts class indices within a fixed range.

Since this is a classification task, we will use the cross-entropy loss, which expects integer class indices from `0` to `num_classes - 1`. The ratings 1–5 are therefore mapped to 0–4.

In [None]:
# Shift every star rating down by one so the five classes are labelled 0-4
star_to_label = {star: star - 1 for star in range(1, 6)}
df['stars'] = df['stars'].map(star_to_label)
df

Unnamed: 0,stars,text
0,4,My wife took me here on my birthday for breakf...
1,4,I have no idea why some people give bad review...
2,3,love the gyro plate. Rice is so good and I als...
3,4,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,4,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,2,First visit...Had lunch here today - used my G...
9996,3,Should be called house of deliciousness!\n\nI ...
9997,3,I recently visited Olive and Ivy for business ...
9998,1,My nephew just moved to Scottsdale recently so...


In [None]:
# List the distinct values in `stars` again, after the remapping above
df.stars.unique()

array([4, 3, 1, 2, 0])

In [None]:
# Count the reviews in each class to see how balanced the labels are

df.stars.value_counts()

stars
3    3526
4    3337
2    1461
1     927
0     749
Name: count, dtype: int64

### Instantiate Bert Tokenizer

In [None]:
# Name of the pretrained checkpoint; reused later when loading the classification model
checkpoint = 'bert-base-uncased'
# Load the tokenizer that belongs to this checkpoint
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Tokenization and encoding on a sample text

In [None]:
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

# Two-step view of tokenization: text -> tokens -> vocabulary ids
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# One-step encoding by calling the tokenizer directly
encoding = tokenizer(
    sample_txt,
    add_special_tokens=True,       # wrap the sequence in '[CLS]' ... '[SEP]'
    truncation=True,
    max_length=32,
    padding=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',           # return PyTorch tensors
)

# The encoding holds a batch of one sequence; index 0 selects that sequence
sample_ids = encoding['input_ids'][0]
sample_mask = encoding['attention_mask']

print(f'Encoding keys: {encoding.keys()}')
print(len(sample_ids))
print(sample_ids)
print(len(sample_mask[0]))
print(sample_mask)
print(tokenizer.convert_ids_to_tokens(sample_ids))

Encoding keys: dict_keys(['input_ids', 'attention_mask'])
17
tensor([ 101, 2043, 2001, 1045, 2197, 2648, 1029, 1045, 2572, 5881, 2012, 2188,
        2005, 1016, 3134, 1012,  102])
17
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
['[CLS]', 'when', 'was', 'i', 'last', 'outside', '?', 'i', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.', '[SEP]']


### Tokenization and encoding on the entire dataset

In [None]:
# Review texts are the inputs; the remapped star ratings are the targets
x = list(df['text'])
y = list(df['stars'])

# Hold out 30% of the reviews for testing; stratifying on y keeps the
# class proportions the same in both splits
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
import datasets
from datasets import DatasetDict, Dataset, Features, Value

# Schema shared by both splits: raw review text plus an integer class label
features = Features({'text': Value('string'), 'label': Value('int32')})

# Column-oriented dictionaries, one per split
train_data = dict(text=xtrain, label=ytrain)
test_data = dict(text=xtest, label=ytest)

# Turn each dictionary into a Dataset with the schema above
train_dataset = Dataset.from_dict(train_data, features=features)
test_dataset = Dataset.from_dict(test_data, features=features)

# Group the two splits so they can be processed together
data = DatasetDict({'train': train_dataset, 'test': test_dataset})

data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3000
    })
})

In [None]:
# define tokenization method

def tokenize(batch):
    """Tokenize the ``text`` column of a batch of examples.

    Sequences longer than the tokenizer's maximum length are truncated.
    No padding is applied here on purpose: padding at this stage pads every
    example to the longest sequence of whatever batch ``map`` passes in,
    which is the whole split when ``batch_size=None``. The Trainer is given
    the tokenizer, so its default collator pads each mini-batch to that
    mini-batch's longest sequence instead.

    Args:
        batch: Mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) as
        variable-length lists, one per input text.
    """
    return tokenizer(batch['text'], truncation=True, return_token_type_ids=False)

# batched=True hands `tokenize` a batch of examples per call instead of a single example
encoded_data = data.map(tokenize, batched=True, batch_size=None)     # batch_size=None: each split (train, test) is passed to `tokenize` as one batch
encoded_data
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# encoded_data = data_collator(encoded_data)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3000
    })
})

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained encoder with a freshly initialized classification head
# sized for the five star classes, then move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
model = model.to(device)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the model's module tree (encoder layers followed by the classification head)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Model Fine-tuning

In [None]:
# Hyperparameters for fine-tuning the sequence-classification model
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old name, while older releases only know the old one.
# Pick whichever keyword the installed version accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir="finetuned-multiclass-classifier",
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.00001,
    disable_tqdm=False,
    **{_eval_strategy_arg: 'epoch'},    # evaluate on the eval dataset at the end of every epoch
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
# At the end of every epoch the model is evaluated: the actual and predicted labels are compared to compute the metrics

def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (actual labels) and
            ``predictions`` (per-class scores for each example).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

In [None]:
# Wire the model, hyperparameters, data splits and metric function together.
# No data_collator is passed explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.944149,0.587333,0.57502
2,1.107100,0.890036,0.602333,0.598119


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.944149,0.587333,0.57502
2,1.107100,0.890036,0.602333,0.598119
3,0.821500,0.881207,0.614667,0.611253


TrainOutput(global_step=1314, training_loss=0.9121517024628104, metrics={'train_runtime': 2206.0559, 'train_samples_per_second': 9.519, 'train_steps_per_second': 0.596, 'total_flos': 5525480991744000.0, 'train_loss': 0.9121517024628104, 'epoch': 3.0})

### Model Evaluation

In [None]:
# Run the fine-tuned model over the test split; the result bundles the logits, the true label ids and the metrics
predictions = trainer.predict(encoded_data["test"])
predictions    # predictions in the form of logits

PredictionOutput(predictions=array([[ 2.195528  ,  1.6754305 ,  0.05677799, -1.6351987 , -1.2142477 ],
       [-2.0518472 , -0.7461352 ,  1.6950507 ,  1.7860887 , -0.47478333],
       [-2.4794693 , -1.975168  ,  0.2169665 ,  2.6747668 ,  1.221034  ],
       ...,
       [-2.600763  , -2.0896056 , -0.13722359,  2.5680964 ,  1.9841818 ],
       [-2.6252759 , -2.2312102 ,  0.3894078 ,  2.6576748 ,  1.1361403 ],
       [-2.2862468 , -2.258819  , -1.1146095 ,  2.1814754 ,  3.0840993 ]],
      dtype=float32), label_ids=array([0, 3, 3, ..., 3, 3, 4]), metrics={'test_loss': 0.8812071681022644, 'test_accuracy': 0.6146666666666667, 'test_f1': 0.6112530091101415, 'test_runtime': 99.5104, 'test_samples_per_second': 30.148, 'test_steps_per_second': 1.889})

In [None]:
# Human-readable names for label ids 0-4, i.e. '1-Star' through '5-Star'
class_names = [f'{stars}-Star' for stars in range(1, 6)]

In [None]:
# The predicted class of each review is the index of its largest logit
preds = predictions.predictions.argmax(axis=-1)

# Per-class precision, recall and F1 on the test split
report = classification_report(
    y_true=encoded_data['test']['label'],
    y_pred=preds,
    target_names=class_names,
)
print(report)

              precision    recall  f1-score   support

      1-Star       0.66      0.69      0.68       225
      2-Star       0.50      0.44      0.46       278
      3-Star       0.52      0.42      0.47       438
      4-Star       0.59      0.63      0.61      1058
      5-Star       0.69      0.71      0.70      1001

    accuracy                           0.61      3000
   macro avg       0.59      0.58      0.58      3000
weighted avg       0.61      0.61      0.61      3000



### Model Inference

In [None]:
# Directory that receives the fine-tuned weights, config and tokenizer files
save_dir = '/saved_models'

# Attach the class names to the model config BEFORE saving, in both directions.
# Patching only `id2label` on the reloaded copy leaves `label2id` with the
# generic LABEL_n names and stores a checkpoint without any class names.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Reload from disk to confirm the saved checkpoint is usable on its own
finetuned_model = AutoModelForSequenceClassification.from_pretrained(save_dir)
finetuned_model.config

BertConfig {
  "_name_or_path": "/saved_models",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1-Star",
    "1": "2-Star",
    "2": "3-Star",
    "3": "4-Star",
    "4": "5-Star"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.40.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
from transformers import pipeline
def inference(*statements):
    """Classify one or more review texts with the fine-tuned model.

    Over-long inputs are truncated by the pipeline's own tokenizer call.
    Decoding truncated tokens back into a string and tokenizing that string
    again is lossy, and the re-tokenized text is not guaranteed to fit the
    model's maximum length.

    Args:
        *statements: Review texts to classify.

    Returns:
        A list with one ``{'label': ..., 'score': ...}`` dict per statement,
        in input order; an empty list when no statement is given.
    """
    if not statements:
        return []
    classifier = pipeline("sentiment-analysis", model=finetuned_model, tokenizer=tokenizer)
    # truncation=True is forwarded to the tokenizer, which cuts each text to
    # the model's maximum sequence length
    return classifier(list(statements), truncation=True)

In [None]:
# Example reviews used to try out the fine-tuned classifier
reviews = ['I could share photos seamlessly with this app',
            'I could share photos seamlessly with this app, but the UI could be be better',
            'This product is wierd. I hate it']

# Unpack the list so that each review is passed as a separate statement
inference(*reviews)

[{'label': '5-Star', 'score': 0.7632673382759094},
 {'label': '3-Star', 'score': 0.5218908786773682},
 {'label': '1-Star', 'score': 0.7605271339416504}]