# Part 3 BERT-BASE-UNCASED

In [1]:
import pandas as pd
import gzip
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer, BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
import torch
from torch.nn import CrossEntropyLoss
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score
import bert_score
import psutil

2024-05-26 00:15:06.173580: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 00:15:06.173622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 00:15:06.174953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-26 00:15:06.183563: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 1. Loading and Preprocessing the Kindle Store Reviews Data

I chose the Kindle Store reviews dataset (`Kindle_Store_5.json.gz`) from https://nijianmo.github.io/amazon/index.html

In [2]:
def read_json_gz_to_df(file_path, chunk_size=1000, max_chunks=1):
    """Read a gzip-compressed JSON-lines file into a pandas DataFrame.

    The file is parsed ``chunk_size`` records at a time; each chunk becomes a
    DataFrame and the chunks are concatenated at the end.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.json.gz`` file (one JSON object per line).
    chunk_size : int, default 1000
        Number of records per chunk. Must be at least 1.
    max_chunks : int or None, default 1
        Maximum number of chunks to read. The default of 1 keeps the original
        demonstration behaviour (only the first ``chunk_size`` records are
        loaded). Pass ``None`` to read the entire file.

    Returns
    -------
    pandas.DataFrame
        The loaded records with a fresh 0..n-1 index. An empty DataFrame is
        returned when the file holds no records or ``max_chunks`` is 0.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    chunks = []
    batch = []
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for raw_line in f:
            if max_chunks is not None and len(chunks) >= max_chunks:
                break
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no record and would make json.loads fail.
                continue
            batch.append(json.loads(stripped))
            if len(batch) == chunk_size:
                chunks.append(pd.DataFrame(batch))
                batch = []

    # Keep a trailing partial chunk unless the chunk budget is already spent.
    if batch and (max_chunks is None or len(chunks) < max_chunks):
        chunks.append(pd.DataFrame(batch))

    if not chunks:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

# Relative path to the gzip-compressed reviews file
file_path = 'Kindle_Store_5.json.gz'
# Load the reviews into a DataFrame, reading records in chunks of 1000
df = read_json_gz_to_df(file_path, chunk_size=1000)
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"07 3, 2014",A2LSKD2H9U8N0J,B000FA5KK0,{'Format:': ' Kindle Edition'},sandra sue marsolek,"pretty good story, a little exaggerated, but I...",pretty good story,1404345600,,
1,5.0,True,"05 26, 2014",A2QP13XTJND1QS,B000FA5KK0,{'Format:': ' Kindle Edition'},Tpl,"If you've read other max brand westerns, you k...",A very good book,1401062400,,
2,5.0,True,"09 16, 2016",A8WQ7MAG3HFOZ,B000FA5KK0,{'Format:': ' Kindle Edition'},Alverne F. Anderson,"Love Max, always a fun twist",Five Stars,1473984000,,
3,5.0,True,"03 3, 2016",A1E0MODSRYP7O,B000FA5KK0,{'Format:': ' Kindle Edition'},Jeff,"As usual for him, a good book",a good,1456963200,,
4,5.0,True,"09 10, 2015",AYUTCGVSM1H7T,B000FA5KK0,{'Format:': ' Kindle Edition'},DEHS - EddyRapcon,MB is one of the original western writers and ...,A Western,1441843200,2,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,5.0,True,"02 17, 2016",A3F3AZE7JUPH0,B000JMKRTI,,Judith A. Hughes,I have enjoyed all the Sisterhood books and th...,The Jury review,1455667200,,
996,2.0,True,"02 1, 2016",A2XKPFGMS7ZV12,B000JMKRTI,,Mizchiz,I have read a couple other books in this serie...,Waste of time,1454284800,,
997,5.0,True,"01 17, 2016",AJGHC3AX9LSVX,B000JMKRTI,,Terry Lee,Fast-paced and well-written. I have some catch...,Entertaining,1452988800,,
998,5.0,True,"01 16, 2016",A1PDP4GAOT43B0,B000JMKRTI,,Kindle Customer,Enjoyed re reading.,Five Stars,1452902400,,


# 3. Preparing the Data for BERT

We keep only the review text column (`reviewText`) and the target column (`overall`, the rating).

In [3]:
# Keep the rating and the review body, renamed to the label/text pair used in later cells
df = df[['overall', 'reviewText']].rename(
    columns={'overall': 'label', 'reviewText': 'text'}
)
df

Unnamed: 0,label,text
0,4.0,"pretty good story, a little exaggerated, but I..."
1,5.0,"If you've read other max brand westerns, you k..."
2,5.0,"Love Max, always a fun twist"
3,5.0,"As usual for him, a good book"
4,5.0,MB is one of the original western writers and ...
...,...,...
995,5.0,I have enjoyed all the Sisterhood books and th...
996,2.0,I have read a couple other books in this serie...
997,5.0,Fast-paced and well-written. I have some catch...
998,5.0,Enjoyed re reading.


In [4]:
# Number of distinct values in the label column
df['label'].nunique()

5

### The ratings take five distinct values, so this is natively a multiclass problem; we transform the target into a binary class: positive reviews (rating above 3) and negative reviews (rating of 3 or below)

In [5]:
# Binarise the rating: values above 3 become 1 (positive); everything else,
# including 3-star reviews, becomes 0 (negative). No row is dropped: the
# comparison always yields 0 or 1, so there are no missing labels to remove.
# NOTE: this overwrites 'label'. Re-running the cell on already-binarised
# labels would map every row to 0, so re-run the preceding cells first.
df = df.assign(label=(df['label'] > 3).astype('int64'))

In [6]:
# Preview the first ten rows after label encoding
df.head(10)

Unnamed: 0,label,text
0,1,"pretty good story, a little exaggerated, but I..."
1,1,"If you've read other max brand westerns, you k..."
2,1,"Love Max, always a fun twist"
3,1,"As usual for him, a good book"
4,1,MB is one of the original western writers and ...
5,1,great book
6,0,"A good, solid Western - yes, a little contrive..."
7,1,ALMOST BEEN TOO LONG SINCE I READ IT. GOOD REA...
8,1,Enjoyed this book and will read more from this...
9,1,A very good read for you Western fans. Great ...


In [7]:
# Device the freshly loaded model is moved to below
device = 'cpu'
# Checkpoint name shared by the tokenizer and the classifier
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2: two output classes, matching the 0/1 labels built above
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Texts and labels as plain Python lists
X = list(df['text'])
y = list(df['label'])
# 80/20 split, stratified on the label, with a fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Tokenise both splits; every sequence is truncated/padded to exactly 512 tokens.
# NOTE(review): padding to a fixed 512 here makes every batch full length even though
# a padding collator is also given to the Trainer later - confirm this is intended.
X_train_tokenized = tokenizer(X_train, padding='max_length',truncation = True, max_length= 512)
X_val_tokenized = tokenizer(X_val, padding='max_length', truncation = True,max_length = 512)

In [9]:
# Inspect which fields the tokenizer produced
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
# Sizes of the training and validation splits
len(X_train), len(X_val)

(800, 200)

In [11]:
# Tokenise two short reviews with the same settings as the real data to inspect the output
sample =["Love Max, always a fun twist","As usual for him, a good book"]
tokenizer(sample, padding='max_length', truncation= True, max_length=512)

{'input_ids': [[101, 2293, 4098, 1010, 2467, 1037, 4569, 9792, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Create Torch dataset

In [12]:
class ReviewData(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with optional labels.

    Parameters
    ----------
    encodings : mapping
        Maps each field name (must include ``'input_ids'``) to an indexable
        collection holding one entry per sample.
    labels : sequence or None, optional
        One label per sample. When ``None`` (the default) the items contain
        no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels = None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (plus ``'labels'`` if available)."""
        item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: truthiness is ambiguous for numpy
        # arrays / tensors and raises for multi-element ones.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the length of ``encodings['input_ids']``."""
        return len(self.encodings['input_ids'])

In [13]:
# Wrap the tokenised splits and their labels as torch datasets
train_dataset = ReviewData(X_train_tokenized, y_train)
val_dataset = ReviewData(X_val_tokenized, y_val)

In [14]:
# Inspect one training item
train_dataset[5]

{'input_ids': tensor([  101,  2307,  2338,  1010,  2204,  3185,  1012,  2004, 24826, 15683,
          1010,  2007,  2035,  1996, 12225,  1997,  1996,  3119,  1012, 16755,
          2023,  2338,  2005,  2035,  1012,  9483,  1998,  5959,   999,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

# Define our evaluation function with metrics

In [15]:
def compute_metrics(p):
    """Compute evaluation metrics from model predictions and true labels.

    Parameters
    ----------
    p : pair-like
        Unpacks into ``(predictions, labels)``. ``predictions`` holds one row
        of class scores per sample; the predicted class is the argmax along
        axis 1.

    Returns
    -------
    dict
        Accuracy, weighted precision/recall/F1, the mean sentence-level BLEU
        and the mean BERTScore precision/recall/F1.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0 gives the same value as the default for classes that are
    # never predicted, but without emitting an undefined-metric warning.
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted', zero_division=0)

    # Calculate BLEU score
    # NOTE(review): BLEU and BERTScore are computed on the class ids rendered
    # as strings (e.g. "0"/"1"), not on text, so they carry little information
    # beyond accuracy - confirm these metrics are really wanted here.
    smoothing_function = SmoothingFunction().method1
    references = [[str(label)] for label in labels]
    candidates = [str(pred_label) for pred_label in pred]
    bleu_scores = [
        sentence_bleu([ref], cand, smoothing_function=smoothing_function)
        for ref, cand in zip(references, candidates)
    ]
    avg_bleu = np.mean(bleu_scores)

    # Calculate BERTScore
    P, R, F1 = bert_score.score(candidates, references, lang="en", rescale_with_baseline=True)
    avg_bert_precision = P.mean().item()
    avg_bert_recall = R.mean().item()
    avg_bert_f1 = F1.mean().item()

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "f1": f1,
        "BLEU": avg_bleu,
        "BERTScore_Precision": avg_bert_precision,
        "BERTScore_Recall": avg_bert_recall,
        "BERTScore_f1": avg_bert_f1
    }

In [16]:
# Collator that pads each batch using the tokenizer; handed to the Trainer further down
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Cross-entropy is the common loss for classification; compute_loss below applies it
loss_fn = CrossEntropyLoss()
def compute_loss(model, inputs, return_outputs=False):
    """Run the model on a batch and return its cross-entropy loss.

    Parameters
    ----------
    model : callable
        Called as ``model(**inputs)``; the result must expose ``.logits``.
    inputs : dict
        Batch of model inputs; must contain a ``'labels'`` entry.
    return_outputs : bool, default False
        When True, return ``(loss, outputs)`` instead of the loss alone.

    Returns
    -------
    loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    labels = inputs['labels']
    outputs = model(**inputs)
    loss = loss_fn(outputs.logits, labels)
    # Honour the flag instead of silently ignoring it.
    return (loss, outputs) if return_outputs else loss

### We choose the training and evaluation parameters for the BERT model according to the memory and CPU available on my own machine

In [17]:
# Determine available memory
available_memory = psutil.virtual_memory().available / (1024 ** 3)  # Convert bytes to GB
# Calculate a reasonable batch size (this is an estimation, adjust based on actual usage)
# Here, we use a conservative estimate of 1GB per batch element as a starting point
memory_per_element = 1.0  # GB
# Halve the estimate as a safety margin and never go below 1: with less than
# 2 GB available the floor divisions would otherwise yield a batch size of 0.
# NOTE: this is informational only - the training arguments in this notebook
# set their per-device batch sizes to a fixed value.
initial_batch_size = max(1, int(available_memory // memory_per_element // 2))

Trainer
# Training configuration handed to the Trainer.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = TrainingArguments(
    # Checkpoints and other training artefacts are written here
    output_dir="./results",
    # No automatic evaluation during training; evaluation is triggered manually
    evaluation_strategy="no",
    learning_rate = 1e-4,
    num_train_epochs = 3,
    # Log every 10 steps
    logging_steps = 10,
    load_best_model_at_end = False,
    # Fixed batch size of 4 for both evaluation and training
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    remove_unused_columns = False,
    push_to_hub= False,
    logging_dir="./logs",
)



In [22]:
# Assemble the Trainer from the model, arguments, datasets, metric function and collator defined above
trainer = Trainer(
    model = model,
    args= args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

# Save the trained model so it can be reused without training again; the training cell below is disabled to avoid retraining

In [19]:
# Training is expensive, so it is disabled by default. Set RETRAIN_MODEL = True
# to fine-tune again and overwrite the checkpoint stored in ./saved_model_2.
RETRAIN_MODEL = False
if RETRAIN_MODEL:
    trainer.train()
    model.save_pretrained('./saved_model_2')
    tokenizer.save_pretrained('./saved_model_2')

In [26]:
# Reload the fine-tuned tokenizer and classifier from the saved checkpoint
tokenizer = BertTokenizer.from_pretrained('./saved_model_2')
model = BertForSequenceClassification.from_pretrained('./saved_model_2')

# The Trainer built earlier keeps the model object it was constructed with, so
# rebinding the name `model` alone would leave it evaluating the old model.
# Rebuild the Trainer so that evaluation uses the reloaded checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [27]:
# Evaluate on the Trainer's eval dataset; the result includes the values returned by compute_metrics
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6012194752693176,
 'eval_Accuracy': 0.715,
 'eval_Precision': 0.5112249999999999,
 'eval_Recall': 0.715,
 'eval_f1': 0.5961807580174928,
 'eval_BLEU': 0.12714697781778297,
 'eval_BERTScore_Precision': 0.9667827486991882,
 'eval_BERTScore_Recall': 0.9667849540710449,
 'eval_BERTScore_f1': 0.9668379426002502,
 'eval_runtime': 87.6283,
 'eval_samples_per_second': 2.282,
 'eval_steps_per_second': 0.571}