## Part Three: Fine-tuning a BERT neural network model based on the Transformer architecture

In [1]:
import os
import torch
import transformers
import platform
import psutil
import datetime
import socket

config_summary = {
    "Timestamp": datetime.datetime.now().isoformat(),
    "Hostname": socket.gethostname(),
    "OS": platform.platform(),
    "Python Version": platform.python_version(),
    "Torch Version": torch.__version__,
    "CUDA Available": torch.cuda.is_available(),
    "CUDA Version": torch.version.cuda if torch.cuda.is_available() else "N/A",
    "Device Count": torch.cuda.device_count(),
    "Current Device": torch.cuda.current_device() if torch.cuda.is_available() else "N/A",
    "Device Name": torch.cuda.get_device_name(torch.cuda.current_device()) if torch.cuda.is_available() else "N/A",
    "Transformers Version": transformers.__version__,
    "Total CPU Cores": psutil.cpu_count(logical=False),
    "Total RAM (GB)": round(psutil.virtual_memory().total / 1e9, 2),
}

print("Training Environment Summary\n" + "-"*40)
for k, v in config_summary.items():
    print(f"{k:>30}: {v}")

Training Environment Summary
----------------------------------------
                     Timestamp: 2025-04-23T19:19:50.537793
                      Hostname: autodl-container-ddce40b55a-be393d15
                            OS: Linux-5.15.0-94-generic-x86_64-with-glibc2.35
                Python Version: 3.12.3
                 Torch Version: 2.5.1+cu124
                CUDA Available: True
                  CUDA Version: 12.4
                  Device Count: 2
                Current Device: 0
                   Device Name: NVIDIA GeForce RTX 4090 D
          Transformers Version: 4.51.3
               Total CPU Cores: 96
                Total RAM (GB): 1081.83


In [2]:
import pyarrow.parquet as pq
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
from src import *

2025-04-23 19:19:52.783610: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-23 19:19:52.796687: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745407192.811108    2100 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745407192.815276    2100 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745407192.827097    2100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
business_df = pd.read_json('../database/yelp_academic_dataset_business.json', lines=True)
review_df = pd.read_json('../database/yelp_academic_dataset_review.json', lines=True)
restaurants = business_df[business_df['categories'].str.contains('Restaurant', na=False)]
restaurant_reviews = review_df[review_df['business_id'].isin(restaurants['business_id'])]

In [4]:
train_size = int(0.7 * len(restaurant_reviews))
train = restaurant_reviews.iloc[:train_size][['text', 'stars']]
test = restaurant_reviews.iloc[train_size:][['text', 'stars']]

text_train = train['text'].values
labels_train = train['stars'].values.astype(int) - 1
text_test = test['text'].values
labels_test = test['stars'].values.astype(int) - 1

In [5]:
# Initialize the DistilBERT tokenizer with specified configuration
tokenizer = DistilBertTokenizerFast.from_pretrained(
    '../assets/distilbert-base-uncased',   # Use the uncased DistilBERT base model
    do_lower_case=True                     # Convert input text to lowercase
)

text_train = train.text.values
labels_train = train.stars.values
text_test = test.text.values
labels_test = test.stars.values

# Store token IDs
token_id_train = []
token_id_test = []

# Store attention masks
attention_masks_train = []
attention_masks_test = []

In [6]:
# Encode each text sample in the training set
for sample in tqdm(text_train):
    encoding_dict = Bert_preprocess(sample, tokenizer)
    token_id_train.append(encoding_dict['input_ids'])
    attention_masks_train.append(encoding_dict['attention_mask'])

# Concatenate all token IDs and attention masks in the list into single tensors
token_id_train = torch.cat(token_id_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)

100%|██████████| 3307278/3307278 [31:24<00:00, 1755.33it/s] 


In [7]:
# Encode each text sample in the test set
for sample in tqdm(text_test):
    encoding_dict = Bert_preprocess(sample, tokenizer)
    token_id_test.append(encoding_dict['input_ids'])
    attention_masks_test.append(encoding_dict['attention_mask'])

# Concatenate all token IDs and attention masks into single tensors
token_id_test = torch.cat(token_id_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)

100%|██████████| 1417406/1417406 [13:41<00:00, 1724.74it/s]


This step takes a long time, so we save the results now for later use to avoid reprocessing

In [8]:
# Save token IDs and attention masks
torch.save(token_id_train, '../assets/tokenization/train_token_id.pt')
torch.save(attention_masks_train, '../assets/tokenization/train_attention_masks.pt')
torch.save(token_id_test, '../assets/tokenization/test_token_id.pt')
torch.save(attention_masks_test, '../assets/tokenization/test_attention_masks.pt')

In [9]:
# Note: The following code may raise a FutureWarning when loading the .pt file.
# The warning message indicates that torch.load() is being used with the default parameter weights_only=False,
# which enables pickle-based deserialization and may pose a security risk (e.g., loading malicious code).
# This warning is issued by PyTorch to inform users of an upcoming default behavior change, and it does not affect current execution.
# To completely suppress this warning, add the following lines:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [10]:
# # Reload the saved token IDs and attention masks
# token_id_train = torch.load('../assets/tokenization/train_token_id.pt')
# attention_masks_train = torch.load('../assets/tokenization/train_attention_masks.pt')
# token_id_test = torch.load('../assets/tokenization/test_token_id.pt')
# attention_masks_test = torch.load('../assets/tokenization/test_attention_masks.pt')

# # Verify that the data has been loaded correctly
# print("Training Token IDs:", token_id_train.shape)
# print("Training Attention Masks:", attention_masks_train.shape)
# print("Testing Token IDs:", token_id_test.shape)
# print("Testing Attention Masks:", attention_masks_test.shape)

In [11]:
# One-hot encode the labels and convert them to torch float type
labels_train = F.one_hot(torch.tensor(labels_train), num_classes=6).to(torch.float)
labels_test = F.one_hot(torch.tensor(labels_test), num_classes=6).to(torch.float)

In [12]:
# Load the DistilBertForSequenceClassification model
model_distilbert_cls = DistilBertForSequenceClassification.from_pretrained(
    '../assets/distilbert-base-uncased',
    num_labels=6,                # Set the number of output labels to 6
    output_attentions=False,    # Do not output attention weights
    output_hidden_states=False, # Do not output hidden states
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ../assets/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
validation_ratio = 0.2
batch_size = 32

# Split the training data into training and validation sets
train_idx, val_idx = train_test_split(
    np.arange(len(labels_train)),     # Generate an index array equal to the number of training labels
    test_size=validation_ratio,       # Set validation set size to 20%
    shuffle=True,                     # Shuffle the data before splitting
    stratify=labels_train             # Stratify by labels to maintain class distribution
)

# Create TensorDatasets for training and validation sets
train_set = TensorDataset(token_id_train[train_idx],
                          attention_masks_train[train_idx],
                          labels_train[train_idx])

validate_set = TensorDataset(token_id_train[val_idx],
                             attention_masks_train[val_idx],
                             labels_train[val_idx])

test_set = TensorDataset(token_id_test,
                         attention_masks_test,
                         labels_test)


train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    validate_set,
    sampler=SequentialSampler(validate_set),
    batch_size=batch_size
)

test_dataloader = DataLoader(
    test_set,
    sampler=RandomSampler(test_set),
    batch_size=batch_size
)

In [14]:
# Configure the optimizer
optimizer = torch.optim.AdamW(model_distilbert_cls.parameters(), 
                              lr=2e-5,         # Learning rate
                              eps=1e-08)       # Epsilon for numerical stability

# Run on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_distilbert_cls = model_distilbert_cls.to(device)

# Recommended number of training epochs according to the paper: 2, 3, or 4
epochs = 4

In [15]:
for _ in trange(epochs, desc='Epoch'):

    # ========== Training Phase ==========

    # Set the model to training mode
    model_distilbert_cls.train()

    # Tracking variables
    train_loss = 0
    nb_train_examples, nb_train_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()

        # Forward pass
        outputs = model_distilbert_cls(b_input_ids,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
        loss = outputs['loss']

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_distilbert_cls.parameters(), 1.0)
        optimizer.step()

        # Update tracking variables
        train_loss += loss.item()
        nb_train_examples += b_input_ids.size(0)
        nb_train_steps += 1

    # ========== Validation Phase ==========

    # Set the model to evaluation mode
    model_distilbert_cls.eval()

    # Tracking variables
    val_accuracy = []
    eval_loss = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Forward pass
            outputs = model_distilbert_cls(b_input_ids,
                                        attention_mask=b_input_mask,
                                        labels=b_labels)
        loss = outputs['loss']
        logits = outputs['logits']

        eval_loss.append(loss.item())
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Compute validation metrics
        batch_accuracy = Bert_compute_batch_accuracy(logits, label_ids)
        val_accuracy.append(batch_accuracy)

    train_loss_avg = train_loss / nb_train_steps
    val_acc_avg = sum(val_accuracy) / len(val_accuracy)

    print('\n\t - Training Loss: {:.4f}'.format(train_loss_avg))
    print('\t - Validation Accuracy: {:.4f}'.format(val_acc_avg))

    with open('training_log.txt', 'a') as f:
        f.write('Epoch {}:\n'.format(_ + 1))
        f.write('\t - Training Loss: {:.4f}\n'.format(train_loss_avg))
        f.write('\t - Validation Accuracy: {:.4f}\n\n'.format(val_acc_avg))

Epoch:  25%|██▌       | 1/4 [3:56:12<11:48:36, 14172.03s/it]


	 - Training Loss: 0.1876
	 - Validation Accuracy: 0.7454


Epoch:  50%|█████     | 2/4 [7:52:35<7:52:37, 14178.54s/it] 


	 - Training Loss: 0.1746
	 - Validation Accuracy: 0.7516


Epoch:  75%|███████▌  | 3/4 [11:48:45<3:56:14, 14174.77s/it]


	 - Training Loss: 0.1672
	 - Validation Accuracy: 0.7501


Epoch: 100%|██████████| 4/4 [15:44:58<00:00, 14174.51s/it]  


	 - Training Loss: 0.1593
	 - Validation Accuracy: 0.7495





<table border="1" style="text-align: center; border-collapse: collapse;">
  <thead>
    <tr>
      <th>Model Type</th>
      <th>Number of Samples</th>
      <th>Tokens</th>
      <th>Epochs</th>
      <th>Learning Rate</th>
      <th>Optimizer Decay</th>
      <th>Training Loss</th>
      <th>Validation Accuracy</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Basic</td>
      <td>50,000</td>
      <td>256</td>
      <td>3</td>
      <td>5e-5</td>
      <td>1e-08</td>
      <td>0.2157</td>
      <td>0.6926</td>
    </tr>
    <tr>
      <td>Basic</td>
      <td>100,000</td>
      <td>256</td>
      <td>3</td>
      <td>5e-5</td>
      <td>1e-08</td>
      <td>0.2251</td>
      <td>0.6970</td>
    </tr>
    <tr>
      <td>Basic</td>
      <td>100,000</td>
      <td>256</td>
      <td>2</td>
      <td>2e-5</td>
      <td>1e-08</td>
      <td>0.2296</td>
      <td>0.7033</td>
    </tr>
    <tr>
      <td>DistilBERT</td>
      <td>50,000</td>
      <td>256</td>
      <td>3</td>
      <td>2e-5</td>
      <td>1e-08</td>
      <td>0.1198</td>
      <td>0.6770</td>
    </tr>
    <tr>
      <td>DistilBERT</td>
      <td>100,000</td>
      <td>256</td>
      <td>3</td>
      <td>2e-5</td>
      <td>1e-08</td>
      <td>0.1949</td>
      <td>0.6957</td>
    </tr>
    <tr>
      <td>DistilBERT</td>
      <td>100,000</td>
      <td>512</td>
      <td>3</td>
      <td>2e-5</td>
      <td>1e-08</td>
      <td>0.1936</td>
      <td>0.7076</td>
    </tr>
    <tr style="font-weight: bold;">
      <td>DistilBERT</td>
      <td>200,000</td>
      <td>512</td>
      <td>4</td>
      <td>2e-5</td>
      <td>1e-08</td>
      <td>0.1616</td>
      <td>0.7117</td>
    </tr>
  </tbody>
</table>


In [16]:
# Save model parameters after completing all training epochs
torch.save(model_distilbert_cls.state_dict(), '../assets/weights/model_distilbert_cls.pth')

# Save the entire model architecture and weights
torch.save(model_distilbert_cls, '../assets/weights/model_distilbert_cls_full.pth')

In [17]:
model_distilbert_cls.eval()

# Initialize tracking variables for evaluation results
total_eval_accuracy = 0
total_eval_loss = 0

# Disable gradient computation
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_attention_masks, b_labels = tuple(t.to(device) for t in batch)

        # Forward pass
        outputs = model_distilbert_cls(b_input_ids, 
                                    attention_mask=b_attention_masks, 
                                    labels=b_labels)
        
        # Compute loss
        loss = outputs['loss']
        total_eval_loss += loss.item()

        # Get predicted logits
        logits = outputs['logits']
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Update accuracy tracking
        total_eval_accuracy += Bert_compute_batch_accuracy(logits, label_ids)

avg_test_loss = total_eval_loss / len(test_dataloader)
avg_test_accuracy = total_eval_accuracy / len(test_dataloader)

print(f"Test Loss: {avg_test_loss}")
print(f"Test Accuracy: {avg_test_accuracy}")

Test Loss: 0.18068653614668875
Test Accuracy: 0.7500254925422555
