# Model training: BERT (sentiment)

---

## Table of Contents

1. [Imports](#imports)
2. [Data loading and splitting](#data-loading-and-splitting)
3. [Setting training parameters](#setting-training-parameters)
4. [Model training](#model-training)
5. [Model evaluation](#model-evaluation)
6. [Summary](#summary)
7. [Model serialization](#model-serialization)

# Imports

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.utils.class_weight import compute_class_weight
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from torch.optim import lr_scheduler

# Data loading and splitting

In [2]:
base_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))

train_file = os.path.join(base_dir, 'train_sentiment.csv')
val_file = os.path.join(base_dir, 'val_sentiment.csv')
test_file = os.path.join(base_dir, 'test_sentiment.csv')

if not all([os.path.exists(train_file), os.path.exists(val_file), os.path.exists(test_file)]):
    sentiment_df = pd.read_parquet('../../data/sentiment_without_outliers/sentiment_without_outliers.parquet')
    sentiment_df = sentiment_df.drop(columns=['text_length'])
    
    train_data, temp_data = train_test_split(sentiment_df, test_size=0.3, stratify=sentiment_df['label'], random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    train_data.to_csv(train_file, index=False)
    val_data.to_csv(val_file, index=False)
    test_data.to_csv(test_file, index=False)
else:
    train_data = pd.read_csv(train_file)
    val_data = pd.read_csv(val_file)
    test_data = pd.read_csv(test_file)

# Setting training parameters

Due to the uneven distribution of classes in the dataset, the classes will be weighted.

In [3]:
class_weights = compute_class_weight('balanced', classes=pd.unique(train_data['label']), y=train_data['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
class LabeledDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.texts = data['text']
        self.labels = data['label']
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [6]:
def create_data_loader(data, tokenizer, max_len, batch_size):
    dataset = LabeledDataset(data, tokenizer, max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [7]:
train_loader = create_data_loader(train_data, tokenizer, max_len=256, batch_size=16)
val_loader = create_data_loader(val_data, tokenizer, max_len=256, batch_size=16)
test_loader = create_data_loader(test_data, tokenizer, max_len=256, batch_size=16)

# Model training

In [8]:
device = 'cuda'

In [9]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
loss_fn = CrossEntropyLoss(weight=class_weights.to(device))
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)



In [11]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, epoch):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        _, preds = outputs.logits.max(1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if batch_idx % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            accuracy = 100. * correct_predictions / total_samples
            print(f"Epoch: {epoch}. Batch {batch_idx}/{len(data_loader)} - Avg Loss: {avg_loss:.4f} - Accuracy: {accuracy:.2f}%")

    avg_loss = total_loss / len(data_loader)
    accuracy = 100. * correct_predictions / total_samples
    return avg_loss, accuracy

In [12]:
def eval_model(model, data_loader, device):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += labels.size(0)

    return correct_predictions.double() / total_predictions

In [13]:
for epoch in range(2):
    train_loss, train_accuracy = train_epoch(model, train_loader, loss_fn, optimizer, device, epoch)
    print(f"Train loss: {train_loss:.4f} - Train accuracy: {train_accuracy:.2f}%")
    val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation accuracy: {val_accuracy:.4f}")

Epoch: 0. Batch 0/4317 - Avg Loss: 1.0575 - Accuracy: 50.00%
Epoch: 0. Batch 10/4317 - Avg Loss: 1.0665 - Accuracy: 36.36%
Epoch: 0. Batch 20/4317 - Avg Loss: 1.0556 - Accuracy: 39.58%
Epoch: 0. Batch 30/4317 - Avg Loss: 1.0669 - Accuracy: 39.52%
Epoch: 0. Batch 40/4317 - Avg Loss: 1.0491 - Accuracy: 42.23%
Epoch: 0. Batch 50/4317 - Avg Loss: 1.0397 - Accuracy: 42.52%
Epoch: 0. Batch 60/4317 - Avg Loss: 1.0302 - Accuracy: 43.85%
Epoch: 0. Batch 70/4317 - Avg Loss: 1.0181 - Accuracy: 45.69%
Epoch: 0. Batch 80/4317 - Avg Loss: 1.0087 - Accuracy: 46.60%
Epoch: 0. Batch 90/4317 - Avg Loss: 0.9880 - Accuracy: 48.08%
Epoch: 0. Batch 100/4317 - Avg Loss: 0.9703 - Accuracy: 49.81%
Epoch: 0. Batch 110/4317 - Avg Loss: 0.9591 - Accuracy: 50.51%
Epoch: 0. Batch 120/4317 - Avg Loss: 0.9480 - Accuracy: 51.19%
Epoch: 0. Batch 130/4317 - Avg Loss: 0.9407 - Accuracy: 52.15%
Epoch: 0. Batch 140/4317 - Avg Loss: 0.9323 - Accuracy: 53.01%
Epoch: 0. Batch 150/4317 - Avg Loss: 0.9277 - Accuracy: 53.31%
Epo

# Model evaluation

In [14]:
test_accuracy = eval_model(model, test_loader, device)
print(f"Test accuracy: {test_accuracy:.4f}")

Test accuracy: 0.7383


# Summary

| Epoch        | Train Accuracy | Validation Accuracy |
|-------------|---------------|---------------------|
| **Epoch 1** | 70.64%        | 73.61%              |
| **Epoch 2** | 78.91%        | 73.69%              |

### Observation
- The **train accuracy** increases.
- The **validation accuracy** remains nearly constant (~73,6%), with a slight **increase**.

# Model serialization

In [None]:
model.save_pretrained('./bert_sentiment_model')
tokenizer.save_pretrained('./bert_sentiment_model')

('./bert_sentiment_model_overfitted\\tokenizer_config.json',
 './bert_sentiment_model_overfitted\\special_tokens_map.json',
 './bert_sentiment_model_overfitted\\vocab.txt',
 './bert_sentiment_model_overfitted\\added_tokens.json')