# Homework 2: Fine-tuning Encoder Models

In this assignment, we will fine-tune three encoder models (BERT) for restaurant review classification.

## 1. Install dependencies

In [1]:
!pip install transformers datasets accelerate scikit-learn torch -q

## 2. Import libraries

In [2]:
import json
import time
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from collections import Counter

# Prefer CUDA when a GPU is visible, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f'Using device: {device}')
if has_cuda:
    # Report the name of the first visible GPU
    print(f'GPU: {torch.cuda.get_device_name(0)}')

Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Ti


## 3. Data loading and processing

In [3]:
# Read the JSONL file: one JSON object per line
with open('restaurants_reviews-327545-5892c5.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# One row per review record
df = pd.DataFrame(data)
print(f'Total samples: {len(df)}')

# Count of reviews per value of the `general` rating, in ascending rating order
rating_counts = df['general'].value_counts().sort_index()
print(f'\nRating distribution (general):')
print(rating_counts)

Total samples: 47139

Rating distribution (general):
general
0    43940
1      462
2      166
3      150
4      257
5     2164
Name: count, dtype: int64


In [4]:
# Keep only ratings 1, 3, 5 and remap them to contiguous class ids: 1 -> 0, 3 -> 1, 5 -> 2
label_mapping = {1: 0, 3: 1, 5: 2}
is_kept = df['general'].isin([1, 3, 5])
df_filtered = df.loc[is_kept].assign(
    label=lambda frame: frame['general'].map(label_mapping)
)
print(f'Samples after filtering (ratings 1, 3, 5): {len(df_filtered)}')

# Per-class counts after remapping, in ascending label order
label_counts = df_filtered['label'].value_counts().sort_index()
print(f'\nLabel distribution after remapping:')
print(label_counts)
print(f'\nLabel mapping: 1->0 (negative), 3->1 (neutral), 5->2 (positive)')

Samples after filtering (ratings 1, 3, 5): 2776

Label distribution after remapping:
label
0     462
1     150
2    2164
Name: count, dtype: int64

Label mapping: 1->0 (negative), 3->1 (neutral), 5->2 (positive)


In [5]:
# Stratified 70% / 15% / 15% split into train / validation / test
RANDOM_STATE = 42

# Step 1: carve off the 15% test set; the remaining 85% is train + validation
train_val_df, test_df = train_test_split(
    df_filtered,
    stratify=df_filtered['label'],
    test_size=0.15,
    random_state=RANDOM_STATE,
)

# Step 2: validation must be 15% of the full data, i.e. 0.15 / 0.85 (about 17.65%)
# of what is left; the rest (about 82.35%) becomes the training set
train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df['label'],
    test_size=0.15/0.85,
    random_state=RANDOM_STATE,
)

# Report split sizes as absolute counts and as a share of the filtered data
print(f'Train size: {len(train_df)} ({len(train_df)/len(df_filtered)*100:.1f}%)')
print(f'Val size: {len(val_df)} ({len(val_df)/len(df_filtered)*100:.1f}%)')
print(f'Test size: {len(test_df)} ({len(test_df)/len(df_filtered)*100:.1f}%)')

train_label_counts = train_df['label'].value_counts().sort_index()
print(f'\nTrain label distribution:')
print(train_label_counts)

Train size: 1942 (70.0%)
Val size: 417 (15.0%)
Test size: 417 (15.0%)

Train label distribution:
label
0     323
1     105
2    1514
Name: count, dtype: int64


## 4. Creating Dataset class

In [6]:
class ReviewDataset(Dataset):
    """Dataset that tokenizes one review text per lookup.

    Args:
        texts: Indexable collection of review texts (each item is passed through ``str``).
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=..., padding=...,
            max_length=..., return_tensors='pt')`` whose result is indexable by
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sample is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Number of samples equals the number of texts
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` tensors and a long ``labels`` tensor."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each tokenizer tensor to 1-D so a sample carries no extra leading axis
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

## 5. Function for model training and evaluation

In [7]:
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` pair.

    The predicted class of each sample is the arg-max over the last axis of the logits.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    return {'accuracy': accuracy}


def _epoch_with_min_eval_loss(log_history, default):
    """Return the epoch of the log entry with the lowest ``eval_loss``.

    Args:
        log_history: List of log dicts, as stored in ``trainer.state.log_history``.
        default: Value returned when no entry contains ``eval_loss``.
    """
    eval_entries = [entry for entry in log_history if 'eval_loss' in entry]
    if not eval_entries:
        return default
    # min() returns the first minimal entry, so ties resolve to the earliest epoch
    return min(eval_entries, key=lambda entry: entry['eval_loss'])['epoch']


def train_and_evaluate_model(
    model_name,
    train_df,
    val_df,
    test_df,
    num_labels=3,
    max_length=256,
    batch_size=16,
    num_epochs=10,
    learning_rate=2e-5,
    patience=3,
    seed=42
):
    """Fine-tune a sequence-classification model and score it on the test split.

    Training uses early stopping on validation loss and reloads the best
    checkpoint before the test evaluation.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        train_df, val_df, test_df: Data frames with ``text`` and ``label`` columns.
        num_labels: Number of output classes.
        max_length: Token length every sample is truncated / padded to.
        batch_size: Per-device batch size for both training and evaluation.
        num_epochs: Maximum number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        patience: Early-stopping patience, in evaluations without improvement.
        seed: Seed applied to torch before the model is built and passed to
            ``TrainingArguments``.

    Returns:
        Dict with keys ``model_name``, ``best_epoch``, ``time_per_iteration``,
        ``total_training_time`` (seconds), ``test_accuracy`` and ``trainer``.
    """
    print(f'\n{"="*60}')
    print(f'Training model: {model_name}')
    print(f'{"="*60}')

    # Seed torch BEFORE building the model: the loader reports a newly initialized
    # classifier head, and without a seed its random init differs between runs.
    torch.manual_seed(seed)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    def build_dataset(frame):
        # All three splits share the same tokenizer and sequence length
        return ReviewDataset(
            frame['text'].tolist(),
            frame['label'].tolist(),
            tokenizer,
            max_length
        )

    train_dataset = build_dataset(train_df)
    val_dataset = build_dataset(val_df)
    test_dataset = build_dataset(test_df)

    # One output directory per model; '/' in the hub id is not usable in a folder name
    output_dir = f'./results/{model_name.replace("/", "_")}'

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=50,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        save_total_limit=2,
        report_to='none',
        seed=seed,
        fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
    )

    # Training; perf_counter is a monotonic clock meant for measuring durations
    start_time = time.perf_counter()
    train_result = trainer.train()
    total_training_time = time.perf_counter() - start_time

    # Epoch with minimum validation loss. Computed before the test evaluation so
    # that only validation entries are present in the log history.
    best_epoch = _epoch_with_min_eval_loss(trainer.state.log_history, num_epochs)

    # Average wall-clock time per optimizer step. The timed span is the whole
    # trainer.train() call, so it also contains whatever runs inside it.
    total_steps = train_result.global_step
    time_per_iteration = total_training_time / total_steps if total_steps > 0 else 0

    # Evaluate on the test set under its own metric prefix so the logged test
    # metrics are not confused with validation ('eval_*') entries.
    test_results = trainer.evaluate(test_dataset, metric_key_prefix='test')
    test_accuracy = test_results.get('test_accuracy', 0)

    print(f'\nResults for {model_name}:')
    print(f'  Best epoch (min val loss): {best_epoch}')
    print(f'  Time per iteration: {time_per_iteration:.3f}s')
    print(f'  Total training time: {total_training_time:.1f}s ({total_training_time/60:.1f} min)')
    print(f'  Test accuracy: {test_accuracy:.4f}')

    return {
        'model_name': model_name,
        'best_epoch': best_epoch,
        'time_per_iteration': time_per_iteration,
        'total_training_time': total_training_time,
        'test_accuracy': test_accuracy,
        'trainer': trainer
    }

## 6. Model fine-tuning

In [8]:
# Checkpoints compared in this notebook
MODELS = [
    'sberbank-ai/ruBert-base',
    'cointegrated/rubert-tiny2',
    'google-bert/bert-base-multilingual-cased',
]

# Hyperparameters shared by all runs
LEARNING_RATE = 2e-5
MAX_LENGTH = 256   # tokens per review after truncation / padding
BATCH_SIZE = 16    # lower this on out-of-memory errors
NUM_EPOCHS = 10    # upper bound; early stopping may end training sooner
PATIENCE = 3       # evaluations without improvement before stopping

# Accumulates one result dict per trained model
results = list()

### 6.1 ruBert-base (Sberbank)

In [9]:
# Fine-tune ruBert-base with the shared hyperparameters and record the run
result_rubert = train_and_evaluate_model(
    'sberbank-ai/ruBert-base',
    train_df,
    val_df,
    test_df,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    batch_size=BATCH_SIZE,
    max_length=MAX_LENGTH,
)
results += [result_rubert]


Training model: sberbank-ai/ruBert-base


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7b65d4c8-60c7-4811-acd0-125173738072)')' thrown while requesting HEAD https://huggingface.co/ai-forever/ruBert-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4037,0.350391,0.880096
2,0.2477,0.276187,0.913669
3,0.1674,0.32694,0.901679
4,0.1263,0.34577,0.920863
5,0.0851,0.432346,0.901679



Results for sberbank-ai/ruBert-base:
  Best epoch (min val loss): 2.0
  Time per iteration: 0.212s
  Total training time: 129.4s (2.2 min)
  Test accuracy: 0.9089


### 6.2 rubert-tiny2 (Cointegrated)

In [10]:
# Fine-tune rubert-tiny2; the small model is run with twice the shared batch size
result_tiny = train_and_evaluate_model(
    'cointegrated/rubert-tiny2',
    train_df,
    val_df,
    test_df,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    batch_size=BATCH_SIZE * 2,
    max_length=MAX_LENGTH,
)
results += [result_tiny]


Training model: cointegrated/rubert-tiny2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9668,0.690167,0.779376
2,0.6538,0.488758,0.800959
3,0.4788,0.391381,0.884892
4,0.3941,0.37043,0.872902
5,0.3087,0.364995,0.882494
6,0.2879,0.352585,0.88729
7,0.2757,0.358365,0.882494
8,0.2687,0.353426,0.889688
9,0.2624,0.349355,0.892086
10,0.2548,0.351522,0.892086



Results for cointegrated/rubert-tiny2:
  Best epoch (min val loss): 9.0
  Time per iteration: 0.091s
  Total training time: 55.3s (0.9 min)
  Test accuracy: 0.8897


### 6.3 bert-base-multilingual-cased (Google)

In [11]:
# Fine-tune multilingual BERT with the shared hyperparameters and record the run
result_mbert = train_and_evaluate_model(
    'google-bert/bert-base-multilingual-cased',
    train_df,
    val_df,
    test_df,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    batch_size=BATCH_SIZE,
    max_length=MAX_LENGTH,
)
results += [result_mbert]


Training model: google-bert/bert-base-multilingual-cased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5485,0.441589,0.839329
2,0.3905,0.355497,0.889688
3,0.2703,0.402534,0.882494
4,0.241,0.42165,0.868106
5,0.212,0.417719,0.858513



Results for google-bert/bert-base-multilingual-cased:
  Best epoch (min val loss): 2.0
  Time per iteration: 0.214s
  Total training time: 130.3s (2.2 min)
  Test accuracy: 0.8609


## 7. Results table

In [12]:
# Build one summary row per trained model, rounding values for display
summary_rows = []
for r in results:
    summary_rows.append({
        'Model': r['model_name'],
        'Best Epoch (min val loss)': r['best_epoch'],
        'Time per Iteration (s)': round(r['time_per_iteration'], 3),
        'Total Training Time (min)': round(r['total_training_time'] / 60, 2),
        'Test Accuracy': round(r['test_accuracy'], 4)
    })
results_df = pd.DataFrame(summary_rows)

# Plain-text rendering of the table between two 80-character rules
banner = '='*80
print('\n' + banner)
print('RESULTS SUMMARY')
print(banner)
print(results_df.to_string(index=False))


RESULTS SUMMARY
                                   Model  Best Epoch (min val loss)  Time per Iteration (s)  Total Training Time (min)  Test Accuracy
                 sberbank-ai/ruBert-base                        2.0                   0.212                       2.16         0.9089
               cointegrated/rubert-tiny2                        9.0                   0.091                       0.92         0.8897
google-bert/bert-base-multilingual-cased                        2.0                   0.214                       2.17         0.8609


In [13]:
# Rich rendering of the summary; the highest test accuracy is shaded light green
results_df.style.highlight_max(color='lightgreen', subset=['Test Accuracy'])

Unnamed: 0,Model,Best Epoch (min val loss),Time per Iteration (s),Total Training Time (min),Test Accuracy
0,sberbank-ai/ruBert-base,2.0,0.212,2.16,0.9089
1,cointegrated/rubert-tiny2,9.0,0.091,0.92,0.8897
2,google-bert/bert-base-multilingual-cased,2.0,0.214,2.17,0.8609


## 8. Analysis of results

### Model Comparison Table

| Model | Best Epoch | Time per iteration (s) | Total training time (min) | Test Accuracy |
|--------|--------------|----------------------|---------------------------|---------------|
| sberbank-ai/ruBert-base | 2.0 | 0.212 | 2.16 | **0.9089** |
| cointegrated/rubert-tiny2 | 9.0 | 0.091 | 0.92 | 0.8897 |
| google-bert/bert-base-multilingual-cased | 2.0 | 0.214 | 2.17 | 0.8609 |

### Comparison of models

In this work, we fine-tuned three encoder models for the task of restaurant review classification into three classes (negative, neutral, positive):

1. **sberbank-ai/ruBert-base** - Russian-language BERT model from Sberbank with ~178M parameters
2. **cointegrated/rubert-tiny2** - compact Russian-language BERT model with ~29M parameters  
3. **google-bert/bert-base-multilingual-cased** - multilingual BERT model with ~178M parameters

### Key observations:

1. **Quality (accuracy)**:
   - **ruBert-base showed the best result (90.89%)** - the specialized Russian-language model is optimally suited for this task
   - **rubert-tiny2 is close (88.97%)** - at 6x smaller size, the model shows comparable quality
   - **mBERT lags behind (86.09%)** - the multilingual vocabulary is less effective for Russian text

2. **Training speed**:
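   - rubert-tiny2 trains about **2.3x faster** (0.92 min vs 2.16 min) due to its smaller size, even though it ran for all 10 epochs
   - ruBert-base and mBERT have comparable time per iteration (~0.21 s)
   - Time per iteration for the tiny model is 2.3x smaller (0.091 s vs 0.212 s); note that the tiny model was trained with batch size 32 rather than 16, so each of its iterations processes twice as many samples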

3. **Convergence**:
   - ruBert-base reached minimum val loss at **epoch 2** - fast convergence
   - mBERT - also at **epoch 2**
   - rubert-tiny2 - at **epoch 9** - slower convergence, but still faster in total time
   - Early stopping effectively prevents overfitting

### Conclusions and recommendations:

1. **Best choice for quality**: `sberbank-ai/ruBert-base` (90.89% accuracy)
   - Recommended for production systems where quality is critical

2. **Best choice for quality/speed ratio**: `cointegrated/rubert-tiny2` (88.97% accuracy)
   - Quality loss of about 1.9 percentage points (88.97% vs 90.89%) with a 2.3x speedup
   - Ideal for production with limited resources or latency requirements

3. **mBERT** should only be used for multilingual tasks where support for multiple languages simultaneously is needed

### Data features:

- The dataset is unbalanced: there are significantly more positive reviews (2164) than negative (462) and neutral (150)
- Stratified sampling was used to preserve class distribution
- A total of 2776 reviews were used (train: 1942, val: 417, test: 417)

In [14]:
# Per-model recap: test accuracy, training time in minutes, and best epoch
print('\nFinal Model Comparison:')
print('-' * 60)
for r in results:
    # One print call per model; sep='\n' puts each field on its own line
    print(
        f"\n{r['model_name']}:",
        f"  - Test Accuracy: {r['test_accuracy']:.4f}",
        f"  - Training Time: {r['total_training_time']/60:.1f} min",
        f"  - Best Epoch: {r['best_epoch']}",
        sep='\n',
    )


Final Model Comparison:
------------------------------------------------------------

sberbank-ai/ruBert-base:
  - Test Accuracy: 0.9089
  - Training Time: 2.2 min
  - Best Epoch: 2.0

cointegrated/rubert-tiny2:
  - Test Accuracy: 0.8897
  - Training Time: 0.9 min
  - Best Epoch: 9.0

google-bert/bert-base-multilingual-cased:
  - Test Accuracy: 0.8609
  - Training Time: 2.2 min
  - Best Epoch: 2.0
