# Mood-Based Restaurant Classifier - Model Training
Fine-tuning DistilBERT for multi-label mood classification

## Setup and Installation

In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn tqdm -q

In [None]:
# Clone repository (run only in Colab)
import os
if 'COLAB_GPU' in os.environ:
    !git clone https://github.com/Rohanjain2312/mood-based-restaurant-recommender.git
    %cd mood-based-restaurant-recommender

In [None]:
import json
import pandas as pd
import numpy as np
import torch
# AdamW comes from PyTorch: the copy that used to live in transformers was
# deprecated and is no longer importable from recent transformers releases.
# Note that PyTorch's default optimizer settings apply.
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')

## Load and Explore Data

In [None]:
# Read the labeled review records from the JSON file on disk
with open('data/labeled/labeled_reviews.json', mode='r', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Sanity check: number of records and the first record
print(f'Total reviews: {len(data)}')
print(f'Sample review:\n{data[0]}')

In [None]:
# Mood labels the classifier predicts; NUM_LABELS is derived from this list
MOODS = [
    'work',
    'date',
    'quick_bite',
    'budget',
    'family',
    'late_night',
    'celebration',
]
NUM_LABELS = len(MOODS)

print(f'Mood categories: {MOODS}')
print(f'Number of labels: {NUM_LABELS}')

In [None]:
# Build a DataFrame from the loaded review records
df = pd.DataFrame(data)

# Add one 0/1 indicator column per mood: 1 when the mood appears in the
# review's 'moods' entry, 0 otherwise
for mood in MOODS:
    df[mood] = df['moods'].apply(lambda assigned, target=mood: int(target in assigned))

preview_columns = ['review_text', 'moods'] + MOODS
print(f'\nDataFrame shape: {df.shape}')
print(f'\nFirst few rows:')
print(df[preview_columns].head())

In [None]:
# Count positive examples per mood, most frequent first
mood_counts = df[MOODS].sum().sort_values(ascending=False)

# Bar chart of the per-mood counts
fig, ax = plt.subplots(figsize=(10, 6))
mood_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Mood Distribution in Training Data')
ax.set_xlabel('Mood')
ax.set_ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Same counts as text, with each mood's share of all reviews
print('\nMood Distribution:')
total_reviews = len(df)
for mood, count in mood_counts.items():
    percentage = (count / total_reviews) * 100
    print(f'{mood}: {count} ({percentage:.1f}%)')

# Mean length of the 'moods' entry across reviews
avg_moods = df['moods'].map(len).mean()
print(f'\nAverage moods per review: {avg_moods:.2f}')

## Data Preparation

In [None]:
# Model inputs: review texts as a plain list, targets as a float32 matrix
# with one column per mood (same column order as MOODS)
texts = df['review_text'].tolist()
labels = df[MOODS].to_numpy().astype(np.float32)

print(f'Number of texts: {len(texts)}')
print(f'Labels shape: {labels.shape}')
print(f'Sample label vector: {labels[0]}')

In [None]:
# 70% train / 15% validation / 15% test, done as two successive random splits.
# Step 1: hold out 30% of the data as a temporary pool.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=SEED,
)

# Step 2: halve the pool into validation and test (15% of the original each).
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SEED,
)

print(f'Train set: {len(train_texts)} samples')
print(f'Validation set: {len(val_texts)} samples')
print(f'Test set: {len(test_texts)} samples')

In [None]:
# Tokenization settings
MODEL_NAME = 'distilbert-base-uncased'
MAX_LENGTH = 128  # every review is padded or truncated to this many tokens

# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

print(f'Tokenizer loaded: {MODEL_NAME}')
print(f'Max sequence length: {MAX_LENGTH}')

In [None]:
# Dataset that tokenizes reviews on access
class MoodDataset(Dataset):
    """Pairs review texts with their multi-label mood vectors.

    Tokenization happens per item inside ``__getitem__``; every item is padded
    or truncated to ``max_length`` tokens.

    Args:
        texts: Indexable collection of review texts.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Fixed token length of every encoded item.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D ``input_ids``, ``attention_mask`` and float ``labels``."""
        review = str(self.texts[idx])
        target = self.labels[idx]

        tokenized = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch axis; flatten it away
        sample = {
            key: tokenized[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample

# Wrap each split in a MoodDataset; all three share the tokenizer and max length
train_dataset, val_dataset, test_dataset = (
    MoodDataset(split_texts, split_labels, tokenizer, MAX_LENGTH)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

print('Datasets created successfully')

In [None]:
# Batch the three datasets
BATCH_SIZE = 16

# Only the training loader shuffles; validation and test keep dataset order
# (DataLoader does not shuffle unless asked to)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

print(f'Batch size: {BATCH_SIZE}')
print(f'Train batches: {len(train_loader)}')
print(f'Val batches: {len(val_loader)}')
print(f'Test batches: {len(test_loader)}')

## Model Setup

In [None]:
# Calculate class weights for imbalanced data
pos_counts = train_labels.sum(axis=0)
neg_counts = len(train_labels) - pos_counts

# A mood with no positive example in the training split would make the ratio
# below x / 0 = inf. BCEWithLogitsLoss then multiplies that infinite weight by
# the all-zero targets of the class, which yields NaN losses. Clamp the
# denominator to 1 so every weight stays finite; counts >= 1 are unaffected.
missing_moods = [mood for mood, count in zip(MOODS, pos_counts) if count == 0]
if missing_moods:
    print(f'Warning: no positive training examples for: {missing_moods}')
safe_pos_counts = np.maximum(pos_counts, 1)

# Weight = num_negative / num_positive for each class
pos_weights = torch.tensor(neg_counts / safe_pos_counts, dtype=torch.float).to(device)

print('Positive class weights (higher = more emphasis on minority class):')
for mood, weight in zip(MOODS, pos_weights):
    print(f'{mood}: {weight:.2f}')

In [None]:
# Load pre-trained DistilBERT with custom classification head
from torch import nn

class DistilBertForMultiLabelClassification(nn.Module):
    """Thin wrapper around ``DistilBertForSequenceClassification``.

    Loads the checkpoint with ``num_labels`` outputs and marks its config as a
    multi-label classification problem. The wrapped model is exposed as
    ``self.distilbert``.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        num_labels: Number of output labels for the classification head.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        # The pretrained class already ships a classifier head; only the
        # problem type on its config needs to be set.
        backbone = DistilBertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )
        backbone.config.problem_type = "multi_label_classification"
        self.distilbert = backbone

    def forward(self, input_ids, attention_mask, labels=None):
        """Delegate to the wrapped model and return its output object unchanged."""
        return self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

# Build the classifier and place it on the selected device in one step
# (nn.Module.to returns the module itself)
model = DistilBertForMultiLabelClassification(MODEL_NAME, NUM_LABELS).to(device)

print(f'Model loaded and moved to {device}')
print(f'\nModel architecture:\n{model}')

In [None]:
# Tunable training settings
EPOCHS = 4
LEARNING_RATE = 2e-5
WARMUP_STEPS = 100

# The scheduler is stepped once per training batch, so the schedule covers
# every batch of every epoch
total_steps = EPOCHS * len(train_loader)

# Binary cross-entropy on logits, weighting positives per mood
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Linear warmup followed by linear decay of the learning rate
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

print(f'Epochs: {EPOCHS}')
print(f'Learning rate: {LEARNING_RATE}')
print(f'Total training steps: {total_steps}')
print(f'Warmup steps: {WARMUP_STEPS}')

## Training Functions

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward pass, loss from ``criterion`` on the logits,
    backward pass, gradient-norm clipping at 1.0, then one optimizer step and
    one scheduler step.

    Returns:
        The mean of the per-batch losses (sum divided by ``len(dataloader)``).
    """
    model.train()
    batch_losses = []

    batches = tqdm(dataloader, desc='Training')

    for batch in batches:
        # Put the three tensors of the batch on the target device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()

        # Forward pass and loss on the raw logits
        logits = model(input_ids, attention_mask).logits
        loss = criterion(logits, labels)

        loss.backward()

        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        batch_losses.append(batch_loss)
        batches.set_postfix({'loss': batch_loss})

    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, criterion, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` without updating it.

    Predictions are obtained by applying a sigmoid to the logits and marking
    every probability strictly above ``threshold`` as positive.

    Returns:
        A tuple ``(avg_loss, f1_micro, f1_macro, f1_per_class,
        all_predictions, all_labels)`` where the last two are the stacked
        per-batch prediction and label arrays.
    """
    model.eval()
    loss_sum = 0
    prediction_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask).logits
            loss_sum += criterion(logits, labels).item()

            # Sigmoid + threshold turns logits into 0/1 predictions
            binarized = (torch.sigmoid(logits) > threshold).float()

            prediction_chunks.append(binarized.cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    # Stack the per-batch arrays into single matrices
    all_predictions = np.vstack(prediction_chunks)
    all_labels = np.vstack(label_chunks)

    avg_loss = loss_sum / len(dataloader)

    # Micro, macro and per-class F1, in that order
    f1_micro, f1_macro, f1_per_class = (
        f1_score(all_labels, all_predictions, average=mode, zero_division=0)
        for mode in ('micro', 'macro', None)
    )

    return avg_loss, f1_micro, f1_macro, f1_per_class, all_predictions, all_labels

## Training Loop

In [None]:
# Training history
history = {
    'train_loss': [],
    'val_loss': [],
    'val_f1_micro': [],
    'val_f1_macro': []
}

# Start below every reachable F1 so the first epoch always writes a checkpoint.
# Starting at 0 with a strict ">" comparison would save nothing when the macro
# F1 stays at 0, and the cells that later read from models/ would then fail.
best_f1 = float('-inf')
patience = 2
patience_counter = 0

# Create the checkpoint directory once, before training starts
checkpoint_dir = 'models/distilbert-mood-classifier'
os.makedirs(checkpoint_dir, exist_ok=True)

print('Starting training...\n')

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 50)

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)

    # Evaluate
    val_loss, val_f1_micro, val_f1_macro, val_f1_per_class, _, _ = evaluate(
        model, val_loader, criterion, device
    )

    # Update history
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['val_f1_micro'].append(val_f1_micro)
    history['val_f1_macro'].append(val_f1_macro)

    print(f'\nTrain Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}')
    print(f'Val F1 (micro): {val_f1_micro:.4f}')
    print(f'Val F1 (macro): {val_f1_macro:.4f}')

    print('\nPer-class F1 scores:')
    for mood, f1 in zip(MOODS, val_f1_per_class):
        print(f'  {mood}: {f1:.4f}')

    # Keep the checkpoint with the best validation macro F1 seen so far
    if val_f1_macro > best_f1:
        best_f1 = val_f1_macro
        patience_counter = 0

        # Save the wrapped Hugging Face model and the tokenizer side by side
        model.distilbert.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

        print(f'\nNew best model saved! F1: {best_f1:.4f}')
    else:
        patience_counter += 1
        print(f'\nNo improvement. Patience: {patience_counter}/{patience}')

    print('\n')

    # Early stopping: give up after `patience` consecutive epochs without improvement
    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs')
        break

print('Training complete!')

## Plot Training History

In [None]:
# Two side-by-side panels: losses on the left, validation F1 on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Loss plot
axes[0].plot(history['train_loss'], label='Train Loss', marker='o')
axes[0].plot(history['val_loss'], label='Val Loss', marker='o')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training and Validation Loss')
axes[0].legend()
axes[0].grid(True)

# F1 plot
axes[1].plot(history['val_f1_micro'], label='F1 Micro', marker='o')
axes[1].plot(history['val_f1_macro'], label='F1 Macro', marker='o')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('F1 Score')
axes[1].set_title('Validation F1 Scores')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()

# savefig does not create missing directories, so make sure models/ exists
# before writing the figure into it
os.makedirs('models', exist_ok=True)
plt.savefig('models/training_history.png', dpi=300, bbox_inches='tight')
plt.show()

## Final Evaluation on Test Set

In [None]:
# Rebuild the classifier from the saved checkpoint directory and move it to
# the selected device (nn.Module.to returns the module itself)
best_model = DistilBertForMultiLabelClassification(
    'models/distilbert-mood-classifier',
    NUM_LABELS,
).to(device)

print('Best model loaded for final evaluation')

In [None]:
# Score the reloaded best model on the held-out test split
(
    test_loss,
    test_f1_micro,
    test_f1_macro,
    test_f1_per_class,
    test_preds,
    test_labels_actual,
) = evaluate(best_model, test_loader, criterion, device)

print('Test Set Results:')
print('=' * 50)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test F1 (micro): {test_f1_micro:.4f}')
print(f'Test F1 (macro): {test_f1_macro:.4f}')
print('\nPer-mood Performance:')
print('-' * 50)

# Column i of the prediction/label matrices belongs to MOODS[i]
for mood_idx, (mood, f1) in enumerate(zip(MOODS, test_f1_per_class)):
    mood_truth = test_labels_actual[:, mood_idx]
    mood_pred = test_preds[:, mood_idx]

    precision = precision_score(mood_truth, mood_pred, zero_division=0)
    recall = recall_score(mood_truth, mood_pred, zero_division=0)

    print(f'{mood}:')
    print(f'  Precision: {precision:.4f}')
    print(f'  Recall: {recall:.4f}')
    print(f'  F1: {f1:.4f}')

In [None]:
# Collect the overall and per-mood test metrics as plain Python floats so
# they can be serialised to JSON
test_metrics = {
    'test_loss': float(test_loss),
    'test_f1_micro': float(test_f1_micro),
    'test_f1_macro': float(test_f1_macro),
    'per_mood_metrics': {
        mood: {
            'f1': float(test_f1_per_class[col]),
            'precision': float(
                precision_score(test_labels_actual[:, col], test_preds[:, col], zero_division=0)
            ),
            'recall': float(
                recall_score(test_labels_actual[:, col], test_preds[:, col], zero_division=0)
            ),
        }
        for col, mood in enumerate(MOODS)
    },
}

# Write the metrics next to the saved model
with open('models/test_metrics.json', 'w') as metrics_file:
    json.dump(test_metrics, metrics_file, indent=2)

print('\nTest metrics saved to models/test_metrics.json')

## Save Model to Google Drive (Colab)

In [None]:
# Mount Google Drive (Colab only)
# The presence of the COLAB_GPU environment variable is used as the signal
# that the notebook runs in Colab; outside Colab this cell does nothing.
if 'COLAB_GPU' in os.environ:
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Copy model to Drive
    # The checkpoint directory is copied first; the metrics file and the
    # training-history figure are then placed inside that copy.
    !cp -r models/distilbert-mood-classifier /content/drive/MyDrive/
    !cp models/test_metrics.json /content/drive/MyDrive/distilbert-mood-classifier/
    !cp models/training_history.png /content/drive/MyDrive/distilbert-mood-classifier/
    
    print('Model saved to Google Drive!')

## Next Steps
1. Download model from Google Drive
2. Push the model to GitHub (only if every file is under 100 MB; otherwise use Git LFS)
3. Upload to HuggingFace Hub
4. Integrate into backend API