# 🎯 Text Specialist Model

In [1]:
# 📋 CELL 1: Setup & Imports (⏱️ ~10 seconds)
# Enhanced multilingual emotion prediction pipeline for Whisper-transcribed text

# Core imports
import json
import os
import time
import re
import string
from typing import List, Dict, Tuple
from collections import defaultdict

# Scientific computing
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

# Transformers and ML
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load label mapping
with open("artifacts/label2idx.json", "r") as f:
    label2idx = json.load(f)
    
print(f"Loaded {len(label2idx)} emotion classes: {list(label2idx.keys())}")
num_classes = len(label2idx)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Loaded 8 emotion classes: ['Anger', 'Fear', 'Joy', 'Neutral', 'Proud', 'Sadness', 'Surprise', 'Trust']


In [2]:
# 🚀 CELL 2: GPU Setup & Optimization (⏱️ ~5 seconds)
print("🔥 GPU SETUP")
print("=" * 40)

print(f"✅ CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"🎯 Device: {device}")
    print(f"🔧 GPU: {torch.cuda.get_device_name(0)}")
    
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"💾 Memory: {gpu_memory:.1f} GB")
    
    # Quick GPU test (reduced size for faster execution)
    print("🧪 Testing GPU...")
    test_tensor = torch.randn(100, 100).to(device)  # Reduced from 1000x1000 to 100x100
    result = torch.mm(test_tensor, test_tensor)
    print(f"✅ GPU test: Success")
    
    # Optimization settings
    torch.backends.cudnn.benchmark = True
    
    # Mixed precision check
    try:
        use_amp = True
        print("✅ Mixed precision available")
    except ImportError:
        use_amp = False
        print("⚠️  Mixed precision not available")
    
    # Recommended batch size based on GPU memory
    if gpu_memory >= 24:
        recommended_batch = 32
        print(f"🔥 High-end GPU: batch_size=32 recommended")
    elif gpu_memory >= 12:
        recommended_batch = 16
        print(f"🚀 Mid-range GPU: batch_size=16 recommended")
    elif gpu_memory >= 8:
        recommended_batch = 12
        print(f"⚡ Entry GPU: batch_size=12 recommended")
    else:
        recommended_batch = 4
        print(f"🤏 Low memory GPU: batch_size=4 recommended")
    
else:
    print("❌ CUDA not available - using CPU")
    print("⚠️  Training will be slow on CPU")
    recommended_batch = 8
    use_amp = False

print("=" * 40)

🔥 GPU SETUP
✅ CUDA Available: True
🎯 Device: cuda
🔧 GPU: NVIDIA GeForce RTX 4070
💾 Memory: 12.0 GB
🧪 Testing GPU...
✅ GPU test: Success
✅ Mixed precision available
⚡ Entry GPU: batch_size=12 recommended


In [3]:
# 🏋️ CELL 4: Simple Fast Training Function (⏱️ ~1 second)
def train_specialist_verbose(model, train_loader, val_loader, num_epochs=5, lr=1e-5, use_temporal=True, use_mixed_precision=True):
    """Lightweight training function - instant definition"""
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scaler = GradScaler() if use_mixed_precision and device.type == 'cuda' else None
    
    best_val_f1 = 0.0
    training_history = []
    
    print("🚀 TRAINING STARTED")
    print(f"📊 {len(train_loader.dataset):,} samples, {len(train_loader)} batches")
    
    for epoch in range(num_epochs):
        print(f"\n🔥 Epoch {epoch+1}/{num_epochs}")
        epoch_start = time.time()
        
        # Training
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        
        for batch_idx, (tokens, y, metadata) in enumerate(train_loader):
            input_ids = tokens['input_ids'].to(device)
            attention_mask = tokens['attention_mask'].to(device)
            y = y.to(device)
            temporal_features = metadata['temporal_features'].to(device) if use_temporal else None
            
            optimizer.zero_grad()
            
            if scaler:
                with autocast():
                    logits = model(input_ids=input_ids, attention_mask=attention_mask, temporal_features=temporal_features)
                    loss = criterion(logits, y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                logits = model(input_ids=input_ids, attention_mask=attention_mask, temporal_features=temporal_features)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
            
            train_loss += loss.item()
            pred = logits.argmax(dim=1)
            train_correct += (pred == y).sum().item()
            train_total += y.size(0)
            
            # Progress every 20 batches
            if batch_idx % 20 == 0:
                samples_done = min((batch_idx + 1) * train_loader.batch_size, len(train_loader.dataset))
                progress = (batch_idx + 1) / len(train_loader) * 100
                acc = (train_correct / train_total) * 100 if train_total > 0 else 0
                
                print(f"  📈 {samples_done:,} samples ({progress:.1f}%) | Loss: {loss.item():.3f} | Acc: {acc:.1f}%")
        
        # Validation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for tokens, y, metadata in val_loader:
                input_ids = tokens['input_ids'].to(device)
                attention_mask = tokens['attention_mask'].to(device)
                y = y.to(device)
                temporal_features = metadata['temporal_features'].to(device) if use_temporal else None
                
                logits = model(input_ids=input_ids, attention_mask=attention_mask, temporal_features=temporal_features)
                loss = criterion(logits, y)
                
                val_loss += loss.item()
                pred = logits.argmax(dim=1)
                val_correct += (pred == y).sum().item()
                val_total += y.size(0)
                
                all_preds.extend(pred.cpu().numpy())
                all_labels.extend(y.cpu().numpy())
        
        # Results
        train_acc = (train_correct / train_total) * 100
        val_acc = (val_correct / val_total) * 100
        f1_macro = f1_score(all_labels, all_preds, average="macro")
        epoch_time = time.time() - epoch_start
        
        print(f"✅ Train: {train_acc:.1f}% | Val: {val_acc:.1f}% | F1: {f1_macro:.3f} | Time: {epoch_time:.1f}s")
        
        if f1_macro > best_val_f1:
            best_val_f1 = f1_macro
            print(f"🏆 NEW BEST F1: {best_val_f1:.3f}")
        
        training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss / len(train_loader),
            'train_acc': train_acc,
            'val_loss': val_loss / len(val_loader),
            'val_acc': val_acc,
            'f1_macro': f1_macro,
            'epoch_time': epoch_time
        })
    
    print(f"\n🏆 DONE! Best F1: {best_val_f1:.3f}")
    return model, training_history

print("✅ Training function ready!")

✅ Training function ready!


In [4]:
# 🧹 CELL 3: Text Preprocessor for Whisper (⏱️ ~2 seconds)
class WhisperTextPreprocessor:
    """
    Specialized preprocessing for Whisper-transcribed text
    Handles common transcription artifacts and multilingual content
    """
    
    def __init__(self):
        # Common Whisper artifacts to clean
        self.whisper_artifacts = [
            r'\[.*?\]',  # Remove [MUSIC], [NOISE] etc.
            r'\(.*?\)',  # Remove (unintelligible) etc.
            r'\.{3,}',   # Remove excessive dots
            r'-{2,}',    # Remove excessive dashes
            r'\s{2,}',   # Multiple spaces to single space
        ]
        
        # Indonesian filler words and false starts
        self.filler_words = ['eh', 'em', 'uh', 'um', 'ya', 'gitu', 'jadi']
        
    def clean_whisper_text(self, text: str) -> str:
        """Clean common Whisper transcription artifacts"""
        if pd.isna(text) or text == '':
            return ""
            
        text = str(text).lower().strip()
        
        # Remove Whisper artifacts
        for pattern in self.whisper_artifacts:
            text = re.sub(pattern, ' ', text)
        
        # Remove excessive punctuation
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # Remove numbers (often transcription errors)
        text = re.sub(r'\d+', '', text)
        
        # Clean whitespace
        text = ' '.join(text.split())
        
        return text
    
    def extract_temporal_features(self, text: str, start_time: float, end_time: float) -> Dict:
        """Extract features that might correlate with temporal position"""
        duration = end_time - start_time
        word_count = len(text.split()) if text else 0
        speech_rate = word_count / duration if duration > 0 else 0
        
        return {
            'duration': duration,
            'word_count': word_count, 
            'speech_rate': speech_rate,
            'start_time_norm': start_time,  # Can normalize later
            'text_length': len(text)
        }

print("✅ WhisperTextPreprocessor class ready")

✅ WhisperTextPreprocessor class ready


In [5]:
# 📊 CELL 4: Enhanced Dataset with Temporal Features (⏱️ ~3 seconds)
class EnhancedTextDataset(torch.utils.data.Dataset):
    """
    Enhanced dataset for Whisper-transcribed text with temporal features
    """
    
    def __init__(self, manifest_csv, split="train", fold=0, 
                 model_name="bert-base-multilingual-cased", max_len=128, 
                 label2idx=None, use_temporal_features=True,
                 apply_text_cleaning=True):
        
        df = pd.read_csv(manifest_csv)
        
        # Filter by split and fold
        if split == "train":
            self.data = df[df['fold'] != fold].reset_index(drop=True)
        else:
            self.data = df[df['fold'] == fold].reset_index(drop=True)
        
        print(f"📊 {split.title()} split (fold {fold}): {len(self.data)} samples")
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label2idx = label2idx or {}
        self.use_temporal_features = use_temporal_features
        self.preprocessor = WhisperTextPreprocessor() if apply_text_cleaning else None
        
        # Preprocess text data
        if self.preprocessor:
            print("🧹 Cleaning Whisper text...")
            self.data['text_snippet'] = self.data['text_snippet'].apply(
                self.preprocessor.clean_whisper_text
            )
        
        # Extract temporal features if enabled
        if self.use_temporal_features:
            print("⏰ Extracting temporal features...")
            temporal_features = []
            for _, row in self.data.iterrows():
                features = self.preprocessor.extract_temporal_features(
                    row['text_snippet'], row['start'], row['end']
                ) if self.preprocessor else {
                    'duration': row['end'] - row['start'],
                    'word_count': len(str(row['text_snippet']).split()),
                    'speech_rate': 0,
                    'start_time_norm': row['start'],
                    'text_length': len(str(row['text_snippet']))
                }
                temporal_features.append(features)
            
            # Add temporal features to dataframe
            temporal_df = pd.DataFrame(temporal_features)
            for col in temporal_df.columns:
                self.data[f'temp_{col}'] = temporal_df[col]
        
        print(f"✅ Dataset ready: {len(self.data)} samples with temporal features")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['text_snippet'])
        label = self.label2idx[row['label']]
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        # Prepare temporal features
        temporal_features = []
        if self.use_temporal_features:
            temporal_cols = ['temp_duration', 'temp_word_count', 'temp_speech_rate', 
                           'temp_start_time_norm', 'temp_text_length']
            temporal_features = [float(row[col]) for col in temporal_cols if col in row]
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'temporal_features': torch.tensor(temporal_features, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
        }

print("✅ EnhancedTextDataset class ready")

✅ EnhancedTextDataset class ready


In [6]:
# 🧠 CELL 6: Fast Model Architecture (⏱️ ~3 seconds)
class FastEmotionClassifier(nn.Module):
    """Fast emotion classifier for quick testing"""
    
    def __init__(self, model_name="prajjwal1/bert-mini", num_classes=8, 
                 use_temporal=True, dropout_rate=0.3):
        super().__init__()
        
        # Use DistilBERT for faster loading and inference
        print(f"🚀 Loading fast model: {model_name}")
        self.bert = AutoModel.from_pretrained(model_name)
        self.model_name = model_name
        self.use_temporal = use_temporal
        
        # Get hidden size
        hidden_size = self.bert.config.hidden_size
        
        # Simple temporal processing
        if use_temporal:
            self.temporal_dim = 5
            self.temporal_processor = nn.Linear(self.temporal_dim, 32)
            combined_size = hidden_size + 32
        else:
            combined_size = hidden_size
        
        # Simple classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(combined_size, num_classes)
        )
        
        print(f"✅ Fast model ready with {sum(p.numel() for p in self.parameters()):,} parameters")
    
    def forward(self, input_ids, attention_mask, temporal_features=None):
        # BERT encoding
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        
        # Add temporal features if available
        if self.use_temporal and temporal_features is not None:
            temporal_processed = self.temporal_processor(temporal_features)
            combined_features = torch.cat([cls_output, temporal_processed], dim=1)
        else:
            combined_features = cls_output
        
        return self.classifier(combined_features)

# Initialize ultra-lightweight model
print("🧠 Initializing Ultra-Lightweight Emotion Classifier...")
model = FastEmotionClassifier(
    model_name="prajjwal1/bert-mini",  # Ultra-lightweight: only 11M parameters!
    num_classes=num_classes,
    use_temporal=True,
    dropout_rate=0.3
)

model = model.to(device)
print(f"✅ Model ready on {device}")

🧠 Initializing Ultra-Lightweight Emotion Classifier...
🚀 Loading fast model: prajjwal1/bert-mini




  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


✅ Fast model ready with 11,173,064 parameters
✅ Model ready on cuda


In [7]:
# 🔧 CELL 7: Fast Dataset Loading (⏱️ ~5-10 seconds)
print("🔧 FAST DATASET LOADING")
print("=" * 40)

# Quick file discovery
artifacts_path = "artifacts" 
available_files = [f for f in os.listdir(artifacts_path) if f.endswith('.csv')]

# Find dataset file quickly
manifest_file = None
for preferred in ['train_manifest_fold0.csv', 'train_manifest_all.csv']:
    if preferred in available_files:
        manifest_file = f"artifacts/{preferred}"
        break

if not manifest_file:
    manifest_file = f"artifacts/{available_files[0]}"  # Use first available

print(f"📂 Using: {manifest_file}")

# Load data
df = pd.read_csv(manifest_file)
print(f"📊 Dataset: {df.shape[0]:,} samples")

# Add timing columns if missing
if not all(col in df.columns for col in ['start', 'end']):
    df['start'] = 0.0
    df['end'] = 1.0

# Quick train/val split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# FULL MODE - Use ALL available data (since data was already reduced in preprocessing)
TRAINING_MODE = "FULL"
print(f"🎯 Using ALL available data (already preprocessed/reduced)")
print(f"📊 Full dataset: {len(train_df):,} train, {len(val_df):,} validation")

# Save splits
train_df.to_csv("artifacts/temp_train_full.csv", index=False)
val_df.to_csv("artifacts/temp_val_full.csv", index=False)

# Simple collation function
def collate_fn_simple(batch):
    tokens = {
        'input_ids': torch.stack([item['input_ids'] for item in batch]),
        'attention_mask': torch.stack([item['attention_mask'] for item in batch])
    }
    labels = torch.stack([item['label'] for item in batch])
    temporal_features = torch.stack([item['temporal_features'] for item in batch])
    metadata = {'temporal_features': temporal_features}
    return tokens, labels, metadata

# Fast dataset class (minimal processing)
class FastTextDataset(torch.utils.data.Dataset):
    def __init__(self, csv_path, label2idx, model_name, max_len=128):
        self.data = pd.read_csv(csv_path)
        self.label2idx = label2idx
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['text_snippet'])[:200]  # Limit text length for speed
        label = self.label2idx[row['label']]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        # Simple temporal features (no complex computation)
        temporal_features = torch.tensor([
            row.get('end', 1.0) - row.get('start', 0.0),  # duration
            len(text.split()),  # word count
            0.5,  # dummy speech rate
            row.get('start', 0.0),  # start time
            len(text)  # text length
        ], dtype=torch.float)
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
            'temporal_features': temporal_features
        }

print("📦 Creating full datasets...")

# Create full datasets using ALL available data
train_dataset = FastTextDataset("artifacts/temp_train_full.csv", label2idx, model.model_name)
val_dataset = FastTextDataset("artifacts/temp_val_full.csv", label2idx, model.model_name)

# Optimize batch size based on dataset size and GPU memory
dataset_size = len(train_dataset)
if dataset_size > 10000:
    batch_size = recommended_batch  # Use GPU-optimized batch size from Cell 2
elif dataset_size > 5000:
    batch_size = 16
else:
    batch_size = 12

print(f"🚀 Optimized batch size: {batch_size} for {dataset_size:,} samples")

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn_simple,
    num_workers=2,  # Enable workers for faster loading
    pin_memory=True  # Enable for better GPU performance
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn_simple,
    num_workers=2,  # Enable workers for faster loading
    pin_memory=True  # Enable for better GPU performance
)

# Store info
train_loader.training_mode = TRAINING_MODE
train_loader.estimated_time = "15-45 minutes (depending on dataset size)"

print(f"✅ Full datasets ready: {len(train_dataset):,} train, {len(val_dataset):,} val")
print(f"🚀 Batch size: {batch_size} | Mode: {TRAINING_MODE}")
print(f"💡 Using ALL available data - ready for full training!")

🔧 FAST DATASET LOADING
📂 Using: artifacts/train_manifest_fold0.csv
📊 Dataset: 4,271 samples
🎯 Using ALL available data (already preprocessed/reduced)
📊 Full dataset: 3,416 train, 855 validation
📦 Creating full datasets...




🚀 Optimized batch size: 12 for 3,416 samples
✅ Full datasets ready: 3,416 train, 855 val
🚀 Batch size: 12 | Mode: FULL
💡 Using ALL available data - ready for full training!


In [None]:
# 🏋️ CELL 8: Start Training (⏱️ ~15-45 minutes)
# IMPORTANT: Run Cell 7 (Dataset Loading) first

# Check if data loaders exist
try:
    _ = train_loader
    _ = val_loader
    print("✅ Data loaders found!")
    print(f"📊 Training: {len(train_loader.dataset):,} | Validation: {len(val_loader.dataset):,}")
except NameError:
    print("❌ Error: train_loader and val_loader not found!")
    print("💡 Please run Cell 7 first to load the datasets")

# Start training with optimized parameters for FULL mode
trained_model, training_history = train_specialist_verbose(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=5,  # Full training epochs
    lr=2e-5,       # Optimized learning rate for full dataset
    use_temporal=True,
    use_mixed_precision=use_amp
)

print("🎯 Training completed! Check training_history for results.")

✅ Data loaders found!
📊 Training: 3,416 | Validation: 855


  scaler = GradScaler() if use_mixed_precision and device.type == 'cuda' else None


🚀 TRAINING STARTED
📊 3,416 samples, 285 batches

🔥 Epoch 1/5


In [None]:
# 💾 CELL 9: Save Trained Model (⏱️ ~5 seconds)
def save_trained_model():
    """Save the trained model and training history"""
    
    try:
        # Check if trained model exists
        _ = trained_model
        _ = training_history
        print("✅ Trained model found!")
    except NameError:
        print("❌ Error: trained_model not found!")
        print("💡 Please run Cell 8 first to train the model")
        return False
    
    print("💾 Saving Trained Model...")
    print("=" * 40)
    
    # Save model
    model_path = "checkpoints/text_specialist_model.pth"
    torch.save({
        'model_state_dict': trained_model.state_dict(),
        'model_config': {
            'model_name': trained_model.model_name,
            'num_classes': num_classes,
            'use_temporal': True,
            'dropout_rate': 0.3
        },
        'label2idx': label2idx,
        'training_history': training_history
    }, model_path)
    
    print(f"✅ Model saved to: {model_path}")
    
    # Save training history
    history_path = "checkpoints/training_history.json"
    import json
    with open(history_path, 'w') as f:
        json.dump(training_history, f, indent=2)
    
    print(f"✅ Training history saved to: {history_path}")
    
    # Print final summary
    if training_history:
        best_f1 = max(epoch['f1_macro'] for epoch in training_history)
        final_f1 = training_history[-1]['f1_macro']
        print(f"\n🏆 Training Summary:")
        print(f"   🎯 Best F1 Score: {best_f1:.4f}")
        print(f"   📊 Final F1 Score: {final_f1:.4f}")
        print(f"   📈 Epochs trained: {len(training_history)}")
    
    return True

# Uncomment the line below to save the trained model
# save_trained_model()
print("💡 Uncomment the last line above to save trained model")