## 1. Setup Environment

In [None]:
# Install required packages
!pip install -q transformers datasets torch accelerate scikit-learn pandas numpy tqdm joblib

In [None]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib

# Set seed for reproducibility
set_seed(42)

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB


## 2. Upload Data Files

Upload these files from your local `data/processed/` and `models/` directories:
- `train.csv` (105,286 records)
- `val.csv` (13,161 records)
- `test.csv` (13,161 records)
- `label_encoder.pkl` (from models/ directory)

In [None]:
from google.colab import files

print("Upload train.csv, val.csv, test.csv, and label_encoder.pkl")
uploaded = files.upload()

# Verify uploads
required_files = ['train.csv', 'val.csv', 'test.csv', 'label_encoder.pkl']
for f in required_files:
    if f in uploaded:
        print(f"✓ {f} uploaded successfully")
    else:
        print(f"✗ {f} is missing!")

Upload train.csv, val.csv, test.csv, and label_encoder.pkl


Saving label_encoder.pkl to label_encoder (1).pkl
Saving test.csv to test.csv
Saving train.csv to train.csv
Saving val.csv to val.csv
✓ train.csv uploaded successfully
✓ val.csv uploaded successfully
✓ test.csv uploaded successfully
✗ label_encoder.pkl is missing!


## 3. Load and Prepare Data

In [None]:
# Load datasets
print("Loading datasets...")
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')

# Load label encoder
label_encoder = joblib.load('label_encoder.pkl')

print(f"Train: {len(train_df):,} records")
print(f"Val: {len(val_df):,} records")
print(f"Test: {len(test_df):,} records")
print(f"Classes: {label_encoder.classes_}")

# Check sentiment distribution
print("\nSentiment Distribution (Train):")
print(train_df['sentiment'].value_counts())
print("\nSample cleaned text:")
print(train_df['cleaned_text'].iloc[0][:200])

Loading datasets...
Train: 105,286 records
Val: 13,161 records
Test: 13,161 records
Classes: ['Negative' 'Neutral' 'Positive']

Sentiment Distribution (Train):
sentiment
Neutral     45768
Positive    42816
Negative    16702
Name: count, dtype: int64

Sample cleaned text:
Dhinilo qualify aithe asalu yem work istharu salary yentha undochu. Nenu raasanu e exam


In [None]:
# Load tokenizer
print("Loading DistilBERT tokenizer...")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode labels
train_df['label'] = label_encoder.transform(train_df['sentiment'])
val_df['label'] = label_encoder.transform(val_df['sentiment'])
test_df['label'] = label_encoder.transform(test_df['sentiment'])

# Create HuggingFace datasets
print("Creating datasets...")
train_dataset = Dataset.from_pandas(train_df[['cleaned_text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['cleaned_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['cleaned_text', 'label']])

# Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples['cleaned_text'],
        padding='max_length',
        truncation=True,
        max_length=256
    )

# Tokenize datasets
print("Tokenizing datasets (this takes ~2-3 minutes)...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_text'])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_text'])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=['cleaned_text'])

train_dataset.set_format('torch')
val_dataset.set_format('torch')
test_dataset.set_format('torch')

print("✓ Data preparation complete!")

Loading DistilBERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Creating datasets...
Tokenizing datasets (this takes ~2-3 minutes)...


Map:   0%|          | 0/105286 [00:00<?, ? examples/s]

Map:   0%|          | 0/13161 [00:00<?, ? examples/s]

Map:   0%|          | 0/13161 [00:00<?, ? examples/s]

✓ Data preparation complete!


## 4. Load Model and Configure Training

In [None]:
# Load DistilBERT model
print("Loading DistilBERT model (66M parameters)...")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3
)

model.to(device)
print(f"✓ Model loaded on {device}")

Loading DistilBERT model (66M parameters)...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded on cuda


In [None]:
# Define metrics computation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./distilbert_checkpoints',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # GPU can handle larger batches
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    save_total_limit=2,
    report_to='none',
    fp16=True  # Mixed precision for faster training
)

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision (fp16): {training_args.fp16}")

Training configuration:
  Epochs: 3
  Batch size: 32
  Learning rate: 2e-05
  Mixed precision (fp16): True


## 5. Train Model

This will take approximately **15-25 minutes** on a T4 GPU.

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("="*60)
print("STARTING TRAINING")
print("="*60)
print("Target: Beat 88.57% baseline accuracy")
print("="*60)

# Train
train_result = trainer.train()

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)
print(f"Training time: {train_result.metrics['train_runtime']:.2f}s ({train_result.metrics['train_runtime']/60:.2f} mins)")
print(f"Training loss: {train_result.metrics['train_loss']:.4f}")

STARTING TRAINING
Target: Beat 88.57% baseline accuracy


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.1794,0.148973,0.953727,0.941844,0.95403
2,0.1127,0.128991,0.960033,0.950635,0.960264
3,0.0845,0.142279,0.963529,0.954463,0.963568



TRAINING COMPLETE!
Training time: 1765.28s (29.42 mins)
Training loss: 0.1722


## 6. Evaluate on Test Set

In [None]:
print("="*60)
print("EVALUATING ON TEST SET")
print("="*60)

# Predict on test set
test_results = trainer.predict(test_dataset)
test_metrics = test_results.metrics

print(f"\nTest Accuracy: {test_metrics['test_accuracy']:.4f} ({test_metrics['test_accuracy']*100:.2f}%)")
print(f"F1 Macro: {test_metrics['test_f1_macro']:.4f}")
print(f"F1 Weighted: {test_metrics['test_f1_weighted']:.4f}")

# Detailed classification report
predictions = np.argmax(test_results.predictions, axis=1)
true_labels = test_results.label_ids # Corrected line

report = classification_report(
    true_labels,
    predictions,
    target_names=label_encoder.classes_,
    digits=4
)

print("\nClassification Report:")
print(report)

# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)

# Per-class analysis
print("\nPer-Class F1 Scores:")
for i, class_name in enumerate(label_encoder.classes_):
    f1 = f1_score(true_labels, predictions, labels=[i], average='macro')
    print(f"  {class_name}: {f1:.4f}")

EVALUATING ON TEST SET



Test Accuracy: 0.9647 (96.47%)
F1 Macro: 0.9563
F1 Weighted: 0.9647

Classification Report:
              precision    recall  f1-score   support

    Negative     0.9240    0.9253    0.9246      2088
     Neutral     0.9856    0.9705    0.9780      5721
    Positive     0.9586    0.9738    0.9662      5352

    accuracy                         0.9647     13161
   macro avg     0.9561    0.9565    0.9563     13161
weighted avg     0.9649    0.9647    0.9647     13161


Confusion Matrix:
[[1932   37  119]
 [  63 5552  106]
 [  96   44 5212]]

Per-Class F1 Scores:
  Negative: 0.9246
  Neutral: 0.9780
  Positive: 0.9662


## 7. Compare with Baseline

In [None]:
print("="*60)
print("COMPARISON WITH BASELINE")
print("="*60)

baseline_accuracy = 0.8857
transformer_accuracy = test_metrics['test_accuracy']

print(f"\nBaseline (Linear SVM):  {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")
print(f"DistilBERT Transformer: {transformer_accuracy:.4f} ({transformer_accuracy*100:.2f}%)")

improvement = (transformer_accuracy - baseline_accuracy) * 100

if improvement > 0:
    print(f"\n✓ SUCCESS! Improvement: +{improvement:.2f}%")
    print(f"  Relative improvement: {(improvement/baseline_accuracy)*100:.2f}%")
else:
    print(f"\n⚠ Change: {improvement:.2f}%")

print("\n" + "="*60)
print("PHASE 3 COMPLETE")
print("="*60)

COMPARISON WITH BASELINE

Baseline (Linear SVM):  0.8857 (88.57%)
DistilBERT Transformer: 0.9647 (96.47%)

✓ SUCCESS! Improvement: +7.90%
  Relative improvement: 891.59%

PHASE 3 COMPLETE


## 8. Save Results and Model

In [None]:
# Save results to file
with open('distilbert_results.txt', 'w') as f:
    f.write("="*60 + "\n")
    f.write("DISTILBERT EVALUATION RESULTS (GPU Training)\n")
    f.write("="*60 + "\n\n")
    f.write(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}\n")
    f.write(f"F1 Macro: {test_metrics['test_f1_macro']:.4f}\n")
    f.write(f"F1 Weighted: {test_metrics['test_f1_weighted']:.4f}\n")
    f.write(f"Training Time: {train_result.metrics['train_runtime']:.2f}s\n\n")
    f.write("Classification Report:\n")
    f.write(report)
    f.write("\n\nConfusion Matrix:\n")
    f.write(str(cm))
    f.write("\n\n" + "="*60 + "\n")
    f.write("COMPARISON WITH BASELINE\n")
    f.write("="*60 + "\n")
    f.write(f"Baseline (Linear SVM): {baseline_accuracy:.4f}\n")
    f.write(f"DistilBERT: {transformer_accuracy:.4f}\n")
    f.write(f"Improvement: {improvement:+.2f}%\n")

print("Results saved to distilbert_results.txt")

# Save model
trainer.save_model('./distilbert_final_model')
print("Model saved to ./distilbert_final_model")

# Download results
print("\nDownloading results file...")
files.download('distilbert_results.txt')

Results saved to distilbert_results.txt
Model saved to ./distilbert_final_model

Downloading results file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 9. Download Model (Optional)

If you want to use the trained model locally:

In [None]:
# Zip and download model
!zip -r distilbert_final_model.zip distilbert_final_model/
files.download('distilbert_final_model.zip')
print("Model downloaded! Extract and place in your phase3_transformer_models/checkpoints/ directory")

  adding: distilbert_final_model/ (stored 0%)
  adding: distilbert_final_model/training_args.bin (deflated 53%)
  adding: distilbert_final_model/model.safetensors (deflated 8%)
  adding: distilbert_final_model/config.json (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded! Extract and place in your phase3_transformer_models/checkpoints/ directory


---

## Summary

**Phase 3 Implementation:**
- ✓ DistilBERT fine-tuned on 105K+ YouTube comments
- ✓ GPU-accelerated training (15-25 minutes vs 13+ hours on CPU)
- ✓ Production-grade implementation with HuggingFace Trainer
- ✓ Comprehensive evaluation metrics

**Next Steps:**
1. Download `distilbert_results.txt` and review results
2. Optionally download the trained model
3. Compare with your 88.57% baseline
4. Document findings in your project

**Optional Enhancement:**
- Train MuRIL model for better Hinglish performance (similar notebook)
- Experiment with different hyperparameters
- Try longer sequences (max_length=512) for better context