In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "3"

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("🚀 Starting BERT Model Development...")
print(f"💻 Using device: {device}")

🚀 Starting BERT Model Development...
💻 Using device: cpu


In [2]:
print("📊 Loading cleaned data from Member 1...")

# Read the three cleaned splits. Per the hint printed below, these files are
# expected to come from 01_data_cleaning.ipynb.
try:
    train_df = pd.read_csv('../data/train_clean.csv')
    test_df = pd.read_csv('../data/test_clean.csv')
    valid_df = pd.read_csv('../data/valid_clean.csv')
except FileNotFoundError:
    print("❌ Cleaned data not found!")
    print("Please run Member 1's notebook first (01_data_cleaning.ipynb)")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown (losing all state), whereas an exception simply fails
    # this cell, halts "Run All", and shows which path was missing.
    raise
else:
    print("✅ Data loaded successfully!")
    print(f"Training: {len(train_df)} samples")
    print(f"Test: {len(test_df)} samples")
    print(f"Validation: {len(valid_df)} samples")

# Subset sizes that keep this demonstration run short. The full dataset can be
# used instead when training time is not a concern.
SAMPLE_SIZE, TEST_SIZE = 2000, 400  # tune SAMPLE_SIZE to your machine's capability

# One print call; sep="\n" puts each part on its own line.
print(
    "\n⚡ Using subset for faster training:",
    f"   Training subset: {SAMPLE_SIZE} samples",
    f"   Test subset: {TEST_SIZE} samples",
    sep="\n",
)

def balanced_sample(df, total_size, label_col='label_binary', seed=42):
    """Draw up to ``total_size // 2`` rows from every label group of ``df``.

    The draw is balanced (same cap for each class), not proportional to the
    class frequencies. Groups smaller than the cap are taken whole.

    Args:
        df: DataFrame containing ``label_col``.
        total_size: Target size of the subset; each label group contributes at
            most ``total_size // 2`` rows.
        label_col: Column whose distinct values define the groups.
        seed: ``random_state`` passed to every per-group ``sample`` call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, groups concatenated in
        sorted label order, and ``label_col`` still present as a column.
    """
    per_class = total_size // 2
    # Iterate over the groups explicitly instead of groupby(...).apply(...):
    # apply() on the grouping column is deprecated in pandas and, once the
    # grouping column is excluded from the applied frame, the label column
    # would silently vanish from the result.
    parts = [
        group.sample(min(len(group), per_class), random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    if not parts:
        # No groups (empty input): return an empty frame with the same columns
        # rather than letting pd.concat fail on an empty list.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(parts).reset_index(drop=True)


# Class-balanced subsets of the training and test splits
train_sample = balanced_sample(train_df, SAMPLE_SIZE)
test_sample = balanced_sample(test_df, TEST_SIZE)

print(f"✅ Sample created - Train: {len(train_sample)}, Test: {len(test_sample)}")

📊 Loading cleaned data from Member 1...
✅ Data loaded successfully!
Training: 10240 samples
Test: 1267 samples
Validation: 1284 samples

⚡ Using subset for faster training:
   Training subset: 2000 samples
   Test subset: 400 samples
✅ Sample created - Train: 2000, Test: 400


In [3]:
print("\n🤖 INITIALIZING BERT MODEL...")

# Pre-trained DistilBERT checkpoint (chosen because it is faster than full BERT)
model_name = 'distilbert-base-uncased'

print("📥 Loading tokenizer and model...")
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Two output labels for the binary task; attention maps and hidden states are
# not requested from the forward pass.
classifier_options = {
    'num_labels': 2,
    'output_attentions': False,
    'output_hidden_states': False,
}
model = DistilBertForSequenceClassification.from_pretrained(model_name, **classifier_options)

# Place the model's parameters on the selected device
model = model.to(device)

print(
    "✅ BERT model initialized successfully!",
    f"   Model: {model_name}",
    f"   Device: {device}",
    sep="\n",
)


🤖 INITIALIZING BERT MODEL...
📥 Loading tokenizer and model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ BERT model initialized successfully!
   Model: distilbert-base-uncased
   Device: cpu


In [4]:
print("\n🔤 TOKENIZING TEXT DATA...")

def tokenize_data(texts, labels, max_length=128):
    """
    Encode a collection of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Sized iterable of strings to encode.
        labels: Object exposing ``.values`` (e.g. a pandas Series) with one
            label per text.
        max_length: Upper bound on tokens per text; longer inputs are truncated.

    Returns:
        Tuple ``(encodings, label_tensor)``: the tokenizer output as PyTorch
        tensors, padded to the longest encoded text in this call, and the
        labels converted with ``torch.tensor``.
    """
    print(f"  🔄 Tokenizing {len(texts)} texts...")

    # Options forwarded to the tokenizer call
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    text_list = list(texts)
    encodings = tokenizer(text_list, **tokenizer_options)

    label_tensor = torch.tensor(labels.values)
    return encodings, label_tensor

# Single source of truth for the truncation cap, so the value handed to the
# tokenizer and the value reported below cannot drift apart.
MAX_LENGTH = 128

# Tokenize training data (missing statements become empty strings)
train_texts = train_sample['clean_statement'].fillna('')
train_labels = train_sample['label_binary']

train_encodings, train_labels_tensor = tokenize_data(
    train_texts, train_labels, max_length=MAX_LENGTH
)

# Tokenize test data the same way
test_texts = test_sample['clean_statement'].fillna('')
test_labels = test_sample['label_binary']

test_encodings, test_labels_tensor = tokenize_data(
    test_texts, test_labels, max_length=MAX_LENGTH
)

print("✅ Tokenization complete!")
# MAX_LENGTH is only the cap; the shapes below show the padded length actually
# produced for each split.
print(f"   Max sequence length: {MAX_LENGTH}")
print(f"   Training tokens shape: {train_encodings['input_ids'].shape}")
print(f"   Test tokens shape: {test_encodings['input_ids'].shape}")


🔤 TOKENIZING TEXT DATA...
  🔄 Tokenizing 2000 texts...
  🔄 Tokenizing 400 texts...
✅ Tokenization complete!
   Max sequence length: 128
   Training tokens shape: torch.Size([2000, 66])
   Test tokens shape: torch.Size([400, 128])


In [5]:
class NewsDataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable batch, and
    ``labels`` is an indexable, sized collection holding one label per row.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Take row ``idx`` from every encoding field, then attach the label
        # under the 'labels' key.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and label tensor in a NewsDataset
print("📦 Creating PyTorch datasets...")
train_dataset, test_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels_tensor),
        (test_encodings, test_labels_tensor),
    )
)

print("✅ Datasets created successfully!")

📦 Creating PyTorch datasets...
✅ Datasets created successfully!


In [6]:
print("\n⚙ CONFIGURING TRAINING PARAMETERS...")

# Directory for the saved model; makedirs also creates the parent '../models'.
os.makedirs('../models/saved_bert_model', exist_ok=True)

# Training arguments (uses `eval_strategy`, the newer name of `evaluation_strategy`)
training_args = TrainingArguments(
    output_dir='../models/bert_results',
    num_train_epochs=3,                 # Number of training epochs
    per_device_train_batch_size=8,      # Batch size (reduce if out of memory)
    per_device_eval_batch_size=16,      # Evaluation batch size
    warmup_steps=100,                   # Warmup steps for learning rate
    weight_decay=0.01,                  # Weight decay for regularization
    logging_dir='../models/bert_logs',  # Directory for storing logs
    logging_steps=50,                   # Log every 50 steps
    eval_strategy="steps",              # Evaluate every `eval_steps`
    eval_steps=100,                     # Evaluation frequency
    save_strategy="steps",              # Save a checkpoint every `save_steps`
    save_steps=200,                     # Kept a multiple of eval_steps so each checkpoint has an evaluation
    load_best_model_at_end=True,        # Reload the best checkpoint when training ends
    # NOTE(review): this needs the evaluation to report an "accuracy" metric,
    # presumably via a compute_metrics function given to the Trainer — confirm.
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    # The string "none" disables every logging integration (wandb, tensorboard, ...).
    # Python None does NOT: it falls back to the library default, which in
    # transformers 4.x means all installed integrations.
    report_to="none",
    save_total_limit=2,                 # Cap on checkpoints kept in output_dir; older ones are deleted
    seed=42                             # For reproducibility
)

print("✅ Training configuration set!")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Device: {device}")


⚙ CONFIGURING TRAINING PARAMETERS...
✅ Training configuration set!
   Epochs: 3
   Batch size: 8
   Device: cpu
