In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "3"

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üöÄ Starting BERT Model Development...")
print(f"üíª Using device: {device}")

üöÄ Starting BERT Model Development...
üíª Using device: cpu


In [2]:
print("üìä Loading cleaned data from Member 1...")

try:
    train_df = pd.read_csv('../data/train_clean.csv')
    test_df = pd.read_csv('../data/test_clean.csv')
    valid_df = pd.read_csv('../data/valid_clean.csv')
    
    print("‚úÖ Data loaded successfully!")
    print(f"Training: {len(train_df)} samples")
    print(f"Test: {len(test_df)} samples")
    print(f"Validation: {len(valid_df)} samples")
    
except FileNotFoundError:
    print("‚ùå Cleaned data not found!")
    print("Please run Member 1's notebook first (01_data_cleaning.ipynb)")
    exit()

# For demonstration and speed, we'll use a subset of data
# In production, you can use the full dataset
SAMPLE_SIZE = 2000  # Adjust based on your computer's capability
TEST_SIZE = 400

print(f"\n‚ö° Using subset for faster training:")
print(f"   Training subset: {SAMPLE_SIZE} samples")
print(f"   Test subset: {TEST_SIZE} samples")

# Sample data (stratified to maintain label balance)
train_sample = train_df.groupby('label_binary').apply(
    lambda x: x.sample(min(len(x), SAMPLE_SIZE//2), random_state=42)
).reset_index(drop=True)

test_sample = test_df.groupby('label_binary').apply(
    lambda x: x.sample(min(len(x), TEST_SIZE//2), random_state=42)
).reset_index(drop=True)

print(f"‚úÖ Sample created - Train: {len(train_sample)}, Test: {len(test_sample)}")

üìä Loading cleaned data from Member 1...
‚úÖ Data loaded successfully!
Training: 10240 samples
Test: 1267 samples
Validation: 1284 samples

‚ö° Using subset for faster training:
   Training subset: 2000 samples
   Test subset: 400 samples
‚úÖ Sample created - Train: 2000, Test: 400


In [3]:
print("\nü§ñ INITIALIZING BERT MODEL...")

# Load pre-trained DistilBERT (faster than full BERT)
model_name = 'distilbert-base-uncased'

print("üì• Loading tokenizer and model...")
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=False
)

# Move model to device
model.to(device)

print("‚úÖ BERT model initialized successfully!")
print(f"   Model: {model_name}")
print(f"   Device: {device}")


ü§ñ INITIALIZING BERT MODEL...
üì• Loading tokenizer and model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ BERT model initialized successfully!
   Model: distilbert-base-uncased
   Device: cpu


In [4]:
print("\nüî§ TOKENIZING TEXT DATA...")

def tokenize_data(texts, labels, max_length=128):
    """
    Tokenize text data for BERT input
    """
    print(f"  üîÑ Tokenizing {len(texts)} texts...")
    
    # Tokenize texts
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    return encodings, torch.tensor(labels.values)

# Tokenize training data
train_texts = train_sample['clean_statement'].fillna('')
train_labels = train_sample['label_binary']

train_encodings, train_labels_tensor = tokenize_data(train_texts, train_labels)

# Tokenize test data  
test_texts = test_sample['clean_statement'].fillna('')
test_labels = test_sample['label_binary']

test_encodings, test_labels_tensor = tokenize_data(test_texts, test_labels)

print("‚úÖ Tokenization complete!")
print(f"   Max sequence length: 128")
print(f"   Training tokens shape: {train_encodings['input_ids'].shape}")
print(f"   Test tokens shape: {test_encodings['input_ids'].shape}")


üî§ TOKENIZING TEXT DATA...
  üîÑ Tokenizing 2000 texts...
  üîÑ Tokenizing 400 texts...
‚úÖ Tokenization complete!
   Max sequence length: 128
   Training tokens shape: torch.Size([2000, 66])
   Test tokens shape: torch.Size([400, 128])


In [5]:
class NewsDataset(torch.utils.data.Dataset):
    """
    Custom Dataset class for BERT training
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
print("üì¶ Creating PyTorch datasets...")
train_dataset = NewsDataset(train_encodings, train_labels_tensor)
test_dataset = NewsDataset(test_encodings, test_labels_tensor)

print("‚úÖ Datasets created successfully!")

üì¶ Creating PyTorch datasets...
‚úÖ Datasets created successfully!


In [6]:
print("\n‚öô CONFIGURING TRAINING PARAMETERS...")

# Create models directory
os.makedirs('../models', exist_ok=True)
os.makedirs('../models/saved_bert_model', exist_ok=True)

# Training arguments - UPDATED for newer Transformers version
training_args = TrainingArguments(
    output_dir='../models/bert_results',
    num_train_epochs=3,                 # Number of training epochs
    per_device_train_batch_size=8,      # Batch size (reduce if out of memory)
    per_device_eval_batch_size=16,      # Evaluation batch size
    warmup_steps=100,                   # Warmup steps for learning rate
    weight_decay=0.01,                  # Weight decay for regularization
    logging_dir='../models/bert_logs',  # Directory for storing logs
    logging_steps=50,                   # Log every 50 steps
    eval_strategy="steps",              # CHANGED: evaluation_strategy ‚Üí eval_strategy
    eval_steps=100,                     # Evaluation frequency
    save_strategy="steps",              # Save model every save_steps
    save_steps=200,                     # Save frequency
    load_best_model_at_end=True,        # Load best model at end
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    report_to=None,                     # Disable wandb logging
    save_total_limit=2,                 # Keep only 2 best models
    seed=42                             # For reproducibility
)

print("‚úÖ Training configuration set!")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Device: {device}")


‚öô CONFIGURING TRAINING PARAMETERS...
‚úÖ Training configuration set!
   Epochs: 3
   Batch size: 8
   Device: cpu


In [7]:
try:
    import tensorflow as tf
    if not hasattr(tf, 'random'):
        import tensorflow._api.v2.random as tf_random
        tf.random = tf_random
except ImportError:
    pass

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

def compute_metrics(eval_pred):
    """
    Compute metrics for evaluation
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("\nüéØ INITIALIZING TRAINER...")

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("‚úÖ Trainer initialized successfully!")


üéØ INITIALIZING TRAINER...
‚úÖ Trainer initialized successfully!


In [8]:
print("\nüöÄ STARTING BERT TRAINING...")
print("‚è∞ This may take 10-30 minutes depending on your hardware...")

try:
    # Start training
    training_results = trainer.train()
    
    print("‚úÖ Training completed successfully!")
    print(f"   Final training loss: {training_results.training_loss:.4f}")
    
    # Evaluate on test set
    print("\nüìä Evaluating on test set...")
    eval_results = trainer.evaluate(eval_dataset=test_dataset)
    
    print("‚úÖ Evaluation complete!")
    for metric, value in eval_results.items():
        if 'eval_' in metric:
            print(f"   {metric.replace('eval_', '').title()}: {value:.3f}")

except Exception as e:
    print(f"‚ùå Training failed: {str(e)}")
    print("üí° Try reducing batch size or sample size if out of memory")
    print("   You can also use CPU-only training by setting device='cpu'")


üöÄ STARTING BERT TRAINING...
‚è∞ This may take 10-30 minutes depending on your hardware...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.6897,0.686828,0.51,0.552632,0.51,0.38558
200,0.677,0.681283,0.5675,0.570771,0.5675,0.562443
300,0.6644,0.681836,0.5725,0.573472,0.5725,0.571082
400,0.6224,0.689164,0.5975,0.598807,0.5975,0.596165
500,0.6028,0.702249,0.61,0.610397,0.61,0.609649
600,0.3555,0.882805,0.62,0.625,0.62,0.616162
700,0.3708,0.902345,0.5975,0.597561,0.5975,0.597437


‚úÖ Training completed successfully!
   Final training loss: 0.5616

üìä Evaluating on test set...


‚úÖ Evaluation complete!
   Loss: 0.883
   Accuracy: 0.620
   Precision: 0.625
   Recall: 0.620
   F1: 0.616
   Runtime: 27.966
   Samples_Per_Second: 14.303
   Steps_Per_Second: 0.894


In [9]:
print("\nüíæ SAVING BERT MODEL...")

# Save model and tokenizer
model.save_pretrained('../models/saved_bert_model')
tokenizer.save_pretrained('../models/saved_bert_model')

print("‚úÖ BERT model saved successfully!")
print("üìÅ Files saved:")
print("   - ../models/saved_bert_model/ (directory)")
print("   - Model config, weights, and tokenizer")


üíæ SAVING BERT MODEL...
‚úÖ BERT model saved successfully!
üìÅ Files saved:
   - ../models/saved_bert_model/ (directory)
   - Model config, weights, and tokenizer


In [10]:
print("\nüß™ TESTING BERT MODEL...")

def predict_with_bert(text, model=None, tokenizer=None):
    """
    Make prediction using trained BERT model
    """
    if model is None or tokenizer is None:
        print("Loading saved model...")
        model = DistilBertForSequenceClassification.from_pretrained('../models/saved_bert_model')
        tokenizer = DistilBertTokenizer.from_pretrained('../models/saved_bert_model')
        model.to(device)
    
    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128
    )
    
    # Move to device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # Make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions.max().item()
    
    return {
        'prediction': 'Real' if predicted_class == 1 else 'Fake',
        'confidence': confidence,
        'probabilities': {
            'Fake': predictions[0][0].item(),
            'Real': predictions[0][1].item()
        }
    }

# Test with sample statements
test_statements = [
    "The President announced new infrastructure spending today.",
    "Scientists have proven that the Earth is actually flat!",
    "Stock markets reached record highs following positive earnings reports.",
    "Breaking: Aliens have invaded Earth and are demanding pizza!"
]

print("\nüß™ TESTING BERT WITH SAMPLE STATEMENTS:")
print("="*70)

for i, statement in enumerate(test_statements, 1):
    try:
        result = predict_with_bert(statement, model, tokenizer)
        print(f"\nüì∞ Test {i}:")
        print(f"   Statement: {statement}")
        print(f"   ü§ñ BERT Prediction: {result['prediction']}")
        print(f"   üìä Confidence: {result['confidence']:.3f}")
        print(f"   üî¥ Fake probability: {result['probabilities']['Fake']:.3f}")
        print(f"   üü¢ Real probability: {result['probabilities']['Real']:.3f}")
    except Exception as e:
        print(f"‚ùå Error testing statement {i}: {str(e)}")


üß™ TESTING BERT MODEL...

üß™ TESTING BERT WITH SAMPLE STATEMENTS:

üì∞ Test 1:
   Statement: The President announced new infrastructure spending today.
   ü§ñ BERT Prediction: Fake
   üìä Confidence: 0.809
   üî¥ Fake probability: 0.809
   üü¢ Real probability: 0.191

üì∞ Test 2:
   Statement: Scientists have proven that the Earth is actually flat!
   ü§ñ BERT Prediction: Fake
   üìä Confidence: 0.931
   üî¥ Fake probability: 0.931
   üü¢ Real probability: 0.069

üì∞ Test 3:
   Statement: Stock markets reached record highs following positive earnings reports.
   ü§ñ BERT Prediction: Real
   üìä Confidence: 0.939
   üî¥ Fake probability: 0.061
   üü¢ Real probability: 0.939

üì∞ Test 4:
   Statement: Breaking: Aliens have invaded Earth and are demanding pizza!
   ü§ñ BERT Prediction: Fake
   üìä Confidence: 0.951
   üî¥ Fake probability: 0.951
   üü¢ Real probability: 0.049
