In [1]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F

In [4]:
# sklearn imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, roc_curve
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.calibration import CalibratedClassifierCV

# transformers for BERT
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)

# Additional imports
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import os
import json
import time
from datetime import datetime
import optuna  # for hyperparameter optimization
import mlflow  # for experiment tracking
import joblib

warnings.filterwarnings('ignore')

In [5]:
# Set up paths
# The data root defaults to the original location, but can be redirected with
# the FAKE_NEWS_DATA_DIR environment variable so the notebook is not tied to a
# single machine's home directory.
BASE_PATH = os.environ.get("FAKE_NEWS_DATA_DIR", "/home/ghost/fake-news-game-theory/data")
PROCESSED_PATH = os.path.join(BASE_PATH, "processed")
MODELS_PATH = os.path.join(BASE_PATH, "models")
RESULTS_PATH = os.path.join(BASE_PATH, "results")

# Create output directories (existing directories are left untouched)
for path in [MODELS_PATH, RESULTS_PATH]:
    os.makedirs(path, exist_ok=True)

print("Model Training Pipeline Initialized")
print(f"Models will be saved to: {MODELS_PATH}")
print(f"Results will be saved to: {RESULTS_PATH}")

Model Training Pipeline Initialized
Models will be saved to: /home/ghost/fake-news-game-theory/data/models
Results will be saved to: /home/ghost/fake-news-game-theory/data/results


In [6]:
## 1. Data Loading and Preprocessing

class DataLoader:
    """Load preprocessed feature splits and raw text for model training.

    NOTE: this class has the same name as ``torch.utils.data.DataLoader``,
    which the notebook imports earlier. Once this cell has run, the bare name
    ``DataLoader`` refers to this class; code that needs PyTorch's batch
    loader must spell it ``torch.utils.data.DataLoader``.
    """

    def __init__(self, processed_path=PROCESSED_PATH):
        self.processed_path = processed_path
        # Preprocessing artifacts, populated by load_training_data()
        self.scaler = None
        self.feature_names = None
        self.tfidf_vectorizer = None

    def _read_csv(self, relative_path):
        """Read a CSV located under ``processed_path``."""
        return pd.read_csv(os.path.join(self.processed_path, relative_path))

    def _load_pickle(self, relative_path):
        """Unpickle a preprocessing artifact located under ``processed_path``."""
        # SECURITY: pickle.load can execute arbitrary code. Only point
        # processed_path at artifacts produced by this project's own pipeline.
        with open(os.path.join(self.processed_path, relative_path), 'rb') as f:
            return pickle.load(f)

    def load_training_data(self):
        """Load the train/validation/test feature splits and fitted preprocessors.

        Side effects:
            Sets ``self.scaler``, ``self.feature_names`` and
            ``self.tfidf_vectorizer`` from the pickles under ``features/``.

        Returns:
            ``((X_train, y_train), (X_val, y_val), (X_test, y_test))`` where
            each ``X`` is a DataFrame and each ``y`` is a flattened 1-D array.
        """
        print("Loading training data...")

        # Load features and labels
        X_train = self._read_csv('train/X_train.csv')
        y_train = self._read_csv('train/y_train.csv')

        X_val = self._read_csv('validation/X_val.csv')
        y_val = self._read_csv('validation/y_val.csv')

        X_test = self._read_csv('test/X_test.csv')
        y_test = self._read_csv('test/y_test.csv')

        # Load preprocessing objects
        self.scaler = self._load_pickle('features/scaler.pkl')
        self.feature_names = self._load_pickle('features/feature_names.pkl')
        self.tfidf_vectorizer = self._load_pickle('features/tfidf_vectorizer.pkl')

        # Label CSVs load as single-column frames; flatten them to 1-D arrays
        y_train = y_train.values.ravel()
        y_val = y_val.values.ravel()
        y_test = y_test.values.ravel()

        print(f"Training set: {X_train.shape}")
        print(f"Validation set: {X_val.shape}")
        print(f"Test set: {X_test.shape}")
        print(f"Features: {len(self.feature_names)}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test)

    def load_raw_text_data(self, train_size=3500, val_size=500, test_size=1000):
        """Load raw text and labels for BERT and slice them into three splits.

        The splits are taken as consecutive row ranges of
        ``features/all_features.csv``.
        NOTE(review): this assumes the file's row order matches the
        pre-computed feature splits — confirm against the preprocessing step.

        Args:
            train_size: number of leading rows used for training.
            val_size: number of rows after the training rows used for validation.
            test_size: number of rows after the validation rows used for testing.

        Returns:
            ``((train_texts, train_labels), (val_texts, val_labels),
            (test_texts, test_labels))``; texts are lists of ``str`` and
            labels are array slices.
        """
        print("Loading raw text data for BERT...")

        # Load the complete features file to get text
        all_features = self._read_csv('features/all_features.csv')

        # Missing text becomes an empty string so the tokenizer always gets a str
        texts = all_features['text'].fillna('').astype(str).tolist()
        labels = all_features['label'].values

        val_end = train_size + val_size
        test_end = val_end + test_size

        # Slicing past the end silently yields short splits; make that visible
        if len(texts) < test_end:
            print(f"Warning: expected at least {test_end} rows but found {len(texts)}; "
                  f"splits will be smaller than requested")

        train_texts = texts[:train_size]
        train_labels = labels[:train_size]

        val_texts = texts[train_size:val_end]
        val_labels = labels[train_size:val_end]

        test_texts = texts[val_end:test_end]
        test_labels = labels[val_end:test_end]

        print(f"Text data loaded: {len(train_texts)} train, {len(val_texts)} val, {len(test_texts)} test")

        return (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels)

In [7]:
## 2. Traditional Machine Learning Models

class TraditionalMLTrainer:
    """Train, tune and ensemble scikit-learn classifiers.

    Attributes:
        models: name -> estimator, filled by ``initialize_models``.
        results: name -> dict with ``model``, ``train_metrics``,
            ``val_metrics`` and ``training_time``, filled by
            ``train_baseline_models``.
    """

    def __init__(self):
        self.models = {}
        self.results = {}

    def initialize_models(self):
        """Populate ``self.models`` with the five baseline estimators."""
        self.models = {
            'logistic_regression': LogisticRegression(
                random_state=42, max_iter=1000, class_weight='balanced'
            ),
            'random_forest': RandomForestClassifier(
                n_estimators=100, random_state=42, class_weight='balanced'
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100, random_state=42
            ),
            # probability=True so the SVM exposes predict_proba (used for AUC
            # and for soft voting)
            'svm': SVC(
                probability=True, random_state=42, class_weight='balanced'
            ),
            'naive_bayes': GaussianNB()
        }

        print(f"Initialized {len(self.models)} traditional ML models")

    @staticmethod
    def _positive_proba(model, X):
        """Return column 1 of ``predict_proba``, or None if the model has none."""
        if not hasattr(model, 'predict_proba'):
            return None
        return model.predict_proba(X)[:, 1]

    def train_baseline_models(self, X_train, y_train, X_val, y_val):
        """Fit every model in ``self.models`` and record train/validation metrics.

        Returns:
            dict mapping model name to the fitted estimator. Metrics and
            timing for each model are stored in ``self.results``.
        """
        print("Training baseline models...")

        trained_models = {}

        for name, model in tqdm(self.models.items(), desc="Training models"):
            start_time = time.time()

            # Train model
            model.fit(X_train, y_train)

            # Predict and score on both splits
            train_metrics = self._calculate_metrics(
                y_train, model.predict(X_train), self._positive_proba(model, X_train)
            )
            val_metrics = self._calculate_metrics(
                y_val, model.predict(X_val), self._positive_proba(model, X_val)
            )

            # Note: the recorded time covers fitting plus prediction/scoring
            training_time = time.time() - start_time

            # Store results
            self.results[name] = {
                'model': model,
                'train_metrics': train_metrics,
                'val_metrics': val_metrics,
                'training_time': training_time
            }

            trained_models[name] = model

            print(f"{name}: Val Accuracy = {val_metrics['accuracy']:.4f}, "
                  f"Val F1 = {val_metrics['f1']:.4f}, Time = {training_time:.2f}s")

        return trained_models

    def hyperparameter_optimization(self, X_train, y_train, X_val, y_val,
                                    n_iter=20, cv=3, scoring='f1'):
        """Run a randomized search for three of the baseline models.

        Args:
            X_train, y_train: data used for the cross-validated search.
            X_val, y_val: held-out data used to score each best estimator.
            n_iter: number of parameter settings sampled per model.
            cv: number of cross-validation folds.
            scoring: scikit-learn scoring name used to rank settings. The
                default ``'f1'`` is the binary F1 of the positive class,
                whereas the validation metrics reported here are
                weighted-average F1, so the two printed numbers are not
                directly comparable.

        Returns:
            dict ``"<name>_optimized"`` -> dict with ``model``,
            ``best_params``, ``val_metrics`` and ``cv_score``.
        """
        print("Starting hyperparameter optimization...")

        # Define parameter grids for top models
        param_grids = {
            'random_forest': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2']
            },
            'gradient_boosting': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1, 0.15],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 0.9, 1.0]
            },
            'logistic_regression': {
                'C': [0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
        }

        optimized_models = {}

        for model_name, param_grid in param_grids.items():
            print(f"Optimizing {model_name}...")

            # Use RandomizedSearchCV for efficiency
            search = RandomizedSearchCV(
                self.models[model_name],
                param_grid,
                n_iter=n_iter,  # Limit iterations for speed
                cv=cv,
                scoring=scoring,
                random_state=42,
                n_jobs=-1
            )

            search.fit(X_train, y_train)

            # Evaluate best model on the held-out validation split
            best_model = search.best_estimator_
            val_pred = best_model.predict(X_val)
            val_proba = best_model.predict_proba(X_val)[:, 1]
            val_metrics = self._calculate_metrics(y_val, val_pred, val_proba)

            optimized_models[f"{model_name}_optimized"] = {
                'model': best_model,
                'best_params': search.best_params_,
                'val_metrics': val_metrics,
                'cv_score': search.best_score_
            }

            print(f"Best {model_name} - Val F1: {val_metrics['f1']:.4f}, "
                  f"CV Score: {search.best_score_:.4f}")

        return optimized_models

    def create_ensemble(self, models_dict, X_train, y_train, X_val, y_val, top_k=3):
        """Build a soft-voting ensemble from the best-scoring candidates.

        Candidates are the baseline models in ``self.results`` plus any
        additional entries in ``models_dict`` (for example tuned models).
        Entries may be estimators or dicts holding the estimator under
        ``'model'``. Extra candidates must already be fitted: they are scored
        on the validation set unless the entry carries a cached
        ``val_metrics['f1']``.

        Args:
            models_dict: name -> estimator (or dict with ``'model'``); may be
                None or empty, in which case only ``self.results`` is used.
            X_train, y_train: data the ensemble is fitted on.
            X_val, y_val: data used to rank candidates and score the ensemble.
            top_k: number of highest-F1 candidates to combine.

        Returns:
            ``(ensemble, val_metrics)``.
        """
        print("Creating ensemble model...")

        # Baseline models already carry a validation F1
        model_scores = [
            (name, result['val_metrics']['f1'], result['model'])
            for name, result in self.results.items()
        ]

        # Consider caller-supplied candidates that were not scored above
        for name, entry in (models_dict or {}).items():
            if name in self.results:
                continue
            is_wrapped = isinstance(entry, dict)
            model = entry.get('model') if is_wrapped else entry
            # Soft voting needs class probabilities
            if not (hasattr(model, 'predict') and hasattr(model, 'predict_proba')):
                continue
            cached_f1 = entry.get('val_metrics', {}).get('f1') if is_wrapped else None
            if cached_f1 is None:
                cached_f1 = self._calculate_metrics(y_val, model.predict(X_val))['f1']
            model_scores.append((name, cached_f1, model))

        model_scores.sort(key=lambda x: x[1], reverse=True)
        top_models = model_scores[:top_k]

        print("Top models for ensemble:")
        for name, score, _ in top_models:
            print(f"  {name}: F1 = {score:.4f}")

        # Create voting classifier
        estimators = [(name, model) for name, _, model in top_models]
        ensemble = VotingClassifier(estimators=estimators, voting='soft')

        # Train ensemble
        ensemble.fit(X_train, y_train)

        # Evaluate ensemble
        val_pred = ensemble.predict(X_val)
        val_proba = ensemble.predict_proba(X_val)[:, 1]
        val_metrics = self._calculate_metrics(y_val, val_pred, val_proba)

        print(f"Ensemble - Val Accuracy: {val_metrics['accuracy']:.4f}, "
              f"Val F1: {val_metrics['f1']:.4f}")

        return ensemble, val_metrics

    def _calculate_metrics(self, y_true, y_pred, y_proba=None):
        """Compute accuracy and weighted precision/recall/F1, plus AUC if possible.

        ``auc_roc`` is included only when ``y_proba`` is given and ``y_true``
        contains more than one class, because ROC AUC is undefined (and
        ``roc_auc_score`` raises) for single-class targets.
        """
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted')
        }

        if y_proba is not None and len(np.unique(y_true)) > 1:
            metrics['auc_roc'] = roc_auc_score(y_true, y_proba)

        return metrics

In [8]:
## 3. Deep Learning Models

class FakeNewsDataset(Dataset):
    """Holds a feature matrix and label vector as tensors for PyTorch loaders."""

    def __init__(self, features, labels):
        # Objects exposing `.values` (e.g. DataFrames) are unwrapped first;
        # anything else goes to the tensor constructor as-is.
        feature_array = getattr(features, 'values', features)
        self.features = torch.FloatTensor(feature_array)
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        sample, target = self.features[idx], self.labels[idx]
        return sample, target

class DeepNeuralNetwork(nn.Module):
    """Fully connected classifier that emits two logits per sample.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    Linear layer maps the last hidden width to the 2 output classes.
    """

    def __init__(self, input_dim, hidden_dims=[512, 256, 128], dropout_rate=0.3):
        super().__init__()

        # Consecutive pairs of widths give each hidden stage's (in, out) sizes
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        # Output layer: 2 classes (fake, real)
        stack.append(nn.Linear(widths[-1], 2))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        return self.network(x)

class DeepLearningTrainer:
    """Train and evaluate the feed-forward ``DeepNeuralNetwork`` classifier."""

    def __init__(self, device=None):
        # Default to the GPU when one is available
        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def train_neural_network(self, X_train, y_train, X_val, y_val,
                           hidden_dims=[512, 256, 128], epochs=50, batch_size=64):
        """Train a ``DeepNeuralNetwork`` and keep the best-validation weights.

        Args:
            X_train, y_train: training features and integer labels.
            X_val, y_val: validation features and integer labels.
            hidden_dims: hidden layer widths passed to ``DeepNeuralNetwork``.
            epochs: number of passes over the training data.
            batch_size: mini-batch size for both loaders.

        Returns:
            ``(model, final_metrics, training_history)`` where ``model`` holds
            the weights of the epoch with the highest validation accuracy,
            ``final_metrics`` are that model's validation metrics, and
            ``training_history`` contains per-epoch losses and accuracies.
        """
        print("Training deep neural network...")

        # Create datasets
        train_dataset = FakeNewsDataset(X_train, y_train)
        val_dataset = FakeNewsDataset(X_val, y_val)

        # Use PyTorch's loader by its fully-qualified name: the bare name
        # `DataLoader` is rebound by the notebook's own data-loading class.
        # BatchNorm1d rejects a single-sample batch in training mode, so a
        # trailing batch of exactly one sample is dropped.
        n_train = len(train_dataset)
        drop_last = n_train > batch_size and n_train % batch_size == 1
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False
        )

        # Initialize model
        input_dim = X_train.shape[1]
        model = DeepNeuralNetwork(input_dim, hidden_dims).to(self.device)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

        # Training loop bookkeeping
        train_losses = []
        val_losses = []
        val_accuracies = []

        best_val_acc = 0
        best_model_state = None

        for epoch in range(epochs):
            # Training phase
            model.train()
            train_loss = 0
            for batch_features, batch_labels in train_loader:
                batch_features = batch_features.to(self.device)
                batch_labels = batch_labels.to(self.device)

                optimizer.zero_grad()
                outputs = model(batch_features)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Validation phase
            model.eval()
            val_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for batch_features, batch_labels in val_loader:
                    batch_features = batch_features.to(self.device)
                    batch_labels = batch_labels.to(self.device)

                    outputs = model(batch_features)
                    val_loss += criterion(outputs, batch_labels).item()

                    predicted = outputs.argmax(dim=1)
                    total += batch_labels.size(0)
                    correct += (predicted == batch_labels).sum().item()

            # Average the per-batch losses
            train_loss /= len(train_loader)
            val_loss /= len(val_loader)
            val_acc = correct / total

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_accuracies.append(val_acc)

            scheduler.step(val_loss)

            # Save best model. state_dict() returns references to the live
            # parameters and dict.copy() is shallow, so each tensor is cloned;
            # otherwise later epochs would overwrite the "best" weights.
            # The first epoch is always recorded so a snapshot exists even if
            # accuracy never rises above 0.
            if best_model_state is None or val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model_state = {
                    key: tensor.detach().clone()
                    for key, tensor in model.state_dict().items()
                }

            if epoch % 10 == 0:
                print(f"Epoch {epoch}/{epochs}: "
                      f"Train Loss: {train_loss:.4f}, "
                      f"Val Loss: {val_loss:.4f}, "
                      f"Val Acc: {val_acc:.4f}")

        # Load best model (no snapshot exists when epochs == 0)
        if best_model_state is not None:
            model.load_state_dict(best_model_state)

        # Final evaluation
        final_metrics = self._evaluate_deep_model(model, val_loader)

        training_history = {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies,
            'best_val_accuracy': best_val_acc
        }

        return model, final_metrics, training_history

    def _evaluate_deep_model(self, model, data_loader):
        """Score ``model`` on every batch of ``data_loader``.

        Returns:
            dict with ``accuracy``, weighted ``precision``/``recall``/``f1``
            and ``auc_roc`` (computed from the softmax probability of class 1).
        """
        model.eval()
        all_predictions = []
        all_labels = []
        all_probabilities = []

        with torch.no_grad():
            for batch_features, batch_labels in data_loader:
                batch_features = batch_features.to(self.device)
                batch_labels = batch_labels.to(self.device)

                outputs = model(batch_features)
                probabilities = F.softmax(outputs, dim=1)
                predicted = outputs.argmax(dim=1)

                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(batch_labels.cpu().numpy())
                all_probabilities.extend(probabilities[:, 1].cpu().numpy())  # Probability of class 1

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(all_labels, all_predictions),
            'precision': precision_score(all_labels, all_predictions, average='weighted'),
            'recall': recall_score(all_labels, all_predictions, average='weighted'),
            'f1': f1_score(all_labels, all_predictions, average='weighted'),
            'auc_roc': roc_auc_score(all_labels, all_probabilities)
        }

        return metrics

In [9]:
## 4. BERT Training

class BERTDataset(Dataset):
    """Tokenizes one text per item and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Dataset size is driven by the number of texts
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Every item is truncated/padded to max_length
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Collapse each returned tensor to one dimension
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

class BERTTrainer:
    """Fine-tune a Hugging Face sequence classifier for fake news detection."""

    def __init__(self, model_name='bert-base-uncased'):
        self.model_name = model_name
        # The tokenizer is loaded eagerly; the model itself is created in train_bert
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None

    def prepare_datasets(self, train_texts, train_labels, val_texts, val_labels, max_length=512):
        """Wrap the train and validation texts/labels in ``BERTDataset`` objects.

        Returns:
            ``(train_dataset, val_dataset)``.
        """
        print("Preparing BERT datasets...")

        splits = ((train_texts, train_labels), (val_texts, val_labels))
        train_dataset, val_dataset = [
            BERTDataset(split_texts, split_labels, self.tokenizer, max_length)
            for split_texts, split_labels in splits
        ]

        return train_dataset, val_dataset

    def train_bert(self, train_dataset, val_dataset, output_dir, epochs=3, batch_size=16):
        """Fine-tune a two-label classifier with the Hugging Face ``Trainer``.

        Args:
            train_dataset, val_dataset: datasets used for training and evaluation.
            output_dir: directory for checkpoints; logs go to ``<output_dir>/logs``.
            epochs: number of training epochs.
            batch_size: per-device batch size for training and evaluation.

        Returns:
            ``(model, trainer, eval_results)``.
        """
        print("Training BERT model...")

        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2
        )

        # Training arguments, grouped by concern
        schedule_options = dict(
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
        )
        logging_options = dict(
            logging_dir=f'{output_dir}/logs',
            logging_steps=100,
        )
        # Evaluate every 500 steps and checkpoint every 1000; the checkpoint
        # with the highest eval_f1 is restored when training finishes.
        checkpoint_options = dict(
            evaluation_strategy="steps",
            eval_steps=500,
            save_steps=1000,
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True,
            save_total_limit=2,
        )
        training_args = TrainingArguments(
            output_dir=output_dir,
            **schedule_options,
            **logging_options,
            **checkpoint_options,
            dataloader_num_workers=4
        )

        # Initialize trainer
        collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self._compute_metrics,
            data_collator=collator
        )

        # Train, then evaluate
        trainer.train()
        eval_results = trainer.evaluate()

        return self.model, trainer, eval_results

    def _compute_metrics(self, eval_pred):
        """Turn ``(logits, labels)`` into accuracy and weighted F1/precision/recall."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=1)

        scores = {'accuracy': accuracy_score(labels, predicted)}
        weighted_scorers = (
            ('f1', f1_score),
            ('precision', precision_score),
            ('recall', recall_score),
        )
        for key, scorer in weighted_scorers:
            scores[key] = scorer(labels, predicted, average='weighted')
        return scores

In [10]:
## 5. Model Evaluation and Comparison

class ModelEvaluator:
    """Evaluate trained models on the test set and report/plot the results."""

    # Column order of the DataFrame produced by create_comparison_report
    _REPORT_COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']

    def __init__(self):
        self.results = {}

    def evaluate_all_models(self, models_dict, X_test, y_test, text_test=None):
        """Evaluate every entry of ``models_dict`` on the test set.

        Args:
            models_dict: name -> model, or name -> dict holding the model
                under ``'model'``. The names ``'bert'`` and ``'deep_nn'``
                select the transformer and PyTorch evaluation paths.
            X_test, y_test: test features and labels.
            text_test: raw test texts; required for the ``'bert'`` path.

        Returns:
            dict mapping model name to its metrics dict.
        """
        print("Evaluating all models on test set...")

        evaluation_results = {}

        for model_name, model_info in models_dict.items():
            print(f"Evaluating {model_name}...")

            if model_name == 'bert' and text_test is not None:
                # Special handling for BERT
                metrics = self._evaluate_bert_model(model_info, text_test, y_test)
            elif model_name == 'deep_nn':
                # Special handling for deep neural network
                metrics = self._evaluate_deep_model(model_info, X_test, y_test)
            else:
                # Traditional ML models
                model = model_info['model'] if isinstance(model_info, dict) else model_info
                metrics = self._evaluate_traditional_model(model, X_test, y_test)

            evaluation_results[model_name] = metrics

        return evaluation_results

    @staticmethod
    def _classification_metrics(y_true, y_pred, y_proba=None):
        """Build the metrics dict shared by all evaluation paths.

        ``auc_roc`` is added only when ``y_proba`` is provided.
        """
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted'),
            'confusion_matrix': confusion_matrix(y_true, y_pred).tolist()
        }

        if y_proba is not None:
            metrics['auc_roc'] = roc_auc_score(y_true, y_proba)

        return metrics

    @staticmethod
    def _module_device(model):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(model.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    def _evaluate_traditional_model(self, model, X_test, y_test):
        """Evaluate a scikit-learn style model exposing ``predict``."""
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

        return self._classification_metrics(y_test, y_pred, y_proba)

    def _evaluate_deep_model(self, model_info, X_test, y_test):
        """Evaluate a PyTorch classifier given as a module or ``{'model': module}``."""
        model = model_info['model'] if isinstance(model_info, dict) else model_info
        model.eval()

        # Inputs must live on the same device as the model's weights
        device = self._module_device(model)
        X_test_tensor = torch.FloatTensor(
            X_test.values if hasattr(X_test, 'values') else X_test
        ).to(device)

        with torch.no_grad():
            outputs = model(X_test_tensor)
            probabilities = F.softmax(outputs, dim=1)
            predictions = outputs.argmax(dim=1)

        # .cpu() is required before .numpy() when the model ran on a GPU
        y_pred = predictions.cpu().numpy()
        y_proba = probabilities[:, 1].cpu().numpy()

        return self._classification_metrics(y_test, y_pred, y_proba)

    def _evaluate_bert_model(self, model_info, text_test, y_test, batch_size=32):
        """Evaluate a transformer classifier on raw texts, one mini-batch at a time.

        Args:
            model_info: dict with ``'tokenizer'`` and ``'model'``.
            text_test: iterable of test texts.
            y_test: true labels aligned with ``text_test``.
            batch_size: number of texts per forward pass; batching keeps
                memory bounded instead of encoding the whole test set at once.
        """
        tokenizer = model_info['tokenizer']
        model = model_info['model']

        device = self._module_device(model)
        texts = list(text_test)
        logit_chunks = []

        model.eval()
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # Tokenize one batch of test texts
                encodings = tokenizer(
                    texts[start:start + batch_size],
                    truncation=True,
                    padding=True,
                    max_length=512,
                    return_tensors='pt'
                )
                encodings = {key: value.to(device) for key, value in encodings.items()}
                logit_chunks.append(model(**encodings).logits.cpu())

        logits = torch.cat(logit_chunks, dim=0)
        probabilities = F.softmax(logits, dim=1)
        predictions = torch.argmax(logits, dim=1)

        y_pred = predictions.numpy()
        y_proba = probabilities[:, 1].numpy()

        return self._classification_metrics(y_test, y_pred, y_proba)

    def create_comparison_report(self, results):
        """Print and return a table of metrics sorted by F1 score (best first).

        Args:
            results: name -> metrics dict as produced by ``evaluate_all_models``.

        Returns:
            DataFrame with one row per model; empty (with the usual columns)
            when ``results`` is empty.
        """
        print("\n" + "="*60)
        print("MODEL COMPARISON REPORT")
        print("="*60)

        if not results:
            print("No models to compare.")
            return pd.DataFrame(columns=self._REPORT_COLUMNS)

        # Create comparison dataframe
        comparison_data = [
            {
                'Model': model_name,
                'Accuracy': metrics['accuracy'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1 Score': metrics['f1'],
                'AUC-ROC': metrics.get('auc_roc', 'N/A')
            }
            for model_name, metrics in results.items()
        ]

        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('F1 Score', ascending=False)

        # to_string expects float_format to be a one-argument callable
        print(comparison_df.to_string(index=False, float_format='{:.4f}'.format))

        # Identify best model
        best_model = comparison_df.iloc[0]['Model']
        best_f1 = comparison_df.iloc[0]['F1 Score']

        print(f"\n🏆 Best Model: {best_model} (F1 Score: {best_f1:.4f})")

        return comparison_df

    def plot_model_comparison(self, results, save_path=None):
        """Draw a 2x2 grid of bar charts (accuracy, precision, recall, F1).

        Args:
            results: name -> metrics dict.
            save_path: if given, the figure is also written to this path.
        """
        # Prepare data for plotting
        models = list(results.keys())
        metrics = ['accuracy', 'precision', 'recall', 'f1']

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.ravel()

        for i, metric in enumerate(metrics):
            values = [results[model][metric] for model in models]

            bars = axes[i].bar(models, values, alpha=0.7,
                              color=['skyblue', 'lightcoral', 'lightgreen', 'gold', 'plum'])
            axes[i].set_title(f'{metric.capitalize()} Comparison', fontsize=14, fontweight='bold')
            axes[i].set_ylabel(metric.capitalize())
            axes[i].set_ylim([0, 1])
            axes[i].tick_params(axis='x', rotation=45)
            axes[i].grid(True, alpha=0.3, axis='y')

            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                axes[i].text(bar.get_x() + bar.get_width()/2., height,
                           f'{height:.3f}',
                           ha='center', va='bottom', fontsize=9)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Comparison plot saved to: {save_path}")

        plt.show()

    def plot_confusion_matrices(self, results, save_path=None):
        """Plot one confusion-matrix heatmap per model on a two-row grid.

        Args:
            results: name -> metrics dict; entries without a
                ``'confusion_matrix'`` key leave their subplot blank.
            save_path: if given, the figure is also written to this path.
        """
        n_models = len(results)
        if n_models == 0:
            # A grid with zero columns cannot be created
            print("No confusion matrices to plot.")
            return

        fig, axes = plt.subplots(2, (n_models + 1) // 2, figsize=(15, 8))
        axes = axes.ravel()

        for idx, (model_name, metrics) in enumerate(results.items()):
            if 'confusion_matrix' in metrics:
                cm = np.array(metrics['confusion_matrix'])

                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                           ax=axes[idx], cbar=True,
                           xticklabels=['Fake', 'Real'],
                           yticklabels=['Fake', 'Real'])
                axes[idx].set_title(f'{model_name}\nConfusion Matrix')
                axes[idx].set_ylabel('True Label')
                axes[idx].set_xlabel('Predicted Label')

        # Hide unused subplots
        for idx in range(len(results), len(axes)):
            axes[idx].set_visible(False)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Confusion matrices saved to: {save_path}")

        plt.show()

In [11]:
## 6. Model Persistence and Deployment

class ModelManager:
    """Save and load trained models together with their metrics and metadata."""

    def __init__(self, models_path=MODELS_PATH):
        self.models_path = models_path

    @staticmethod
    def _json_default(obj):
        """Fallback JSON encoder for values such as NumPy scalars and arrays."""
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        return str(obj)

    def save_model(self, model, model_name, metrics=None, metadata=None):
        """Save a model into a new timestamped directory.

        PyTorch modules are stored as a state dict in ``model.pth``; anything
        else is dumped with joblib to ``model.pkl``. ``metrics.json`` is
        written only when ``metrics`` is non-empty; ``metadata.json`` is
        always written.

        Args:
            model: the trained model object.
            model_name: prefix of the directory name.
            metrics: optional dict of evaluation metrics.
            metadata: optional extra key/values merged into the metadata file.

        Returns:
            Path of the directory the model was written to.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_dir = os.path.join(self.models_path, f"{model_name}_{timestamp}")
        os.makedirs(model_dir, exist_ok=True)

        # Save model
        model_file = os.path.join(model_dir, 'model.pkl')

        if isinstance(model, nn.Module):
            # PyTorch model: only the weights are stored, not the architecture
            torch.save(model.state_dict(), model_file.replace('.pkl', '.pth'))
        else:
            # Scikit-learn or traditional model
            joblib.dump(model, model_file)

        # Save metrics
        if metrics:
            metrics_file = os.path.join(model_dir, 'metrics.json')
            with open(metrics_file, 'w') as f:
                json.dump(metrics, f, indent=2, default=self._json_default)

        # Save metadata (caller-supplied keys override the defaults)
        full_metadata = {
            'model_name': model_name,
            'timestamp': timestamp,
            'model_type': type(model).__name__,
            **(metadata or {})
        }

        metadata_file = os.path.join(model_dir, 'metadata.json')
        with open(metadata_file, 'w') as f:
            json.dump(full_metadata, f, indent=2, default=self._json_default)

        print(f"Model saved to: {model_dir}")
        return model_dir

    def load_model(self, model_path):
        """Load whatever ``save_model`` stored in ``model_path``.

        Returns:
            For ``model.pth``: the saved state dict (the caller must rebuild
            the architecture and call ``load_state_dict``). For
            ``model.pkl``: the unpickled model object.

        Raises:
            FileNotFoundError: if neither file exists in ``model_path``.
        """
        # Check for PyTorch model
        pth_file = os.path.join(model_path, 'model.pth')
        pkl_file = os.path.join(model_path, 'model.pkl')

        if os.path.exists(pth_file):
            # Load PyTorch weights (need to reconstruct architecture).
            # map_location lets GPU-saved weights load on CPU-only machines.
            print("Loading PyTorch model...")
            return torch.load(pth_file, map_location='cpu')
        elif os.path.exists(pkl_file):
            # Load traditional model.
            # SECURITY: joblib.load unpickles; only load trusted model files.
            print("Loading traditional ML model...")
            return joblib.load(pkl_file)
        else:
            raise FileNotFoundError(f"No model file found in {model_path}")

    def save_best_model(self, models_dict, comparison_df, criteria='F1 Score'):
        """Save the model that ranks highest on ``criteria``.

        Args:
            models_dict: name -> model, or name -> dict holding the model
                under ``'model'`` (and optionally ``'val_metrics'``).
            comparison_df: DataFrame with a ``'Model'`` column and metric columns.
            criteria: metric column used to pick the winner. If the column is
                missing or holds no numeric values, the first row is used.

        Returns:
            Directory the best model was saved to.
        """
        best_row = comparison_df.iloc[0]
        if criteria in comparison_df.columns:
            # Non-numeric placeholders such as 'N/A' become NaN and are skipped
            scores = pd.to_numeric(comparison_df[criteria], errors='coerce').reset_index(drop=True)
            if scores.notna().any():
                best_row = comparison_df.iloc[int(scores.idxmax())]

        best_model_name = best_row['Model']
        best_model = models_dict[best_model_name]

        # Extract model object if wrapped in dict
        if isinstance(best_model, dict):
            model = best_model.get('model', best_model)
            metrics = best_model.get('val_metrics', {})
        else:
            model = best_model
            metrics = {}

        metadata = {
            'selection_criteria': criteria,
            'comparison_rank': 1,
            'total_models_compared': len(models_dict)
        }

        save_path = self.save_model(model, f"best_{best_model_name}", metrics, metadata)

        print(f"\nBest model ({best_model_name}) saved successfully!")
        return save_path

In [12]:
## 7. Complete Training Pipeline

class CompletePipeline:
    """Complete end-to-end training pipeline.

    Chains data loading, traditional ML training, hyperparameter
    optimization, ensembling, optional deep-learning / BERT training,
    test-set evaluation, reporting and best-model persistence.
    """

    def __init__(self):
        # DataLoader is the class defined earlier in this notebook; it shadows
        # the torch.utils.data.DataLoader imported in the first cell.
        self.data_loader = DataLoader()
        self.traditional_trainer = TraditionalMLTrainer()
        self.deep_trainer = DeepLearningTrainer()
        self.bert_trainer = BERTTrainer()
        self.evaluator = ModelEvaluator()
        self.model_manager = ModelManager()

        # Model name -> trained model (or a dict wrapping it). Filled by
        # run_complete_pipeline and kept across calls on the same instance.
        self.all_models = {}
        # Summary of the most recent run_complete_pipeline call.
        self.results = {}

    @staticmethod
    def _print_step(title, leading_blank=True):
        """Print a step title followed by a 70-character dashed rule."""
        print(("\n" if leading_blank else "") + title)
        print("-" * 70)

    def run_complete_pipeline(self, train_bert=False, train_deep_nn=True):
        """Run complete training pipeline.

        Args:
            train_bert: When True, additionally load the raw text data and
                fine-tune the BERT model (step 6).
            train_deep_nn: When True, train the deep neural network (step 5).

        Returns:
            dict with the keys ``'models'`` (all trained models),
            ``'test_results'`` (output of the evaluator on the test split),
            ``'comparison'`` (comparison DataFrame) and ``'best_model_path'``
            (value returned by ``ModelManager.save_best_model``).

        Side effects:
            Writes ``model_comparison.png``, ``confusion_matrices.png`` and
            ``model_comparison.csv`` paths under RESULTS_PATH (the plots are
            produced by the evaluator), and stores the returned summary plus
            the metrics handed back by the individual trainers in
            ``self.results``.
        """
        print("\n" + "="*70)
        print("STARTING COMPLETE MODEL TRAINING PIPELINE")
        print("="*70 + "\n")

        # Metrics that the trainers return next to the model; kept so they
        # are not lost once this method returns.
        trainer_metrics = {}

        # Step 1: Load data
        self._print_step("STEP 1: Loading Data", leading_blank=False)
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = self.data_loader.load_training_data()

        # Step 2: Train traditional ML models
        self._print_step("STEP 2: Training Traditional ML Models")
        self.traditional_trainer.initialize_models()
        baseline_models = self.traditional_trainer.train_baseline_models(
            X_train, y_train, X_val, y_val
        )
        self.all_models.update(baseline_models)

        # Step 3: Hyperparameter optimization
        self._print_step("STEP 3: Hyperparameter Optimization")
        optimized_models = self.traditional_trainer.hyperparameter_optimization(
            X_train, y_train, X_val, y_val
        )
        # Only the fitted estimator of each optimization result is kept.
        self.all_models.update({name: info['model'] for name, info in optimized_models.items()})

        # Step 4: Create ensemble
        self._print_step("STEP 4: Creating Ensemble Model")
        ensemble_model, ensemble_metrics = self.traditional_trainer.create_ensemble(
            self.all_models, X_train, y_train, X_val, y_val
        )
        self.all_models['ensemble'] = ensemble_model
        trainer_metrics['ensemble'] = ensemble_metrics

        # Step 5: Train deep neural network
        if train_deep_nn:
            self._print_step("STEP 5: Training Deep Neural Network")
            dnn_model, dnn_metrics, dnn_history = self.deep_trainer.train_neural_network(
                X_train, y_train, X_val, y_val
            )
            self.all_models['deep_nn'] = {'model': dnn_model, 'history': dnn_history}
            trainer_metrics['deep_nn'] = dnn_metrics

        # Step 6: Train BERT (optional, resource-intensive)
        if train_bert:
            self._print_step("STEP 6: Training BERT Model")
            (train_texts, train_labels), (val_texts, val_labels), _test_split = \
                self.data_loader.load_raw_text_data()

            train_dataset, val_dataset = self.bert_trainer.prepare_datasets(
                train_texts, train_labels, val_texts, val_labels
            )

            bert_output_dir = os.path.join(MODELS_PATH, 'bert_model')
            bert_model, bert_trainer, bert_results = self.bert_trainer.train_bert(
                train_dataset, val_dataset, bert_output_dir
            )

            self.all_models['bert'] = {
                'model': bert_model,
                'tokenizer': self.bert_trainer.tokenizer,
                'trainer': bert_trainer
            }
            trainer_metrics['bert'] = bert_results

        # Step 7: Evaluate all models
        self._print_step("STEP 7: Evaluating All Models on Test Set")
        test_results = self.evaluator.evaluate_all_models(
            self.all_models, X_test, y_test
        )

        # Step 8: Create comparison report
        self._print_step("STEP 8: Creating Comparison Report")
        comparison_df = self.evaluator.create_comparison_report(test_results)

        # Step 9: Save visualizations
        self._print_step("STEP 9: Creating Visualizations")

        comparison_plot_path = os.path.join(RESULTS_PATH, 'model_comparison.png')
        self.evaluator.plot_model_comparison(test_results, comparison_plot_path)

        confusion_matrix_path = os.path.join(RESULTS_PATH, 'confusion_matrices.png')
        self.evaluator.plot_confusion_matrices(test_results, confusion_matrix_path)

        # Step 10: Save best model
        self._print_step("STEP 10: Saving Best Model")
        best_model_path = self.model_manager.save_best_model(
            self.all_models, comparison_df
        )

        # Save comparison results
        comparison_file = os.path.join(RESULTS_PATH, 'model_comparison.csv')
        comparison_df.to_csv(comparison_file, index=False)
        print(f"Comparison results saved to: {comparison_file}")

        # Final summary
        print("\n" + "="*70)
        print("PIPELINE COMPLETE!")
        print("="*70)
        print(f"Total models trained: {len(self.all_models)}")
        print(f"Best model: {comparison_df.iloc[0]['Model']}")
        print(f"Best F1 Score: {comparison_df.iloc[0]['F1 Score']:.4f}")
        print(f"Results saved to: {RESULTS_PATH}")
        print(f"Models saved to: {MODELS_PATH}")

        summary = {
            'models': self.all_models,
            'test_results': test_results,
            'comparison': comparison_df,
            'best_model_path': best_model_path
        }
        # Keep the outcome on the instance too; the returned dict itself keeps
        # exactly the four documented keys.
        self.results = {**summary, 'trainer_metrics': trainer_metrics}
        return summary


In [13]:
## 8. Quick Training Script

def quick_train():
    """Build a CompletePipeline and run it with the fast default settings.

    Returns:
        The dict returned by ``CompletePipeline.run_complete_pipeline``.
    """
    # BERT stays off for speed - flip train_bert to True if you have GPU and time.
    run_options = {
        'train_bert': False,
        'train_deep_nn': True,
    }
    return CompletePipeline().run_complete_pipeline(**run_options)

## 9. Individual Model Training Functions

def train_single_model(model_type='random_forest'):
    """Train, validate and save a single traditional ML model.

    Args:
        model_type: One of ``'random_forest'``, ``'gradient_boosting'`` or
            ``'logistic_regression'``.

    Returns:
        Tuple ``(model, val_metrics, save_path)``: the fitted estimator, a
        dict with ``accuracy`` / ``precision`` / ``recall`` / ``f1`` computed
        on the validation split (weighted averages), and the value returned
        by ``ModelManager.save_model``.

    Raises:
        ValueError: If ``model_type`` is not supported. Raised before any
            data is loaded.
    """
    # Lazy factories: only the requested estimator is ever constructed.
    model_factories = {
        'random_forest': lambda: RandomForestClassifier(
            n_estimators=100, random_state=42, class_weight='balanced'
        ),
        'gradient_boosting': lambda: GradientBoostingClassifier(
            n_estimators=100, random_state=42
        ),
        'logistic_regression': lambda: LogisticRegression(
            random_state=42, max_iter=1000, class_weight='balanced'
        ),
    }

    # Validate first so a typo fails immediately instead of after the whole
    # dataset has been read from disk.
    build_model = model_factories.get(model_type) if isinstance(model_type, str) else None
    if build_model is None:
        raise ValueError(f"Unknown model type: {model_type}")

    print(f"Training {model_type} model...")

    # Load data (the test split is not used here)
    data_loader = DataLoader()
    (X_train, y_train), (X_val, y_val), _test_split = data_loader.load_training_data()

    # Initialize and train model
    model = build_model()
    model.fit(X_train, y_train)

    # Evaluate
    val_pred = model.predict(X_val)
    val_metrics = {
        'accuracy': accuracy_score(y_val, val_pred),
        'precision': precision_score(y_val, val_pred, average='weighted'),
        'recall': recall_score(y_val, val_pred, average='weighted'),
        'f1': f1_score(y_val, val_pred, average='weighted')
    }

    print(f"Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Save model
    manager = ModelManager()
    save_path = manager.save_model(model, model_type, val_metrics)

    return model, val_metrics, save_path

In [14]:
## 10. Usage Instructions

if __name__ == "__main__":
    # Banner plus a short menu of the entry points defined in this notebook.
    print(
        "\n" + "=" * 70,
        "FAKE NEWS DETECTION MODEL TRAINING",
        "=" * 70,
        "\nAvailable training options:",
        "\n1. Quick Training (Recommended):",
        "   results = quick_train()",
        "\n2. Complete Pipeline:",
        "   pipeline = CompletePipeline()",
        "   results = pipeline.run_complete_pipeline(train_bert=False, train_deep_nn=True)",
        "\n3. Single Model Training:",
        "   model, metrics, path = train_single_model('random_forest')",
        "\n4. Custom Training:",
        "   # Use individual trainer classes for custom workflows",
        "\nStarting quick training...",
        "=" * 70 + "\n",
        sep="\n",
    )

    # Kick off the default (no-BERT) run; `results` stays available afterwards.
    results = quick_train()

    # Remind the reader which keys the returned dict exposes.
    print(
        "\nTraining complete! Access results with:",
        "  - results['models'] - All trained models",
        "  - results['test_results'] - Test set evaluation",
        "  - results['comparison'] - Model comparison dataframe",
        "  - results['best_model_path'] - Path to best model",
        sep="\n",
    )


FAKE NEWS DETECTION MODEL TRAINING

Available training options:

1. Quick Training (Recommended):
   results = quick_train()

2. Complete Pipeline:
   pipeline = CompletePipeline()
   results = pipeline.run_complete_pipeline(train_bert=False, train_deep_nn=True)

3. Single Model Training:
   model, metrics, path = train_single_model('random_forest')

4. Custom Training:
   # Use individual trainer classes for custom workflows

Starting quick training...

Using device: cuda

STARTING COMPLETE MODEL TRAINING PIPELINE

STEP 1: Loading Data
----------------------------------------------------------------------
Loading training data...
Training set: (3500, 2031)
Validation set: (500, 2031)
Test set: (1000, 2031)
Features: 2031

STEP 2: Training Traditional ML Models
----------------------------------------------------------------------
Initialized 5 traditional ML models
Training baseline models...


Training models:  20%|██        | 1/5 [00:00<00:03,  1.10it/s]

logistic_regression: Val Accuracy = 0.7700, Val F1 = 0.7701, Time = 0.91s


Training models:  40%|████      | 2/5 [00:02<00:04,  1.45s/it]

random_forest: Val Accuracy = 0.8380, Val F1 = 0.8307, Time = 1.83s


Training models:  60%|██████    | 3/5 [00:10<00:08,  4.45s/it]

gradient_boosting: Val Accuracy = 0.8440, Val F1 = 0.8374, Time = 8.01s


Training models:  80%|████████  | 4/5 [01:12<00:26, 26.96s/it]

svm: Val Accuracy = 0.8240, Val F1 = 0.8176, Time = 61.46s


Training models: 100%|██████████| 5/5 [01:12<00:00, 14.49s/it]

naive_bayes: Val Accuracy = 0.7500, Val F1 = 0.7512, Time = 0.26s

STEP 3: Hyperparameter Optimization
----------------------------------------------------------------------
Starting hyperparameter optimization...
Optimizing random_forest...




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/home/ghost/anaconda3/envs/fake_news/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/ghost/anaconda3/envs/fake_news/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ghost/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/home/ghost/anaconda3/envs/fake_news/lib/python3.10/multiprocessing/process.p

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.