In [1]:
# modeling notebook enhanced by https://claude.ai/

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import (classification_report, accuracy_score, 
                            confusion_matrix, ConfusionMatrixDisplay, 
                            roc_curve, auc)
from sklearn.preprocessing import label_binarize
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint, uniform
import time
import os

In [2]:
# Create directories for saving results
os.makedirs('results', exist_ok=True)
os.makedirs('results/baseline', exist_ok=True)
os.makedirs('results/hybrid', exist_ok=True)

In [3]:
# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

In [10]:
# Function to run complete modeling pipeline
def run_model_pipeline(dataset_type='baseline'):
    print(f"\n{'='*50}")
    print(f"MODELING PIPELINE FOR {dataset_type.upper()} DATASET")
    print(f"{'='*50}\n")
    
    # Load the appropriate datasets
    train_path = f'../data/train_{dataset_type}.csv'
    test_path = f'../data/test_{dataset_type}.csv'
    
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print(f"Successfully loaded {dataset_type} datasets:")
        print(f"Training shape: {train_df.shape}")
        print(f"Testing shape: {test_df.shape}")
    except FileNotFoundError:
        print(f"Error: Could not find {dataset_type} dataset files. Please check file paths.")
        return
    
    # Define features and target
    X_train = train_df.drop('Event Classification', axis=1)
    y_train = train_df['Event Classification']
    X_test = test_df.drop('Event Classification', axis=1)
    y_test = test_df['Event Classification']
    
    # Ensure test data has same columns as train data
    missing_cols = set(X_train.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0
    X_test = X_test[X_train.columns]
    
    # Encode target
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    
    class_names = le.classes_
    print(f"\nTarget Classes: {class_names}")
    
    # Class distribution
    print("\nTraining class distribution:")
    print(pd.Series(y_train_encoded).value_counts(normalize=True))
    
    # Cross-validation with different models
    results = {}
    
    # 1. Random Forest Cross-Validation
    print("\n--- Random Forest Cross-Validation ---")
    rf_results = cross_validate_model(
        X_train, y_train_encoded,
        RandomForestClassifier(random_state=42, class_weight='balanced'),
        "Random Forest"
    )
    results["Random Forest"] = rf_results
    
    # 2. XGBoost Cross-Validation
    print("\n--- XGBoost Cross-Validation ---")
    xgb_results = cross_validate_model(
        X_train, y_train_encoded,
        XGBClassifier(eval_metric='mlogloss', random_state=42),
        "XGBoost"
    )
    results["XGBoost"] = xgb_results
    
    # 3. CatBoost Cross-Validation
    print("\n--- CatBoost Cross-Validation ---")
    cat_results = cross_validate_model(
        X_train, y_train_encoded,
        CatBoostClassifier(verbose=0, random_state=42),
        "CatBoost"
    )
    results["CatBoost"] = cat_results
    
    # 4. MLP Cross-Validation
    print("\n--- MLP Cross-Validation ---")
    mlp_results = cross_validate_model(
        X_train, y_train_encoded,
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
        "MLPClassifier"
    )
    results["MLPClassifier"] = mlp_results
    
    # Summarize cross-validation results
    print("\n--- Cross-Validation Summary ---")
    summary_df = pd.DataFrame({
        'Model': list(results.keys()),
        'Mean Accuracy': [r['mean_accuracy'] for r in results.values()],
        'Std Dev': [r['std_accuracy'] for r in results.values()]
    })
    print(summary_df.sort_values('Mean Accuracy', ascending=False))
    
    # Identify best model
    best_model_name = summary_df.sort_values('Mean Accuracy', ascending=False).iloc[0]['Model']
    print(f"\nBest model based on cross-validation: {best_model_name}")
    
    # Hyperparameter tuning for the best model
    print(f"\n--- Hyperparameter Tuning for {best_model_name} ---")
    best_params, best_score = tune_hyperparameters(X_train, y_train_encoded, best_model_name)
    print(f"Best parameters: {best_params}")
    print(f"Best score: {best_score:.4f}")
    
    # Train final model with best parameters
    print("\n--- Training Final Model ---")
    final_model = train_final_model(X_train, y_train_encoded, best_model_name, best_params)
    
    # Apply SMOTE for final training
    sm = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train_encoded)
    print(f"Original training shape: {X_train.shape}, Resampled shape: {X_train_resampled.shape}")
    
    final_model.fit(X_train_resampled, y_train_resampled)
    
    # Evaluate on test set
    print("\n--- Final Model Evaluation on Test Set ---")
    y_pred = final_model.predict(X_test)
    
    # Convert encoded predictions back to original labels
    y_pred_labels = le.inverse_transform(y_pred)
    y_test_labels = y_test.values
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    
    # Print classification report
    print("\nClassification Report:")
    report = classification_report(y_test_encoded, y_pred, target_names=class_names)
    print(report)
    
    # Save classification report to file
    report_dict = classification_report(y_test_encoded, y_pred, target_names=class_names, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.to_csv(f'results/{dataset_type}/{best_model_name}_classification_report.csv')
    
    # Plot confusion matrix
    cm = confusion_matrix(y_test_encoded, y_pred)
    plt.figure(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap='Blues', values_format='d')
    plt.title(f'Confusion Matrix - {best_model_name}')
    plt.savefig(f'results/{dataset_type}/{best_model_name}_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Normalized confusion matrix
    plt.figure(figsize=(10, 8))
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_names)
    disp.plot(cmap='Blues', values_format='.2f')
    plt.title(f'Normalized Confusion Matrix - {best_model_name}')
    plt.savefig(f'results/{dataset_type}/{best_model_name}_normalized_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Feature importance (if applicable)
    if best_model_name in ["Random Forest", "XGBoost", "CatBoost"]:
        plot_feature_importance(final_model, X_train.columns, best_model_name, dataset_type)
    
    # ROC curve for multiclass
    plot_roc_curve(final_model, X_test, y_test_encoded, class_names, best_model_name, dataset_type)
    
    # Return the results for later comparison
    return {
        'model_name': best_model_name,
        'accuracy': accuracy,
        'model': final_model,
        'report': report_dict,
        'confusion_matrix': cm,
        'class_names': class_names
    }

In [5]:
def cross_validate_model(X, y, model, model_name, n_folds=5):
    """Run cross-validation for a model and return performance metrics"""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracies = []
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
        print(f"Fold {fold}/{n_folds}...")
        
        # Split data
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y[train_idx], y[valid_idx]
        
        # Apply SMOTE
        sm = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = sm.fit_resample(X_train_fold, y_train_fold)
        
        # Train model
        model.fit(X_train_resampled, y_train_resampled)
        
        # Evaluate
        y_pred = model.predict(X_valid_fold)
        accuracy = accuracy_score(y_valid_fold, y_pred)
        fold_accuracies.append(accuracy)
        
        print(f"  Accuracy: {accuracy:.4f}")
    
    # Summarize results
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    
    print(f"\n{model_name} - {n_folds}-Fold CV Results:")
    print(f"Mean Accuracy: {mean_accuracy:.4f} (±{std_accuracy:.4f})")
    
    return {
        'fold_accuracies': fold_accuracies,
        'mean_accuracy': mean_accuracy,
        'std_accuracy': std_accuracy
    }

def tune_hyperparameters(X, y, model_name):
    """Tune hyperparameters for the specified model"""
    
    if model_name == "Random Forest":
        model = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': randint(100, 300),
            'max_depth': randint(5, 30),
            'min_samples_split': randint(2, 10),
            'min_samples_leaf': randint(1, 5),
            'max_features': ['sqrt', 'log2', None]
        }
    
    elif model_name == "XGBoost":
        model = XGBClassifier(eval_metric='mlogloss', random_state=42)
        param_grid = {
            'n_estimators': randint(100, 300),
            'max_depth': randint(3, 15),
            'learning_rate': uniform(0.01, 0.3),
            'subsample': uniform(0.5, 0.5),
            'colsample_bytree': uniform(0.5, 0.5)
        }
    
    elif model_name == "CatBoost":
        model = CatBoostClassifier(verbose=0, random_state=42)
        param_grid = {
            'iterations': randint(100, 500),
            'depth': randint(4, 10),
            'learning_rate': uniform(0.01, 0.3),
            'l2_leaf_reg': uniform(1, 10),
            'border_count': randint(32, 255)
        }
    
    elif model_name == "MLPClassifier":
        model = MLPClassifier(random_state=42)
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam'],
            'alpha': [1e-5, 1e-4, 1e-3],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [300, 500]
        }
    
    else:
        raise ValueError(f"Unknown model: {model_name}")
    
    # Create RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=20,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    
    # Fit the model
    start_time = time.time()
    random_search.fit(X, y)
    end_time = time.time()
    
    print(f"Hyperparameter tuning completed in {end_time - start_time:.2f} seconds")
    
    return random_search.best_params_, random_search.best_score_

In [6]:
def train_final_model(X, y, model_name, best_params):
    """Train the final model with the best parameters"""
    
    if model_name == "Random Forest":
        model = RandomForestClassifier(random_state=42, **best_params)
    
    elif model_name == "XGBoost":
        model = XGBClassifier(eval_metric='mlogloss', random_state=42, **best_params)
    
    elif model_name == "CatBoost":
        model = CatBoostClassifier(verbose=0, random_state=42, **best_params)
    
    elif model_name == "MLPClassifier":
        model = MLPClassifier(random_state=42, **best_params)
    
    else:
        raise ValueError(f"Unknown model: {model_name}")
    
    return model

In [7]:
def plot_feature_importance(model, feature_names, model_name, dataset_type):
    """Plot feature importances for tree-based models"""
    
    if model_name == "Random Forest":
        importances = model.feature_importances_
    elif model_name == "XGBoost":
        importances = model.feature_importances_
    elif model_name == "CatBoost":
        importances = model.get_feature_importance()
    else:
        return
    
    # Create DataFrame for plotting
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)
    
    # Plot top 15 features
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(15))
    plt.title(f'Top 15 Feature Importances - {model_name}')
    plt.tight_layout()
    plt.savefig(f'results/{dataset_type}/{model_name}_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save importance DataFrame
    importance_df.to_csv(f'results/{dataset_type}/{model_name}_feature_importance.csv', index=False)

In [8]:
def plot_roc_curve(model, X_test, y_test, class_names, model_name, dataset_type):
    """Plot ROC curves for multiclass classification"""
    
    # Binarize the output for ROC curve
    n_classes = len(class_names)
    y_test_bin = label_binarize(y_test, classes=range(n_classes))
    
    # Get prediction probabilities
    try:
        y_score = model.predict_proba(X_test)
        
        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        
        # Plot all ROC curves
        plt.figure(figsize=(10, 8))
        
        for i in range(n_classes):
            plt.plot(fpr[i], tpr[i], lw=2,
                    label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')
        
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curves - {model_name}')
        plt.legend(loc="lower right")
        plt.savefig(f'results/{dataset_type}/{model_name}_roc_curves.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    except (AttributeError, ValueError) as e:
        print(f"Could not generate ROC curve: {e}")

In [11]:
# Run the modeling pipeline for baseline dataset
baseline_results = run_model_pipeline('baseline')


MODELING PIPELINE FOR BASELINE DATASET

Successfully loaded baseline datasets:
Training shape: (76065, 30)
Testing shape: (19017, 30)

Target Classes: ['Class I' 'Class II' 'Class III']

Training class distribution:
1    0.708065
0    0.211516
2    0.080418
Name: proportion, dtype: float64

--- Random Forest Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9919
Fold 2/5...
  Accuracy: 0.9914
Fold 3/5...
  Accuracy: 0.9923
Fold 4/5...
  Accuracy: 0.9906
Fold 5/5...
  Accuracy: 0.9924

Random Forest - 5-Fold CV Results:
Mean Accuracy: 0.9917 (±0.0007)

--- XGBoost Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9911
Fold 2/5...
  Accuracy: 0.9899
Fold 3/5...
  Accuracy: 0.9909
Fold 4/5...
  Accuracy: 0.9894
Fold 5/5...
  Accuracy: 0.9910

XGBoost - 5-Fold CV Results:
Mean Accuracy: 0.9905 (±0.0007)

--- CatBoost Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9907
Fold 2/5...
  Accuracy: 0.9891
Fold 3/5...
  Accuracy: 0.9899
Fold 4/5...
  Accuracy: 0.9889
Fold 5/5...
  Accuracy: 0.9903

Cat

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

In [12]:
# Run the modeling pipeline for hybrid dataset
hybrid_results = run_model_pipeline('hybrid')


MODELING PIPELINE FOR HYBRID DATASET

Successfully loaded hybrid datasets:
Training shape: (76065, 330)
Testing shape: (19017, 330)

Target Classes: ['Class I' 'Class II' 'Class III']

Training class distribution:
1    0.708065
0    0.211516
2    0.080418
Name: proportion, dtype: float64

--- Random Forest Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9918
Fold 2/5...
  Accuracy: 0.9908
Fold 3/5...
  Accuracy: 0.9917
Fold 4/5...
  Accuracy: 0.9911
Fold 5/5...
  Accuracy: 0.9918

Random Forest - 5-Fold CV Results:
Mean Accuracy: 0.9914 (±0.0004)

--- XGBoost Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9930
Fold 2/5...
  Accuracy: 0.9909
Fold 3/5...
  Accuracy: 0.9924
Fold 4/5...
  Accuracy: 0.9913
Fold 5/5...
  Accuracy: 0.9930

XGBoost - 5-Fold CV Results:
Mean Accuracy: 0.9922 (±0.0009)

--- CatBoost Cross-Validation ---
Fold 1/5...
  Accuracy: 0.9927
Fold 2/5...
  Accuracy: 0.9899
Fold 3/5...
  Accuracy: 0.9913
Fold 4/5...
  Accuracy: 0.9909
Fold 5/5...
  Accuracy: 0.9922

CatBo

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

In [13]:
# Compare baseline vs hybrid results
if baseline_results and hybrid_results:
    print("\n" + "="*50)
    print("BASELINE VS HYBRID MODEL COMPARISON")
    print("="*50)
    
    comparison_df = pd.DataFrame({
        'Dataset': ['Baseline', 'Hybrid'],
        'Best Model': [baseline_results['model_name'], hybrid_results['model_name']],
        'Accuracy': [baseline_results['accuracy'], hybrid_results['accuracy']]
    })
    
    print(comparison_df)
    
    # Create comparison bar chart
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Dataset', y='Accuracy', data=comparison_df)
    plt.title('Baseline vs Hybrid Model Performance')
    plt.ylim(0.8, 1.0)  # Set y-axis to focus on the high accuracy range
    plt.savefig('results/baseline_vs_hybrid_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save comparison results
    comparison_df.to_csv('results/baseline_vs_hybrid_comparison.csv', index=False)
    
    # Determine which approach is better
    better_model = 'Hybrid' if hybrid_results['accuracy'] > baseline_results['accuracy'] else 'Baseline'
    improvement = abs(hybrid_results['accuracy'] - baseline_results['accuracy']) * 100
    
    print(f"\nThe {better_model} approach performs better with an improvement of {improvement:.2f}% in accuracy.")
    
    # Create detailed performance comparison
    # Extract precision, recall, f1-score for each class
    baseline_metrics = pd.DataFrame(baseline_results['report']).T
    hybrid_metrics = pd.DataFrame(hybrid_results['report']).T
    
    # Create detailed comparison DataFrame
    detailed_comparison = pd.DataFrame()
    
    for class_name in baseline_results['class_names']:
        for metric in ['precision', 'recall', 'f1-score']:
            baseline_value = baseline_metrics.loc[class_name, metric]
            hybrid_value = hybrid_metrics.loc[class_name, metric]
            
            detailed_comparison.loc[f"{class_name} {metric}", 'Baseline'] = baseline_value
            detailed_comparison.loc[f"{class_name} {metric}", 'Hybrid'] = hybrid_value
            detailed_comparison.loc[f"{class_name} {metric}", 'Difference'] = hybrid_value - baseline_value
    
    # Add overall metrics
    detailed_comparison.loc['Overall accuracy', 'Baseline'] = baseline_results['accuracy']
    detailed_comparison.loc['Overall accuracy', 'Hybrid'] = hybrid_results['accuracy']
    detailed_comparison.loc['Overall accuracy', 'Difference'] = hybrid_results['accuracy'] - baseline_results['accuracy']
    
    print("\nDetailed Performance Comparison:")
    print(detailed_comparison)
    
    # Save detailed comparison
    detailed_comparison.to_csv('results/detailed_performance_comparison.csv')
    
    print("\nAnalysis completed. Results saved to the 'results' directory.")


BASELINE VS HYBRID MODEL COMPARISON
    Dataset     Best Model  Accuracy
0  Baseline  Random Forest  0.989851
1    Hybrid  MLPClassifier  0.992428

The Hybrid approach performs better with an improvement of 0.26% in accuracy.

Detailed Performance Comparison:
                     Baseline    Hybrid  Difference
Class I precision    0.999245  0.996747   -0.002498
Class I recall       0.986574  0.990303    0.003729
Class I f1-score     0.992869  0.993515    0.000646
Class II precision   0.996263  0.995316   -0.000946
Class II recall      0.989752  0.994208    0.004456
Class II f1-score    0.992997  0.994762    0.001765
Class III precision  0.916067  0.956688    0.040621
Class III recall     0.999346  0.982341   -0.017005
Class III f1-score   0.955896  0.969345    0.013449
Overall accuracy     0.989851  0.992428    0.002577

Analysis completed. Results saved to the 'results' directory.
