In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             ConfusionMatrixDisplay, roc_curve, auc)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

In [12]:
# Read the feature CSV and shuffle its rows with a fixed seed so that any
# ordering present in the file cannot carry over into the later splits.
data = (
    pd.read_csv('D:\\MOOC.fi\\Jupyter\\EEG\\Dataset\\timefeature_data_final.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: relative frequency of each label after the shuffle
print("Class distribution after shuffling:")
print(data['target'].value_counts(normalize=True))

# Separate the label column from the predictor columns
y = data['target']
X = data.drop(columns='target')

Class distribution after shuffling:
target
1    0.500061
0    0.499939
Name: proportion, dtype: float64


In [13]:
# Both splits hold out the same fraction and use the same seed
split_options = {'test_size': 0.2, 'random_state': 42}

# Stage 1: keep 20% of all rows aside as the final test set (stratified on y)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

# Stage 2: carve 20% of the remaining rows out as the validation set that
# early stopping monitors
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, **split_options
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 10484, Val size: 2621, Test size: 3277


In [14]:
# Ratio of negative to positive training labels; offered to the grid below
# as one of the candidate scale_pos_weight values
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])
class_weight = n_negative / n_positive

# XGBoost classifier: up to 1000 boosting rounds, stopping once the eval-set
# log loss has not improved for 50 rounds
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=1000,
    early_stopping_rounds=50,
    random_state=42,
    tree_method='hist',
    device='cuda'
)

# Single-step pipeline wrapping the classifier under the name 'xgb'
pipeline = Pipeline(steps=[('xgb', xgb_classifier)])

# Hyper-parameter grid; the "xgb__" prefix addresses the pipeline's 'xgb' step
param_grid = dict(
    xgb__learning_rate=[0.01, 0.05],
    xgb__max_depth=[3, 5, 7],
    xgb__subsample=[0.6, 0.8],
    xgb__colsample_bytree=[0.6, 0.8],
    xgb__gamma=[0, 0.1],
    xgb__reg_alpha=[0, 0.1],
    xgb__reg_lambda=[0, 0.1],
    xgb__scale_pos_weight=[1, class_weight],
)

In [None]:
# Stratified 3-fold cross-validation over the training portion, ranking
# every parameter combination by ROC AUC
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=inner_cv,
    n_jobs=-1,
    verbose=2,
)

# The held-out validation split is handed to the 'xgb' step as its eval_set,
# which is what early stopping watches during each fit
fit_params = {'xgb__eval_set': [(X_val, y_val)]}
grid_search.fit(X_train, y_train, **fit_params)

best_model = grid_search.best_estimator_

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


In [None]:
def evaluate_model(model, X_test, y_test):
    """Report held-out performance of a fitted binary classifier.

    Prints the ROC AUC and the classification report, then draws the
    confusion matrix and the ROC curve, each on its own figure.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : features of the evaluation set.
    y_test : true labels aligned with ``X_test``.

    Returns
    -------
    The second column of ``model.predict_proba(X_test)``, i.e. the predicted
    probability of the positive class for every row.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("\n=== Final Test Set Performance ===")
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    # from_estimator opens a figure of its own unless it is handed an Axes,
    # so create the Axes explicitly; otherwise the sized figure stays blank
    # and the requested figsize is never applied to the matrix.
    _, ax_cm = plt.subplots(figsize=(8, 6))
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='Blues', ax=ax_cm)
    ax_cm.set_title('Test Set Confusion Matrix')
    plt.show()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    _, ax_roc = plt.subplots(figsize=(8, 6))
    ax_roc.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line: performance of a random classifier
    ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('Test Set ROC Curve')
    ax_roc.legend(loc="lower right")
    plt.show()

    return y_proba

# Evaluate the pipeline selected by the grid search on the held-out test
# split and keep the positive-class probabilities it returns
y_proba = evaluate_model(best_model, X_test, y_test)

In [None]:
def plot_feature_importance(model, feature_names=None, top_n=20):
    """Bar-plot the largest feature importances of a fitted pipeline.

    Parameters
    ----------
    model : fitted pipeline whose ``named_steps`` holds an ``'xgb'`` step
        exposing ``feature_importances_``. A ``'scaler'`` step is optional.
    feature_names : optional sequence of labels, one per importance value.
        When omitted, names are taken from the scaler step if there is one,
        else from the xgb step's ``feature_names_in_``, else from the
        notebook-level ``X.columns``.
    top_n : how many of the highest-ranked features to draw (default 20).
    """
    xgb_step = model.named_steps['xgb']

    # Get feature importances
    importances = xgb_step.feature_importances_

    if feature_names is None:
        # The pipeline is not guaranteed to contain a 'scaler' step, so look
        # it up without raising KeyError when it is absent.
        scaler = model.named_steps.get('scaler')
        if scaler is not None:
            feature_names = scaler.get_feature_names_out(X.columns)
        else:
            feature_names = getattr(xgb_step, 'feature_names_in_', None)
        if feature_names is None:
            feature_names = X.columns

    importance_df = pd.DataFrame({
        'Feature': list(feature_names),
        'Importance': importances
    }).sort_values('Importance', ascending=False).head(top_n)

    plt.figure(figsize=(12, 8))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.title(f'Top {top_n} Feature Importances')
    plt.xlabel('Gain Importance')
    # Largest importance at the top of the chart
    plt.gca().invert_yaxis()
    plt.show()

def shap_analysis(model, X_sample):
    """Draw a SHAP summary plot for the pipeline's XGBoost step.

    Parameters
    ----------
    model : fitted pipeline whose ``named_steps`` holds an ``'xgb'`` step.
        A ``'scaler'`` step is optional; when present, its ``transform`` is
        applied to ``X_sample`` before the values are explained.
    X_sample : rows to explain. If it has a ``columns`` attribute, those
        labels name the features in the plot; otherwise the notebook-level
        ``X.columns`` is used.
    """
    # The pipeline is not guaranteed to contain a 'scaler' step; only
    # transform the sample when one exists instead of raising KeyError.
    scaler = model.named_steps.get('scaler')
    features = X_sample if scaler is None else scaler.transform(X_sample)

    if hasattr(X_sample, 'columns'):
        feature_names = list(X_sample.columns)
    else:
        feature_names = X.columns

    explainer = shap.TreeExplainer(model.named_steps['xgb'])
    shap_values = explainer.shap_values(features)

    plt.figure(figsize=(12, 8))
    # show=False keeps the figure open so the title can still be added
    shap.summary_plot(shap_values, features, feature_names=feature_names, show=False)
    plt.title('SHAP Value Distribution')
    plt.show()

# Use subset for SHAP analysis. The sample size is capped at the number of
# test rows because sampling without replacement raises when n exceeds it.
shap_sample = X_test.sample(n=min(500, len(X_test)), random_state=42)
shap_analysis(best_model, shap_sample)
plot_feature_importance(best_model)

In [None]:
def plot_learning_curve(model):
    """Plot the validation log loss recorded for each boosting iteration.

    Parameters
    ----------
    model : fitted pipeline whose ``'xgb'`` step exposes ``evals_result()``
        containing a ``'validation_0'`` entry with a ``'logloss'`` series.
    """
    xgb_step = model.named_steps['xgb']
    history = xgb_step.evals_result()

    plt.figure(figsize=(12, 6))
    validation_loss = history['validation_0']['logloss']
    plt.plot(validation_loss, label='Validation Loss')
    plt.title('Learning Curve')
    plt.xlabel('Iterations')
    plt.ylabel('Log Loss')
    plt.legend()
    plt.show()

# Plot the validation log loss recorded by the selected pipeline's 'xgb' step
plot_learning_curve(best_model)

In [None]:
print("\n=== Optimal Parameters ===")
print(grid_search.best_params_)

# Retrain on combined train-val data
# The tuned pipeline still has early_stopping_rounds set, and XGBoost refuses
# to fit with early stopping when no eval_set is supplied. The validation
# rows are now part of the training data, so there is nothing left to stop
# on: reuse the round count early stopping selected and switch it off.
tuned_pipeline = grid_search.best_estimator_
best_n_rounds = tuned_pipeline.named_steps['xgb'].best_iteration + 1  # best_iteration is 0-based

# Fit a clone so the already-evaluated best_estimator_ is not overwritten
final_model = clone(tuned_pipeline).set_params(
    xgb__n_estimators=best_n_rounds,
    xgb__early_stopping_rounds=None,
)
final_model.fit(X_trainval, y_trainval)