## 1. Setup and Imports

In [1]:
# Core libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# XGBoost
import xgboost as xgb

# Optuna for hyperparameter optimization
import optuna

# Utilities
import joblib
from datetime import datetime
import os

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("All libraries imported successfully!")
print(f"XGBoost version: {xgb.__version__}")
print(f"Optuna version: {optuna.__version__}")

All libraries imported successfully!
XGBoost version: 3.0.4
Optuna version: 4.6.0


## 2. Check GPU Availability

In [2]:
# Check CUDA availability
try:
    import torch
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        print(f"‚úì CUDA is available")
        print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")
        print(f"‚úì CUDA version: {torch.version.cuda}")
    else:
        print("‚ö† CUDA not available, will use CPU")
except:
    print("‚ö† PyTorch not installed, checking XGBoost GPU support directly")
    cuda_available = False

# Set tree method based on GPU availability
TREE_METHOD = 'gpu_hist' if cuda_available else 'hist'
DEVICE = 'cuda' if cuda_available else 'cpu'

print(f"\nXGBoost will use: tree_method='{TREE_METHOD}', device='{DEVICE}'")

‚úì CUDA is available
‚úì GPU: NVIDIA GeForce RTX 4050 Laptop GPU
‚úì CUDA version: 12.8

XGBoost will use: tree_method='gpu_hist', device='cuda'


## 3. Load Engineered Data

In [3]:
# Load engineered datasets
train_df = pd.read_csv('data/data_minihackathon_train_engineered.csv')
test_df = pd.read_csv('data/data_minihackathon_test_engineered.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTarget distribution:")
print(train_df['drug_category'].value_counts())
print(f"\nPercentages:")
print(train_df['drug_category'].value_counts(normalize=True) * 100)

# Separate features and target
X = train_df.drop(['drug_category', 'id'], axis=1, errors='ignore')
y = train_df['drug_category']
X_test = test_df.drop(['id'], axis=1, errors='ignore')
test_ids = test_df['id'] if 'id' in test_df.columns else np.arange(len(test_df))

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"\nFeatures shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"\nClass encoding: {dict(enumerate(le.classes_))}")
print(f"\nFeature names: {list(X.columns)}")

Train shape: (1500, 46)
Test shape: (377, 45)

Target distribution:
drug_category
Hallucinogens    691
Stimulants       567
Depressants      242
Name: count, dtype: int64

Percentages:
drug_category
Hallucinogens    46.066667
Stimulants       37.800000
Depressants      16.133333
Name: proportion, dtype: float64

Features shape: (1500, 45)
Test features shape: (377, 45)

Class encoding: {0: 'Depressants', 1: 'Hallucinogens', 2: 'Stimulants'}

Feature names: ['ID', 'Age', 'Education', 'Nscore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS', 'HighRisk_Score', 'SensationRisk_Score', 'EmotionalInstability', 'Social_Risk', 'Behavioral_Disinhibition', 'NE_Balance', 'BigFive_Std', 'RiskFactor_Mean', 'Impulsive_x_SS', 'Nscore_x_SS', 'Escore_x_Oscore', 'Cscore_x_SS', 'Oscore_x_Impulsive', 'Nscore_squared', 'Nscore_cubed', 'Escore_squared', 'Ascore_squared', 'Impulsive_squared', 'Impulsive_cubed', 'SS_squared', 'SS_cubed', 'Impulsive_to_Cscore_ratio', 'SS_to_Cscore_ratio', 'Nscore_to_Ascore_rat

## 4. Cross-Validation Setup

In [None]:
# Configuration
N_FOLDS = 10
RANDOM_STATE = 42
N_TRIALS = 20  # Number of Optuna trials (increase for better optimization)

# Stratified K-Fold for cross-validation
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

print(f"Using {N_FOLDS}-fold stratified cross-validation")
print(f"Random state: {RANDOM_STATE}")
print(f"Optuna trials: {N_TRIALS}")

Using 10-fold stratified cross-validation
Random state: 42
Optuna trials: 50


## 5. XGBoost Hyperparameter Optimization with Optuna

In [5]:
%%time

print("\n" + "="*80)
print("XGBoost Hyperparameter Optimization with Optuna")
print("="*80)

def objective_xgb(trial):
    """Optuna objective function for XGBoost hyperparameter optimization"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': RANDOM_STATE,
        'tree_method': TREE_METHOD,
        'device': DEVICE,
        'eval_metric': 'mlogloss'
    }
    
    # Cross-validation
    model = xgb.XGBClassifier(**params)
    cv_results = cross_validate(model, X, y_encoded, cv=skf, 
                               scoring='accuracy', n_jobs=-1)
    
    return cv_results['test_score'].mean()

# Create and run Optuna study
study = optuna.create_study(direction='maximize', study_name='xgboost_optimization')
study.optimize(objective_xgb, n_trials=N_TRIALS, show_progress_bar=True)

print(f"\n{'='*80}")
print("OPTIMIZATION RESULTS")
print("="*80)
print(f"Best CV Accuracy: {study.best_value:.4f}")
print(f"\nBest parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[I 2025-11-16 15:26:55,545] A new study created in memory with name: xgboost_optimization



XGBoost Hyperparameter Optimization with Optuna


Best trial: 0. Best value: 0.708:   2%|‚ñè         | 1/50 [00:19<15:50, 19.39s/it]

[I 2025-11-16 15:27:14,930] Trial 0 finished with value: 0.708 and parameters: {'n_estimators': 153, 'max_depth': 6, 'learning_rate': 0.01980403666076474, 'min_child_weight': 8, 'subsample': 0.7595222843319884, 'colsample_bytree': 0.7705417604662863, 'gamma': 0.09193289652122782, 'reg_alpha': 2.018536773073783, 'reg_lambda': 3.972051387668687}. Best is trial 0 with value: 0.708.


Best trial: 1. Best value: 0.716:   4%|‚ñç         | 2/50 [00:48<20:11, 25.24s/it]

[I 2025-11-16 15:27:44,267] Trial 1 finished with value: 0.716 and parameters: {'n_estimators': 305, 'max_depth': 5, 'learning_rate': 0.019171288116434055, 'min_child_weight': 2, 'subsample': 0.6288869932030002, 'colsample_bytree': 0.6005691175115523, 'gamma': 0.458220835655046, 'reg_alpha': 4.408766354278014, 'reg_lambda': 4.867849813651781}. Best is trial 1 with value: 0.716.


Best trial: 1. Best value: 0.716:   6%|‚ñå         | 3/50 [02:01<36:36, 46.73s/it]

[I 2025-11-16 15:28:56,563] Trial 2 finished with value: 0.712 and parameters: {'n_estimators': 926, 'max_depth': 6, 'learning_rate': 0.0037486045046647765, 'min_child_weight': 1, 'subsample': 0.6023313645160633, 'colsample_bytree': 0.9721810776217322, 'gamma': 0.40945713181840415, 'reg_alpha': 4.059071560664168, 'reg_lambda': 4.363298956759931}. Best is trial 1 with value: 0.716.


Best trial: 1. Best value: 0.716:   6%|‚ñå         | 3/50 [02:55<45:46, 58.44s/it]


[W 2025-11-16 15:29:50,847] Trial 3 failed with parameters: {'n_estimators': 729, 'max_depth': 6, 'learning_rate': 0.0017642737489484866, 'min_child_weight': 5, 'subsample': 0.8880673120629803, 'colsample_bytree': 0.8712197112986811, 'gamma': 0.8810732971275737, 'reg_alpha': 0.8973697760086824, 'reg_lambda': 3.0522352767974152} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\HP\anaconda3\envs\Vision\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<timed exec>", line 25, in objective_xgb
  File "c:\Users\HP\anaconda3\envs\Vision\Lib\site-packages\sklearn\utils\_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\anaconda3\envs\Vision\Lib\site-packages\sklearn\model_selection\_validation.py", line 399, in cross_validate
    results = parallel(
              ^^

KeyboardInterrupt: 

## 6. Visualize Optimization History

In [None]:
# Create visualizations directory
os.makedirs('visualizations', exist_ok=True)

# Plot optimization history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Optimization history
trial_numbers = [trial.number for trial in study.trials]
trial_values = [trial.value for trial in study.trials]
best_values = [max(trial_values[:i+1]) for i in range(len(trial_values))]

axes[0].plot(trial_numbers, trial_values, 'o-', alpha=0.6, label='Trial Accuracy')
axes[0].plot(trial_numbers, best_values, 'r-', linewidth=2, label='Best Accuracy')
axes[0].set_xlabel('Trial Number')
axes[0].set_ylabel('CV Accuracy')
axes[0].set_title('Optuna Optimization History')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Parameter importance (if enough trials)
if len(study.trials) >= 10:
    try:
        importance = optuna.importance.get_param_importances(study)
        params = list(importance.keys())
        values = list(importance.values())
        
        axes[1].barh(params, values)
        axes[1].set_xlabel('Importance')
        axes[1].set_title('Hyperparameter Importance')
        axes[1].grid(True, alpha=0.3, axis='x')
    except:
        axes[1].text(0.5, 0.5, 'Insufficient trials for importance analysis', 
                    ha='center', va='center', transform=axes[1].transAxes)
        axes[1].axis('off')
else:
    axes[1].text(0.5, 0.5, 'Insufficient trials for importance analysis', 
                ha='center', va='center', transform=axes[1].transAxes)
    axes[1].axis('off')

plt.tight_layout()
plt.savefig('visualizations/xgboost_optimization_history.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nOptimization visualization saved to: visualizations/xgboost_optimization_history.png")

## 7. Train Final Model with Best Parameters

In [None]:
%%time

print("\n" + "="*80)
print("Training Final XGBoost Model")
print("="*80)

# Create model with best parameters
best_xgb = xgb.XGBClassifier(**study.best_params)

# Evaluate with cross-validation
cv_results = cross_validate(best_xgb, X, y_encoded, cv=skf,
                           scoring=['accuracy', 'f1_macro'], 
                           return_train_score=True)

cv_accuracy = cv_results['test_accuracy'].mean()
cv_std = cv_results['test_accuracy'].std()
f1_macro = cv_results['test_f1_macro'].mean()

print(f"\nCross-Validation Results:")
print(f"  CV Accuracy: {cv_accuracy:.4f} ¬± {cv_std:.4f}")
print(f"  F1-Macro: {f1_macro:.4f}")
print(f"  Fold Accuracies: {cv_results['test_accuracy']}")

# Train on full training data
print(f"\nTraining on full dataset...")
best_xgb.fit(X, y_encoded)

# Training predictions
train_pred = best_xgb.predict(X)
train_accuracy = accuracy_score(y_encoded, train_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"\nClassification Report (Training):")
print(classification_report(y_encoded, train_pred, target_names=le.classes_))

## 8. Confusion Matrix Visualization

In [None]:
# Confusion matrix
cm = confusion_matrix(y_encoded, train_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le.classes_, yticklabels=le.classes_,
            cbar_kws={'label': 'Count'})
plt.title('XGBoost - Confusion Matrix (Training)', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)

# Add accuracy text
plt.text(1.5, -0.3, f'Training Accuracy: {train_accuracy:.4f}', 
         fontsize=12, fontweight='bold', ha='center')
plt.text(1.5, -0.5, f'CV Accuracy: {cv_accuracy:.4f} ¬± {cv_std:.4f}', 
         fontsize=12, ha='center')

plt.tight_layout()
plt.savefig('visualizations/xgboost_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix saved to: visualizations/xgboost_confusion_matrix.png")

## 9. Feature Importance Analysis

In [None]:
# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_xgb.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(feature_importance.head(20).to_string(index=False))

# Visualize top 20 features
plt.figure(figsize=(12, 8))
top_20 = feature_importance.head(20)
plt.barh(top_20['feature'], top_20['importance'])
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('XGBoost - Top 20 Feature Importances', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('visualizations/xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nFeature importance visualization saved to: visualizations/xgboost_feature_importance.png")

# Save full feature importance
feature_importance.to_csv('visualizations/xgboost_feature_importance.csv', index=False)
print("Full feature importance saved to: visualizations/xgboost_feature_importance.csv")

## 10. Generate Test Predictions and Submission File

In [None]:
%%time

print("\n" + "="*80)
print("Generating Test Predictions")
print("="*80)

# Make predictions on test set
test_pred_encoded = best_xgb.predict(X_test)
test_pred = le.inverse_transform(test_pred_encoded)

# Get prediction probabilities
test_pred_proba = best_xgb.predict_proba(X_test)

print(f"\nTest predictions generated: {len(test_pred)}")
print(f"\nPrediction distribution:")
pred_dist = pd.Series(test_pred).value_counts()
print(pred_dist)
print(f"\nPrediction percentages:")
print(pred_dist / len(test_pred) * 100)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'drug_category': test_pred
})

# Save submission
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f"submission_xgboost_{timestamp}.csv"
submission.to_csv(submission_filename, index=False)

print(f"\n{'='*80}")
print(f"SUBMISSION FILE CREATED: {submission_filename}")
print("="*80)
print(f"\nFirst 10 predictions:")
print(submission.head(10).to_string(index=False))
print(f"\nLast 10 predictions:")
print(submission.tail(10).to_string(index=False))

## 11. Save Model and Results

In [None]:
# Create models directory
os.makedirs('models', exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save model
model_filename = f"models/xgboost_model_{timestamp}.pkl"
joblib.dump(best_xgb, model_filename)
print(f"Model saved: {model_filename}")

# Save label encoder
le_filename = f"models/label_encoder_{timestamp}.pkl"
joblib.dump(le, le_filename)
print(f"Label encoder saved: {le_filename}")

# Save optimization study
study_filename = f"models/optuna_study_{timestamp}.pkl"
joblib.dump(study, study_filename)
print(f"Optuna study saved: {study_filename}")

# Save results summary
results = {
    'timestamp': timestamp,
    'cv_accuracy': cv_accuracy,
    'cv_std': cv_std,
    'f1_macro': f1_macro,
    'train_accuracy': train_accuracy,
    'best_params': study.best_params,
    'n_trials': N_TRIALS,
    'n_folds': N_FOLDS,
    'tree_method': TREE_METHOD,
    'device': DEVICE
}

results_filename = f"models/xgboost_results_{timestamp}.pkl"
joblib.dump(results, results_filename)
print(f"Results saved: {results_filename}")

# Save results as JSON for easy viewing
import json
results_json_filename = f"models/xgboost_results_{timestamp}.json"
with open(results_json_filename, 'w') as f:
    json.dump(results, f, indent=2)
print(f"Results (JSON) saved: {results_json_filename}")

## 12. Final Summary

In [None]:
print("\n" + "="*80)
print("XGBOOST TRAINING COMPLETE - SUMMARY")
print("="*80)

print(f"\nüìä PERFORMANCE METRICS:")
print(f"  ‚îú‚îÄ Cross-Validation Accuracy: {cv_accuracy:.4f} ¬± {cv_std:.4f}")
print(f"  ‚îú‚îÄ F1-Macro Score: {f1_macro:.4f}")
print(f"  ‚îî‚îÄ Training Accuracy: {train_accuracy:.4f}")

print(f"\n‚öôÔ∏è  CONFIGURATION:")
print(f"  ‚îú‚îÄ Optimization Trials: {N_TRIALS}")
print(f"  ‚îú‚îÄ Cross-Validation Folds: {N_FOLDS}")
print(f"  ‚îú‚îÄ Tree Method: {TREE_METHOD}")
print(f"  ‚îî‚îÄ Device: {DEVICE}")

print(f"\nüéØ BEST HYPERPARAMETERS:")
for key, value in study.best_params.items():
    if isinstance(value, float):
        print(f"  ‚îú‚îÄ {key}: {value:.6f}")
    else:
        print(f"  ‚îú‚îÄ {key}: {value}")

print(f"\nüìÅ FILES GENERATED:")
print(f"  ‚îú‚îÄ Submission: {submission_filename}")
print(f"  ‚îú‚îÄ Model: {model_filename}")
print(f"  ‚îú‚îÄ Label Encoder: {le_filename}")
print(f"  ‚îú‚îÄ Optuna Study: {study_filename}")
print(f"  ‚îú‚îÄ Results (JSON): {results_json_filename}")
print(f"  ‚îî‚îÄ Visualizations: visualizations/")

print(f"\nüìà PREDICTION DISTRIBUTION:")
for category, count in pred_dist.items():
    percentage = (count / len(test_pred)) * 100
    print(f"  ‚îú‚îÄ {category}: {count} ({percentage:.2f}%)")

print(f"\n{'='*80}")
print(f"‚úÖ READY FOR SUBMISSION: {submission_filename}")
print("="*80)