# Disease Prediction - Model Training and Evaluation

This notebook implements comprehensive model training with hyperparameter optimization, cross-validation, and model evaluation for the disease prediction hackathon.

## 1. Setup and Data Loading

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
import os
import joblib
import json
from datetime import datetime

# Add src to path
sys.path.append('../src')

from models.train import DiseasePredictor
from utils.helpers import *

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           classification_report, confusion_matrix, roc_auc_score)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
import optuna

# Settings
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Set random seeds for reproducibility
np.random.seed(42)

print("🚀 Model training environment setup completed!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

🚀 Model training environment setup completed!
Timestamp: 2025-09-09 11:28:34


In [10]:
# Load the raw data directly and recreate preprocessor
print("Loading data and recreating preprocessor...")

# Load raw data
train_data_raw = pd.read_csv('../data/raw/Training.csv')
test_data_raw = pd.read_csv('../data/raw/Testing.csv')

print(f"Raw training data shape: {train_data_raw.shape}")
print(f"Raw test data shape: {test_data_raw.shape}")

# Get symptom columns (all except 'prognosis')
symptom_columns = [col for col in train_data_raw.columns if col != 'prognosis']
print(f"Number of symptom features: {len(symptom_columns)}")

# Check unique diseases
train_diseases = set(train_data_raw['prognosis'].unique())
test_diseases = set(test_data_raw['prognosis'].unique())
all_diseases = sorted(list(train_diseases.union(test_diseases)))

print(f"Number of unique diseases in training: {len(train_diseases)}")
print(f"Number of unique diseases in test: {len(test_diseases)}")
print(f"Total unique diseases: {len(all_diseases)}")

# Create new label encoder
label_encoder = LabelEncoder()
label_encoder.fit(all_diseases)  # Fit on all diseases

print(f"Label encoder fitted on {len(label_encoder.classes_)} disease classes")
print(f"Disease classes: {list(label_encoder.classes_)[:10]}...")

Loading data and recreating preprocessor...
Raw training data shape: (4920, 133)
Raw test data shape: (42, 133)
Number of symptom features: 132
Number of unique diseases in training: 42
Number of unique diseases in test: 42
Total unique diseases: 42
Label encoder fitted on 42 disease classes
Disease classes: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis']...


## 2. Feature Engineering and Preparation

In [5]:
# Prepare features and targets
X, y = preprocessor.prepare_features_and_target(train_data)
X_test, y_test = preprocessor.prepare_features_and_target(test_data)

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Test matrix shape: {X_test.shape}")

# Feature selection based on EDA insights
print(f"\n🔍 Applying feature selection...")

# Use chi-square test for feature selection
selector = SelectKBest(score_func=chi2, k=100)  # Select top 100 features
X_selected = selector.fit_transform(X, y)
X_test_selected = selector.transform(X_test)

selected_features = selector.get_support()
selected_feature_names = [name for name, selected in zip(preprocessor.get_symptom_names(), selected_features) if selected]

print(f"Selected {len(selected_feature_names)} features out of {len(preprocessor.get_symptom_names())}")
print(f"Selected features shape: {X_selected.shape}")

# Use selected features
X_train_final = X_selected
X_test_final = X_test_selected

ValueError: y contains previously unseen labels: 15

In [4]:
# Create train-validation split
print("Creating train-validation split...")

X_train, X_val, y_train, y_val = train_test_split(
    X_train_final, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test_final.shape}")

# Check class distribution in splits
from collections import Counter

train_dist = Counter(y_train)
val_dist = Counter(y_val)
test_dist = Counter(y_test)

print(f"\nClass distribution (showing first 5 classes):")
for i in range(min(5, len(train_dist))):
    disease_name = preprocessor.decode_predictions([i])[0][:20]  # Truncate long names
    print(f"  {disease_name}: Train={train_dist[i]}, Val={val_dist[i]}, Test={test_dist[i]}")

Creating train-validation split...


NameError: name 'X_train_final' is not defined

## 3. Baseline Models Training

In [6]:
# Initialize the disease predictor
predictor = DiseasePredictor()
predictor.initialize_models()

print("🔥 Starting baseline model training...")
print("This may take several minutes...")

# Train and evaluate all models
results = predictor.train_and_evaluate_models(
    X_train, y_train, X_val, y_val, cv_folds=5
)

print(f"\n✅ Baseline training completed!")
print(f"Best baseline model: {predictor.best_model_name}")
print(f"Best CV score: {predictor.best_score:.4f}")

Initialized 10 models for training
🔥 Starting baseline model training...
This may take several minutes...


NameError: name 'X_train' is not defined

In [7]:
# Display detailed results
print("📊 BASELINE MODEL RESULTS:")
print("=" * 60)

results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Validation_Accuracy': [results[model]['accuracy'] for model in results],
    'CV_Mean': [results[model]['cv_mean'] for model in results],
    'CV_Std': [results[model]['cv_std'] for model in results]
}).sort_values('CV_Mean', ascending=False)

for _, row in results_df.iterrows():
    print(f"{row['Model']:20s} | Val Acc: {row['Validation_Accuracy']:.4f} | "
          f"CV: {row['CV_Mean']:.4f} ± {row['CV_Std']:.4f}")

# Visualize results
fig = px.bar(
    results_df, 
    x='CV_Mean', 
    y='Model',
    orientation='h',
    title='Baseline Model Performance (Cross-Validation Accuracy)',
    error_x='CV_Std',
    height=600
)
fig.update_layout(xaxis_title='Cross-Validation Accuracy', yaxis={'categoryorder': 'total ascending'})
fig.show()

📊 BASELINE MODEL RESULTS:


NameError: name 'results' is not defined

## 4. Hyperparameter Optimization

In [None]:
# Hyperparameter optimization for top 3 models
top_3_models = results_df.head(3)['Model'].tolist()

print(f"🎯 Starting hyperparameter optimization for top 3 models: {top_3_models}")

optimized_results = {}

# Only optimize models that support it
optimizable_models = ['Random Forest', 'XGBoost', 'CatBoost']

for model_name in top_3_models:
    if model_name in optimizable_models:
        print(f"\n🔧 Optimizing {model_name}...")
        
        try:
            optimized_model, best_params = predictor.optimize_hyperparameters(
                X_train, y_train, model_name=model_name, n_trials=30
            )
            
            # Evaluate optimized model
            y_val_pred = optimized_model.predict(X_val)
            val_accuracy = accuracy_score(y_val, y_val_pred)
            
            optimized_results[f"{model_name} (Optimized)"] = {
                'model': optimized_model,
                'params': best_params,
                'val_accuracy': val_accuracy,
                'cv_score': predictor.best_score if model_name in predictor.best_model_name else 0
            }
            
            print(f"✅ {model_name} optimization completed!")
            print(f"   Validation accuracy: {val_accuracy:.4f}")
            print(f"   Best parameters: {best_params}")
            
        except Exception as e:
            print(f"❌ Error optimizing {model_name}: {e}")
    else:
        print(f"⏩ Skipping {model_name} (optimization not implemented)")

print(f"\n🎉 Hyperparameter optimization completed!")

## 5. Ensemble Model Creation

In [None]:
# Create ensemble model
print("🤝 Creating ensemble model...")

try:
    ensemble_model = predictor.create_ensemble_model(X_train, y_train)
    
    # Evaluate ensemble on validation set
    y_val_pred_ensemble = ensemble_model.predict(X_val)
    ensemble_val_accuracy = accuracy_score(y_val, y_val_pred_ensemble)
    
    print(f"✅ Ensemble model created successfully!")
    print(f"   Validation accuracy: {ensemble_val_accuracy:.4f}")
    print(f"   CV score: {predictor.best_score:.4f}")
    
    # Add to optimized results
    optimized_results['Ensemble'] = {
        'model': ensemble_model,
        'val_accuracy': ensemble_val_accuracy,
        'cv_score': predictor.best_score
    }
    
except Exception as e:
    print(f"❌ Error creating ensemble: {e}")

## 6. Final Model Selection and Evaluation

In [None]:
# Compare all models (baseline + optimized + ensemble)
print("🏆 FINAL MODEL COMPARISON:")
print("=" * 60)

final_comparison = []

# Add baseline results
for model_name, result in results.items():
    final_comparison.append({
        'Model': model_name,
        'Type': 'Baseline',
        'CV_Score': result['cv_mean'],
        'Val_Accuracy': result['accuracy']
    })

# Add optimized results
for model_name, result in optimized_results.items():
    model_type = 'Ensemble' if model_name == 'Ensemble' else 'Optimized'
    final_comparison.append({
        'Model': model_name,
        'Type': model_type,
        'CV_Score': result.get('cv_score', 0),
        'Val_Accuracy': result['val_accuracy']
    })

comparison_df = pd.DataFrame(final_comparison).sort_values('Val_Accuracy', ascending=False)

print("Model Ranking by Validation Accuracy:")
for i, (_, row) in enumerate(comparison_df.iterrows(), 1):
    print(f"{i:2d}. {row['Model']:25s} ({row['Type']:9s}) | "
          f"Val: {row['Val_Accuracy']:.4f} | CV: {row['CV_Score']:.4f}")

# Select final model (best validation accuracy)
best_model_info = comparison_df.iloc[0]
final_model_name = best_model_info['Model']

# Get the actual model object
if best_model_info['Type'] == 'Baseline':
    final_model = results[final_model_name]['model']
else:
    final_model = optimized_results[final_model_name]['model']

print(f"\n🎯 FINAL MODEL SELECTED: {final_model_name}")
print(f"   Type: {best_model_info['Type']}")
print(f"   Validation Accuracy: {best_model_info['Val_Accuracy']:.4f}")
print(f"   Cross-Validation Score: {best_model_info['CV_Score']:.4f}")

In [None]:
# Comprehensive evaluation of final model
print(f"\n🔬 COMPREHENSIVE EVALUATION OF {final_model_name}")
print("=" * 60)

# Predictions on all sets
y_train_pred = final_model.predict(X_train)
y_val_pred = final_model.predict(X_val)
y_test_pred = final_model.predict(X_test_final)

# Calculate metrics for all sets
train_metrics = calculate_metrics(y_train, y_train_pred)
val_metrics = calculate_metrics(y_val, y_val_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

print("\n📈 PERFORMANCE METRICS:")
print(f"{'Metric':15s} | {'Train':8s} | {'Val':8s} | {'Test':8s}")
print("-" * 50)
for metric in ['accuracy', 'precision', 'recall', 'f1_score']:
    print(f"{metric.title():15s} | {train_metrics[metric]:8.4f} | "
          f"{val_metrics[metric]:8.4f} | {test_metrics[metric]:8.4f}")

# Check for overfitting
overfitting_score = train_metrics['accuracy'] - val_metrics['accuracy']
if overfitting_score > 0.05:
    print(f"\n⚠️  Warning: Potential overfitting detected (train-val gap: {overfitting_score:.4f})")
else:
    print(f"\n✅ Good generalization (train-val gap: {overfitting_score:.4f})")

# Detailed classification report
print_classification_report(y_test, y_test_pred, target_names=preprocessor.get_disease_names())

In [None]:
# Visualize performance comparison
metrics_comparison = pd.DataFrame({
    'Train': [train_metrics[m] for m in ['accuracy', 'precision', 'recall', 'f1_score']],
    'Validation': [val_metrics[m] for m in ['accuracy', 'precision', 'recall', 'f1_score']],
    'Test': [test_metrics[m] for m in ['accuracy', 'precision', 'recall', 'f1_score']]
}, index=['Accuracy', 'Precision', 'Recall', 'F1-Score'])

fig = px.bar(
    metrics_comparison.T, 
    title=f'Performance Metrics Comparison - {final_model_name}',
    height=500
)
fig.update_layout(
    xaxis_title='Dataset',
    yaxis_title='Score',
    legend_title='Metrics'
)
fig.show()

## 7. Feature Importance Analysis

In [None]:
# Get feature importance from final model
print("🔍 FEATURE IMPORTANCE ANALYSIS:")
print("=" * 50)

try:
    feature_importance = predictor.get_feature_importance(selected_feature_names)
    
    if feature_importance is not None:
        print("Top 20 most important features:")
        for i, (_, row) in enumerate(feature_importance.head(20).iterrows(), 1):
            print(f"{i:2d}. {row['feature']:25s}: {row['importance']:.4f}")
        
        # Visualize feature importance
        top_20_features = feature_importance.head(20)
        
        fig = px.bar(
            top_20_features,
            x='importance',
            y='feature',
            orientation='h',
            title='Top 20 Feature Importance',
            height=600
        )
        fig.update_layout(yaxis={'categoryorder': 'total ascending'})
        fig.show()
    else:
        print("Feature importance not available for this model type.")
        
except Exception as e:
    print(f"Error getting feature importance: {e}")

## 8. Error Analysis

In [None]:
# Analyze prediction errors
print("🔍 ERROR ANALYSIS:")
print("=" * 40)

# Find misclassified samples in test set
misclassified = y_test != y_test_pred
correct_classified = y_test == y_test_pred

print(f"Total test samples: {len(y_test)}")
print(f"Correctly classified: {correct_classified.sum()} ({(correct_classified.sum()/len(y_test)):.1%})")
print(f"Misclassified: {misclassified.sum()} ({(misclassified.sum()/len(y_test)):.1%})")

if misclassified.sum() > 0:
    print("\nMisclassified samples analysis:")
    
    misclassified_df = pd.DataFrame({
        'True_Disease': preprocessor.decode_predictions(y_test[misclassified]),
        'Predicted_Disease': preprocessor.decode_predictions(y_test_pred[misclassified])
    })
    
    print(f"Unique misclassification patterns: {len(misclassified_df)}")
    
    if len(misclassified_df) <= 10:
        for i, (_, row) in enumerate(misclassified_df.iterrows(), 1):
            print(f"{i}. True: {row['True_Disease'][:20]} → Predicted: {row['Predicted_Disease'][:20]}")
    else:
        print("First 5 misclassifications:")
        for i, (_, row) in enumerate(misclassified_df.head(5).iterrows(), 1):
            print(f"{i}. True: {row['True_Disease'][:20]} → Predicted: {row['Predicted_Disease'][:20]}")

In [None]:
# Confusion matrix visualization (for a subset of classes)
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Get confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
disease_names = preprocessor.get_disease_names()

print(f"\nConfusion Matrix Analysis:")
print(f"Matrix shape: {cm.shape}")
print(f"Perfect predictions (diagonal): {np.trace(cm)}")
print(f"Total predictions: {np.sum(cm)}")

# Since we have many classes, let's show a subset or summary
if len(disease_names) <= 15:
    # Show full confusion matrix if not too many classes
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[name[:10] for name in disease_names],
                yticklabels=[name[:10] for name in disease_names])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    # Show accuracy per class
    class_accuracies = []
    for i in range(len(disease_names)):
        if cm[i].sum() > 0:  # Avoid division by zero
            accuracy = cm[i, i] / cm[i].sum()
            class_accuracies.append((disease_names[i], accuracy, cm[i].sum()))
    
    class_accuracies.sort(key=lambda x: x[1])  # Sort by accuracy
    
    print("\nPer-class accuracy (worst performing classes):")
    for disease, acc, samples in class_accuracies[:10]:
        print(f"{disease[:25]:25s}: {acc:.3f} ({samples} samples)")
    
    print("\nPer-class accuracy (best performing classes):")
    for disease, acc, samples in class_accuracies[-10:]:
        print(f"{disease[:25]:25s}: {acc:.3f} ({samples} samples)")

## 9. Model Persistence and Final Artifacts

In [None]:
# Save the final model and all necessary artifacts
print("💾 SAVING MODEL ARTIFACTS:")
print("=" * 40)

# Update preprocessor with feature selection information
preprocessor.selected_features = selected_features
preprocessor.selected_feature_names = selected_feature_names
preprocessor.feature_selector = selector

# Save the best model with all necessary information
predictor.best_model = final_model
predictor.best_model_name = final_model_name
predictor.best_score = best_model_info['Val_Accuracy']

predictor.save_best_model('../models/best_model.pkl', preprocessor)

print("✅ Model saved successfully!")

# Save feature selection artifacts
joblib.dump(selector, '../models/feature_selector.pkl')
print("✅ Feature selector saved!")

# Save updated preprocessor
joblib.dump(preprocessor, '../models/preprocessor.pkl')
print("✅ Updated preprocessor saved!")

In [None]:
# Create model summary report
model_summary = {
    'model_info': {
        'name': final_model_name,
        'type': best_model_info['Type'],
        'training_date': datetime.now().isoformat(),
        'python_version': sys.version
    },
    'data_info': {
        'training_samples': len(train_data),
        'test_samples': len(test_data),
        'total_features': len(preprocessor.get_symptom_names()),
        'selected_features': len(selected_feature_names),
        'classes': len(preprocessor.get_disease_names())
    },
    'performance': {
        'test_accuracy': float(test_metrics['accuracy']),
        'test_precision': float(test_metrics['precision']),
        'test_recall': float(test_metrics['recall']),
        'test_f1_score': float(test_metrics['f1_score']),
        'validation_accuracy': float(val_metrics['accuracy']),
        'cross_validation_score': float(best_model_info['CV_Score'])
    },
    'model_diagnostics': {
        'overfitting_score': float(overfitting_score),
        'correctly_classified': int(correct_classified.sum()),
        'misclassified': int(misclassified.sum()),
        'error_rate': float(misclassified.sum() / len(y_test))
    }
}

# Add feature importance if available
if feature_importance is not None:
    model_summary['top_features'] = feature_importance.head(10).to_dict('records')

# Save model summary
with open('../models/model_summary.json', 'w') as f:
    json.dump(model_summary, f, indent=2)

print("✅ Model summary saved to '../models/model_summary.json'")

# Save test predictions for submission
test_predictions_df = pd.DataFrame({
    'prognosis': y_test_pred
})

test_predictions_df.to_csv('../predictions.csv', index=False)
print("✅ Test predictions saved to '../predictions.csv'")

## 10. Training Summary and Insights

In [None]:
# Final training summary
print("🎉 MODEL TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 60)

print(f"🏆 FINAL MODEL: {final_model_name}")
print(f"📊 TEST ACCURACY: {test_metrics['accuracy']:.4f} ({test_metrics['accuracy']:.1%})")
print(f"🎯 TEST F1-SCORE: {test_metrics['f1_score']:.4f}")
print(f"⚡ CROSS-VALIDATION: {best_model_info['CV_Score']:.4f}")

print(f"\n📈 MODEL STATISTICS:")
print(f"   • Training samples: {len(train_data):,}")
print(f"   • Features used: {len(selected_feature_names)} / {len(preprocessor.get_symptom_names())}")
print(f"   • Classes predicted: {len(preprocessor.get_disease_names())}")
print(f"   • Correct predictions: {correct_classified.sum()} / {len(y_test)}")

print(f"\n🔧 TECHNICAL DETAILS:")
print(f"   • Model type: {type(final_model).__name__}")
print(f"   • Feature selection: Chi-square (top {len(selected_feature_names)})")
print(f"   • Cross-validation: 5-fold stratified")
print(f"   • Overfitting check: {overfitting_score:.4f} ({'✅ Good' if overfitting_score < 0.05 else '⚠️ Monitor'})")

print(f"\n💡 KEY INSIGHTS:")
if test_metrics['accuracy'] > 0.95:
    print("   • Excellent model performance achieved!")
elif test_metrics['accuracy'] > 0.90:
    print("   • Very good model performance achieved!")
elif test_metrics['accuracy'] > 0.85:
    print("   • Good model performance achieved!")
else:
    print("   • Model performance could be improved further.")

if len(selected_feature_names) < len(preprocessor.get_symptom_names()) * 0.8:
    print("   • Feature selection successfully reduced dimensionality")

if overfitting_score < 0.02:
    print("   • Model shows excellent generalization")

print(f"\n🚀 READY FOR DEPLOYMENT!")
print("   All model artifacts have been saved and the model is ready")
print("   to be integrated into the Streamlit application.")

print(f"\n📁 GENERATED FILES:")
print("   • ../models/best_model.pkl (trained model + metadata)")
print("   • ../models/preprocessor.pkl (updated with feature selection)")
print("   • ../models/feature_selector.pkl (feature selection object)")
print("   • ../models/model_summary.json (comprehensive model report)")
print("   • ../predictions.csv (test predictions for submission)")

print(f"\n⏱️  Training completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")