In [1]:
# Import libraries
import os
import warnings
from functools import partial
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')

# 1. IMPROVED DATA LOADING AND VALIDATION
print("=== Enhanced Disease Prediction Model ===\n")

# Directory that holds Training.csv and Testing.csv. Set the DISEASE_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original hard-coded location is used.
DATA_DIR = Path(os.environ.get("DISEASE_DATA_DIR", r"C:\Users\mysel\Downloads\archive"))

# Load datasets
df_train = pd.read_csv(DATA_DIR / "Training.csv")
df_test = pd.read_csv(DATA_DIR / "Testing.csv")

print(f"Training data: {df_train.shape}")
print(f"Test data: {df_test.shape}")

# Every later step reads the 'prognosis' label column, so fail here with a
# clear message instead of a bare KeyError further down.
for frame_label, frame in (("training", df_train), ("test", df_test)):
    if 'prognosis' not in frame.columns:
        raise ValueError(f"The {frame_label} data has no 'prognosis' column")

# Remove unnecessary column. errors='ignore' makes this a no-op for a frame
# that does not have it, so both frames get the same treatment.
df_train = df_train.drop(columns=['Unnamed: 133'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 133'], errors='ignore')

# 2. DATA QUALITY CHECKS
print("\n=== Data Quality Analysis ===")

# Number of training rows per diagnosis label
disease_counts = df_train['prognosis'].value_counts()
print("\nDisease distribution in training data:")
print(disease_counts)

# Report every label that is backed by fewer than 10 training rows
rare_diseases = disease_counts.loc[disease_counts.lt(10)]
if not rare_diseases.empty:
    print(f"\nRare diseases (<10 samples): {rare_diseases.size}")
    print(rare_diseases)

# 3. ENHANCED FEATURE ENGINEERING
print("\n=== Feature Engineering ===")

# Column totals of every non-label column, largest first. The totals are read
# as "number of rows showing the symptom" — presumably the columns are 0/1
# indicators; verify against the CSV.
symptom_columns = df_train.drop(columns='prognosis')
symptom_frequency = symptom_columns.sum().sort_values(ascending=False)
print("\nMost common symptoms:")
print(symptom_frequency.head(10))

# Cut-offs expressed as a share of the training rows: columns whose total is
# above 80% or below 1% of the row count are treated as noise.
n_training_rows = len(df_train)
common_threshold = n_training_rows * 0.8
rare_threshold = n_training_rows * 0.01

too_common = symptom_frequency.loc[symptom_frequency.gt(common_threshold)].index
too_rare = symptom_frequency.loc[symptom_frequency.lt(rare_threshold)].index

print(f"\nRemoving {len(too_common)} too common symptoms")
print(f"Removing {len(too_rare)} too rare symptoms")

# The three named symptoms are always dropped, whatever their frequency.
columns_to_drop = [
    *too_common,
    *too_rare,
    'yellow_crust_ooze',
    'red_sore_around_nose',
    'blister',
]
# The same list is applied to both frames; names a frame lacks are skipped.
df_train_clean = df_train.drop(columns=columns_to_drop, errors='ignore')
df_test_clean = df_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Features after cleaning: {df_train_clean.shape[1] - 1}")

# 4. PREPARE DATA WITH BETTER VALIDATION
# Split each cleaned frame into its feature matrix and its label column
X_train = df_train_clean.drop(columns='prognosis')
y_train = df_train_clean['prognosis']
X_test = df_test_clean.drop(columns='prognosis')
y_test = df_test_clean['prognosis']

# Encode labels: the encoder learns its classes from the training labels only
# and is then applied unchanged to both label columns.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

print(f"\nNumber of classes: {le.classes_.size}")
print(f"Final feature count: {len(X_train.columns)}")

# 5. IMPROVED FEATURE SELECTION
print("\n=== Enhanced Feature Selection ===")

# Use fewer but more meaningful features
k_features = min(50, X_train.shape[1])  # Reduced from 80

# With its defaults, mutual_info_classif treats a dense matrix as continuous
# and adds small random noise drawn from an unseeded generator, so the chosen
# feature set can differ from run to run. Fixing random_state makes the
# selection reproducible; discrete_features=True selects the estimator meant
# for categorical columns.
# NOTE(review): discrete_features=True assumes every symptom column is a 0/1
# indicator — confirm against the CSV.
# functools.partial (rather than a lambda) keeps the fitted selector picklable.
mi_score_func = partial(mutual_info_classif, discrete_features=True, random_state=42)

selector = SelectKBest(mi_score_func, k=k_features)
X_train_selected = selector.fit_transform(X_train, y_train_encoded)
X_test_selected = selector.transform(X_test)

# Column names kept by the selector, in their original column order
selected_features = X_train.columns[selector.get_support()]
print(f"Selected {len(selected_features)} features")

# 6. ENHANCED MODEL TRAINING WITH BETTER PARAMETERS
print("\n=== Improved Model Training ===")

# 'balanced' per-class weights, computed once from the full set of encoded
# training labels.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded,
)
# Keyed by position in the sorted unique-label array.
class_weight_dict = {position: weight for position, weight in enumerate(class_weights)}

# Candidate classifiers, registered in the order they will be evaluated.
models = {}

# Tree-based models receive the precomputed weight dictionary.
models['Random Forest'] = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    class_weight=class_weight_dict,
    random_state=42,
)
models['Gradient Boosting'] = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=42,
)
# probability=True so the SVM exposes predict_proba for confidence scores.
models['SVM'] = SVC(
    class_weight='balanced',
    probability=True,
    random_state=42,
)
# C=0.1 applies stronger regularization than the default.
models['Logistic Regression'] = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    C=0.1,
)
models['K-Nearest Neighbors'] = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
)
models['Decision Tree'] = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=10,
    class_weight=class_weight_dict,
    random_state=42,
)

# 7. COMPREHENSIVE MODEL EVALUATION
print("\n=== Model Evaluation ===\n")

results = {}            # model name -> summary metrics (feeds the comparison table)
detailed_reports = {}   # model name -> per-class classification_report dict

# Pass every encoded label explicitly so the per-class report always has one
# row per class, even for classes absent from the test labels and predictions.
all_labels = np.arange(len(le.classes_))

for name, model in models.items():
    print(f"Training {name}...")

    # Cross-validation for better evaluation.
    # NOTE(review): the feature selector was fit on the full training set
    # before this loop, so the CV folds share information through the selected
    # columns and these scores may be optimistic. Wrapping selector + model in
    # a Pipeline would remove that.
    cv_scores = cross_val_score(model, X_train_selected, y_train_encoded, cv=5)

    # Train model on the full training set
    model.fit(X_train_selected, y_train_encoded)

    # Predictions on the held-out test set
    y_pred = model.predict(X_test_selected)

    # Calculate metrics (weighted by class support; undefined ratios count as 0)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    precision = precision_score(y_test_encoded, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test_encoded, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test_encoded, y_pred, average='weighted', zero_division=0)

    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    # Per-class breakdown, keyed by disease name
    detailed_reports[name] = classification_report(
        y_test_encoded,
        y_pred,
        labels=all_labels,
        target_names=le.classes_,
        zero_division=0,
        output_dict=True,
    )

    print(f"{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print()

# 8. MODEL COMPARISON
comparison_rule = "=" * 60
print(comparison_rule)
print("FINAL MODEL COMPARISON")
print(comparison_rule)

# One row per model, one column per metric
comparison_df = pd.DataFrame(results).transpose()
print("\n" + comparison_df.round(4).to_string())

# The winner is the model with the highest 'f1' value in the table; on a tie
# idxmax keeps the first such row.
f1_by_model = comparison_df['f1']
best_model_name = f1_by_model.idxmax()
best_model = models[best_model_name]
print(f"\nüéØ BEST MODEL: {best_model_name}")

# 9. PREDICTION VALIDATION SYSTEM
print("\n=== Prediction Validation ===")

def validate_prediction(model, symptoms, feature_names, threshold=0.6, label_encoder=None):
    """
    Predict a disease for one symptom vector and gate the answer on confidence.

    Args:
        model: Fitted classifier. It must expose ``classes_`` and, to produce a
            prediction, ``predict_proba``.
        symptoms: Sequence of feature values for a single sample, aligned
            position by position with ``feature_names``.
        feature_names: Sequence of feature names, same length as ``symptoms``.
        threshold: Minimum top-class probability for the prediction to be
            returned as a disease name.
        label_encoder: Object whose ``inverse_transform`` maps encoded class
            labels back to disease names. Defaults to the module-level ``le``.

    Returns:
        A ``(result, confidence)`` tuple. ``result`` is the predicted disease
        name when the top probability reaches ``threshold``; otherwise it is an
        explanatory message. ``confidence`` is the top probability, or ``0.0``
        when the model has no ``predict_proba``.

    Raises:
        ValueError: If ``symptoms`` and ``feature_names`` differ in length.
    """
    if not hasattr(model, 'predict_proba'):
        return "Model does not support probability predictions", 0.0

    if len(symptoms) != len(feature_names):
        raise ValueError(
            f"Expected {len(feature_names)} symptom values, got {len(symptoms)}"
        )

    encoder = le if label_encoder is None else label_encoder

    probabilities = model.predict_proba([symptoms])[0]
    max_prob = probabilities.max()

    # Columns of predict_proba follow model.classes_, so a column index must
    # be mapped through classes_ before it is decoded to a disease name.
    predicted_class = model.classes_[probabilities.argmax()]
    predicted_disease = encoder.inverse_transform([predicted_class])[0]

    # Get top 3 predictions. The stable sort on negated probabilities breaks
    # ties by lowest column index, matching what argmax() picked above.
    top_3_indices = np.argsort(-probabilities, kind='stable')[:3]
    top_3_names = encoder.inverse_transform(model.classes_[top_3_indices])
    top_predictions = list(zip(top_3_names, probabilities[top_3_indices]))

    print(f"\nüîç Prediction Analysis:")
    print(f"Top prediction: {predicted_disease} (confidence: {max_prob:.2%})")

    if max_prob < threshold:
        print(f"‚ö†Ô∏è  LOW CONFIDENCE: Below threshold ({threshold:.0%})")
        print("Considered predictions:")
        for disease, conf in top_predictions:
            print(f"  - {disease}: {conf:.2%}")
        return "Low confidence - consult healthcare professional", max_prob

    # Echo which symptoms were switched on so the reader can sanity-check the
    # prediction against them.
    active_symptoms = [name for name, val in zip(feature_names, symptoms) if val == 1]
    print(f"Active symptoms: {', '.join(active_symptoms)}")

    return predicted_disease, max_prob

# 10. TEST WITH SAMPLE SYMPTOMS
print("\n=== Testing Sample Predictions ===")

# Hand-written scenarios: a display name plus the symptoms to switch on
test_cases = [
    {'name': 'Common Cold', 'symptoms': ['runny_nose', 'sneezing', 'chills']},
    {'name': 'UTI Test', 'symptoms': ['burning_micturition', 'bladder_discomfort']},
    {'name': 'Fever Test', 'symptoms': ['high_fever']},
]

# Positional lookup for the selected feature names
feature_order = list(selected_features)

for test_case in test_cases:
    print(f"\nüß™ Testing: {test_case['name']}")

    # Start from an all-zero vector and flip on each recognised symptom
    symptom_vector = [0] * len(selected_features)
    active_count = 0

    for symptom in test_case['symptoms']:
        if symptom not in selected_features:
            print(f"  Note: Symptom '{symptom}' not in selected features")
            continue
        symptom_vector[feature_order.index(symptom)] = 1
        active_count += 1

    # Nothing recognised: there is no input worth sending to the model
    if active_count == 0:
        print("‚ùå No valid symptoms for prediction")
        continue

    prediction, confidence = validate_prediction(
        best_model, symptom_vector, selected_features, threshold=0.7
    )
    print(f"‚úÖ Final prediction: {prediction} (Confidence: {confidence:.2%})")

# 11. FEATURE IMPORTANCE ANALYSIS
print("\n=== Feature Importance Analysis ===")

# Only models that expose feature_importances_ can be analysed here
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': selected_features, 'importance': importances}
    ).sort_values('importance', ascending=False)

    print("\nTop 10 most important features:")
    print(feature_importance_df.head(10))

    # Share of the total importance held by the top-ranked feature; warn when
    # it exceeds 30%.
    ranked_importance = feature_importance_df['importance']
    top_feature_ratio = ranked_importance.iloc[0] / ranked_importance.sum()
    if top_feature_ratio > 0.3:
        print(f"‚ö†Ô∏è  Warning: Top feature has {top_feature_ratio:.1%} of total importance")

# Closing banner and disclaimer
closing_rule = "=" * 60
print("\n" + closing_rule)
print("MODEL READY FOR USE")
print(closing_rule)
for summary_line in (
    "‚úÖ Enhanced with:",
    "   - Better feature selection",
    "   - Confidence thresholding",
    "   - Multiple metric evaluation",
    "   - Prediction validation",
    "   - Cross-validation",
    "\n‚ö†Ô∏è  REMEMBER: This is for educational purposes only!",
    "   Always consult healthcare professionals for medical advice.",
):
    print(summary_line)

=== Enhanced Disease Prediction Model ===

Training data: (4920, 134)
Test data: (42, 133)

=== Data Quality Analysis ===

Disease distribution in training data:
prognosis
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection      

In [2]:
# 12. SELECT AND SAVE THE BEST MODEL
print("\n" + "="*60)
print("MODEL SELECTION AND DEPLOYMENT")
print("="*60)

# Option 1: Auto-select best model based on F1-score
best_model_name = comparison_df['f1'].idxmax()
best_model = models[best_model_name]

print(f"ü§ñ AUTO-SELECTED BEST MODEL: {best_model_name}")
# Single .loc[row, column] lookup instead of chained indexing
print(f"   F1-Score: {comparison_df.loc[best_model_name, 'f1']:.4f}")
print(f"   Accuracy: {comparison_df.loc[best_model_name, 'accuracy']:.4f}")

# 13. CREATE DEPLOYMENT PACKAGE
print("\n=== Creating Deployment Package ===")

MODEL_PATH = 'disease_prediction_model.pkl'
CONFIDENCE_THRESHOLD = 0.7  # Default confidence threshold stored with the model

# Everything a consumer needs to go from raw symptom columns to a disease
# name: the fitted model, the fitted selector, the label encoder, and the
# column names before and after selection.
deployment_package = {
    'model': best_model,
    'feature_selector': selector,
    'label_encoder': le,
    'selected_features': list(selected_features),
    'feature_names': list(X_train.columns),
    'model_name': best_model_name,
    'performance_metrics': comparison_df.loc[best_model_name].to_dict(),
    'confidence_threshold': CONFIDENCE_THRESHOLD
}

# Save the deployment package
joblib.dump(deployment_package, MODEL_PATH)
print(f"‚úÖ Model package saved as '{MODEL_PATH}'")

# 14. VERIFY THE SAVED MODEL
print("\n=== Verifying Saved Model ===")

# Load and test the saved model.
# SECURITY: joblib files are pickle-based, and loading one can execute
# arbitrary code. Only load packages this notebook (or a trusted source) wrote.
loaded_package = joblib.load(MODEL_PATH)
loaded_model = loaded_package['model']
loaded_features = loaded_package['selected_features']

print(f"‚úÖ Model loaded: {loaded_package['model_name']}")
print(f"‚úÖ Features: {len(loaded_features)}")
print(f"‚úÖ Classes: {len(loaded_package['label_encoder'].classes_)}")

# Printing metadata does not prove the round trip worked: check that the
# reloaded model reproduces the in-memory model's test-set predictions.
roundtrip_ok = np.array_equal(
    loaded_model.predict(X_test_selected),
    best_model.predict(X_test_selected),
)
assert roundtrip_ok, "Reloaded model predictions differ from the in-memory model"
print("Round-trip check passed: reloaded model reproduces the in-memory predictions")


MODEL SELECTION AND DEPLOYMENT
ü§ñ AUTO-SELECTED BEST MODEL: Random Forest
   F1-Score: 1.0000
   Accuracy: 1.0000

=== Creating Deployment Package ===
‚úÖ Model package saved as 'disease_prediction_model.pkl'

=== Verifying Saved Model ===
‚úÖ Model loaded: Random Forest
‚úÖ Features: 50
‚úÖ Classes: 41
