In [1]:
import pandas as pd
import numpy as np
import joblib
import pickle
from collections import Counter
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

Data Loading and Preprocessing

In [2]:
def load_and_analyze_data(file_path):
    """Read the dataset CSV and print a short data-quality summary.

    The summary reports the frame's shape, the number of distinct values in
    the ``Disease`` column, and the number of fully duplicated rows.

    Args:
        file_path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The loaded ``pandas.DataFrame``; it is not modified here.

    Raises:
        KeyError: If the file has no ``Disease`` column.
    """
    frame = pd.read_csv(file_path)

    # Each statistic is computed right where it is printed so that a missing
    # column fails at the same point in the output as it always did.
    print("=== DATA QUALITY CHECK ===")
    print("Dataset shape: {}".format(frame.shape))
    print("Unique diseases: {}".format(frame['Disease'].nunique()))
    print("Duplicate rows: {}".format(frame.duplicated().sum()))

    return frame

def create_robust_features(df):
    """Turn the disease/symptom table into a binary feature matrix.

    Every column other than ``Disease`` is treated as a symptom column. The
    symptom columns are selected by name rather than by position, so the
    result does not depend on ``Disease`` being the first column.

    Args:
        df: DataFrame with a ``Disease`` column and one or more symptom
            columns whose cells hold a symptom name or a missing value.

    Returns:
        A 4-tuple ``(features, labels, all_symptoms, symptom_patterns)``:

        * ``features`` -- integer array of shape ``(n_rows, n_symptoms)``;
          entry ``[i, j]`` is 1 when row ``i`` lists ``all_symptoms[j]``.
        * ``labels`` -- array of the ``Disease`` value of each row.
        * ``all_symptoms`` -- sorted list of every distinct non-missing
          symptom value (values are used as-is; no whitespace stripping).
        * ``symptom_patterns`` -- per row, the row's symptoms sorted and
          joined with ``'-'``.

    Raises:
        KeyError: If ``df`` has no ``Disease`` column.
    """
    diseases = df['Disease']

    # Boolean mask instead of ``df.columns[1:]``: keeps working when the
    # label column is not in position 0.
    symptom_frame = df.loc[:, df.columns != 'Disease']

    # Single pass over the rows: remember each row's symptoms and grow the
    # vocabulary at the same time.
    per_row_symptoms = []
    vocabulary = set()
    for values in symptom_frame.itertuples(index=False, name=None):
        present = [value for value in values if pd.notna(value)]
        per_row_symptoms.append(present)
        vocabulary.update(present)

    all_symptoms = sorted(vocabulary)
    column_of = {symptom: idx for idx, symptom in enumerate(all_symptoms)}

    # Fill the matrix through an index lookup rather than scanning the row's
    # symptom list once per vocabulary entry.
    features = np.zeros((len(per_row_symptoms), len(all_symptoms)), dtype=int)
    symptom_patterns = []
    for row_idx, present in enumerate(per_row_symptoms):
        for symptom in present:
            features[row_idx, column_of[symptom]] = 1
        symptom_patterns.append('-'.join(sorted(present)))

    labels = np.array(diseases.tolist())

    return features, labels, all_symptoms, symptom_patterns

Training AdaBoost Model

In [3]:
def train_and_evaluate_model(X, y):
    """Fit a one-vs-rest AdaBoost classifier and print evaluation metrics.

    Steps, in order: a stratified 70/30 train/test split, label encoding,
    fitting on the training part, hold-out accuracy, 5-fold cross-validation
    over the full ``X``, and a per-class classification report for the
    hold-out part.

    Args:
        X: Feature matrix, one row per sample.
        y: Class label for each row of ``X``.

    Returns:
        ``(ada_classifier, label_encoder)`` -- the classifier that was fitted
        on the training split, and the ``LabelEncoder`` fitted on all of ``y``.
    """
    # Stratified hold-out split with a fixed seed.
    split_parts = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    X_train, X_test, y_train, y_test = split_parts

    # The encoder is fitted on every label, then applied to each split.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_train_encoded, y_test_encoded = (
        label_encoder.transform(part) for part in (y_train, y_test)
    )

    # Shallow trees as weak learners (depth and ensemble size kept small to
    # limit overfitting), boosted, then wrapped one-vs-rest.
    weak_learner = DecisionTreeClassifier(
        max_depth=3,
        min_samples_split=5,
        min_samples_leaf=2,
    )
    boosted = AdaBoostClassifier(
        estimator=weak_learner,
        n_estimators=30,
        learning_rate=0.8,
        random_state=42,
    )
    ada_classifier = OneVsRestClassifier(boosted)

    # Hold-out evaluation.
    print("\n=== MODEL TRAINING ===")
    ada_classifier.fit(X_train, y_train_encoded)
    y_pred = ada_classifier.predict(X_test)

    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")

    # Cross-validation over the complete data set.
    print("\n=== CROSS-VALIDATION ===")
    cv_scores = cross_val_score(ada_classifier, X, y_encoded, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Per-class precision / recall / F1 on the hold-out part.
    print("\n=== DETAILED ANALYSIS ===")
    report = classification_report(
        y_test_encoded, y_pred, target_names=label_encoder.classes_
    )
    print(report)

    return ada_classifier, label_encoder

def save_model_components(model, label_encoder, symptom_names):
    """Write the model, label encoder and symptom list to the working directory.

    Args:
        model: Trained classifier; dumped with joblib to
            ``disease_predictor_model.pkl``.
        label_encoder: Fitted label encoder; dumped with joblib to
            ``label_encoder.pkl``.
        symptom_names: Symptom names in feature order; pickled to
            ``symptom_names.pkl``.
    """
    joblib_artifacts = (
        (model, 'disease_predictor_model.pkl'),
        (label_encoder, 'label_encoder.pkl'),
    )
    for artifact, target in joblib_artifacts:
        joblib.dump(artifact, target)

    # The symptom list goes through plain pickle rather than joblib.
    with open('symptom_names.pkl', 'wb') as handle:
        pickle.dump(symptom_names, handle)

    print("Model and components saved successfully!")


Prediction Systems

In [14]:
class WeightedDiseasePredictor:
    """Predict a disease from free-text symptom names using a saved model.

    Attributes:
        model: Classifier loaded with joblib; must provide ``predict_proba``.
        label_encoder: Encoder loaded with joblib; its ``inverse_transform``
            turns class indices into disease names.
        symptom_names: Symptom names (whitespace-stripped) in the order of
            the model's feature columns.
        symptom_weights: Mapping symptom -> weight; currently every weight
            is 1.0 and the mapping is not used by the prediction.
    """

    def __init__(self, model_path, encoder_path, symptoms_path):
        """Load the model, the label encoder and the symptom list from disk.

        Args:
            model_path: joblib file holding the classifier.
            encoder_path: joblib file holding the label encoder.
            symptoms_path: pickle file holding the list of symptom names.
        """
        # NOTE(security): joblib.load and pickle.load can execute arbitrary
        # code contained in the file. Only load artifacts you created or trust.
        self.model = joblib.load(model_path)
        self.label_encoder = joblib.load(encoder_path)
        with open(symptoms_path, 'rb') as f:
            # Stripping keeps the order (and therefore the column alignment)
            # intact while making the names easier to match against.
            self.symptom_names = [s.strip() for s in pickle.load(f)]

        # Create symptom weights based on specificity
        self.symptom_weights = self._calculate_symptom_weights()
        print(f"✅ Weighted predictor loaded with {len(self.symptom_names)} symptoms")

    def _calculate_symptom_weights(self):
        """Return a weight per symptom; placeholder that weights all equally."""
        return {symptom: 1.0 for symptom in self.symptom_names}

    def find_best_symptom_match(self, input_symptom):
        """Map a user-typed symptom onto one of ``self.symptom_names``.

        Matching strategies are tried in order; within a strategy the first
        symptom in ``self.symptom_names`` that satisfies it wins:

        1. exact match after removing spaces around underscores,
        2. the normalized input is a substring of the symptom,
        3. exact match with spaces converted to underscores,
        4. exact match with underscores converted to spaces,
        5. any non-empty ``_``-separated term of the input is a substring.

        Args:
            input_symptom: Symptom text as typed by the user.

        Returns:
            The matching entry of ``self.symptom_names``, or ``None`` when the
            input is blank or nothing matches.
        """
        input_symptom = input_symptom.strip().lower()

        # A blank input would otherwise "match" every symptom, because the
        # empty string is a substring of any string.
        if not input_symptom:
            return None

        # Normalize the input - remove extra spaces around underscores
        normalized_input = input_symptom.replace(' _', '_').replace('_ ', '_')
        underscored_input = input_symptom.replace(' ', '_')
        spaced_input = input_symptom.replace('_', ' ')

        # Drop empty terms (from leading/trailing/double underscores) for the
        # same reason: '' is contained in every symptom name.
        terms = [term for term in normalized_input.split('_') if term.strip()]

        strategies = [
            lambda low: low.strip() == normalized_input,
            lambda low: normalized_input in low,
            lambda low: low.strip() == underscored_input,
            lambda low: low.strip() == spaced_input,
            lambda low: any(term in low for term in terms),
        ]

        # Lower-case every candidate once instead of once per strategy.
        candidates = [(name, name.lower()) for name in self.symptom_names]

        for strategy in strategies:
            for name, lowered in candidates:
                if strategy(lowered):
                    return name

        return None

    def predict_with_confidence_boost(self, input_symptoms):
        """Predict the most likely disease for a list of symptom strings.

        The model's class probabilities are multiplied by a factor of
        ``min(2.0, 1.0 + matched / 10)`` and capped at 1.0. The boosted
        values are therefore scores, not a normalized distribution.

        Args:
            input_symptoms: Iterable of symptom strings typed by the user.

        Returns:
            dict with the keys ``primary_prediction`` (``None`` only if the
            model returned no classes), ``confidence``, ``confidence_level``
            (``"HIGH"``, ``"MEDIUM"`` or ``"LOW"``), ``all_predictions`` (top
            five ``(disease, score)`` pairs), ``symptoms_matched``,
            ``confidence_gap``, ``mapped_symptoms`` and ``mapping_details``.
        """
        # Map symptoms
        mapped_symptoms = []
        mapping_details = []

        for symptom in input_symptoms:
            matched = self.find_best_symptom_match(symptom)
            if matched:
                mapped_symptoms.append(matched)
                mapping_details.append(f"'{symptom}' → '{matched}'")
            else:
                mapping_details.append(f"'{symptom}' → NOT FOUND")

        print(f"📋 Symptom mapping: {mapping_details}")

        # Create feature vector (set lookup instead of scanning the list for
        # every symptom name)
        matched_set = set(mapped_symptoms)
        feature_vector = np.array(
            [1.0 if name in matched_set else 0.0 for name in self.symptom_names]
        )

        # Get raw probabilities
        raw_probabilities = np.asarray(
            self.model.predict_proba([feature_vector])[0], dtype=float
        )

        # Apply confidence boost for cases with good symptom matches
        symptom_count = len(mapped_symptoms)
        confidence_boost = min(2.0, 1.0 + (symptom_count / 10))
        boosted_probs = np.minimum(raw_probabilities * confidence_boost, 1.0)  # Cap at 100%

        # Decode all class indices in one call. Column i of predict_proba is
        # taken to be encoded label i, as in the per-index lookup it replaces.
        if len(boosted_probs):
            diseases = self.label_encoder.inverse_transform(
                np.arange(len(boosted_probs))
            )
            all_predictions = list(zip(diseases, boosted_probs))
        else:
            all_predictions = []

        # list.sort is stable, so ties keep class-index order.
        all_predictions.sort(key=lambda x: x[1], reverse=True)

        # Calculate confidence metrics
        top_prob = all_predictions[0][1] if all_predictions else 0
        second_prob = all_predictions[1][1] if len(all_predictions) > 1 else 0
        confidence_gap = top_prob - second_prob

        # Enhanced confidence levels
        if top_prob > 0.3 and confidence_gap > 0.1:
            confidence_level = "HIGH"
        elif top_prob > 0.2 and confidence_gap > 0.05:
            confidence_level = "MEDIUM"
        else:
            confidence_level = "LOW"

        return {
            'primary_prediction': all_predictions[0][0] if all_predictions else None,
            'confidence': top_prob,
            'confidence_level': confidence_level,
            'all_predictions': all_predictions[:5],
            'symptoms_matched': len(mapped_symptoms),
            'confidence_gap': confidence_gap,
            'mapped_symptoms': mapped_symptoms,
            'mapping_details': mapping_details
        }


Testing System

In [19]:
def test_prediction_system(predictor, test_cases):
    """Test the prediction system with sample symptoms"""
    print("=" * 70)
    print("PREDICTION SYSTEM TESTING")
    print("=" * 70)
    
    for symptoms in test_cases:
        result = predictor.predict_with_confidence_boost(symptoms)
        
        print(f"\nInput Symptoms: {', '.join(symptoms)}")
        print(f"   Mapped Symptoms: {', '.join(result['mapped_symptoms'])}")
        print(f"   Prediction: {result['primary_prediction']}")
        print(f"   Confidence: {result['confidence']:.2%} ({result['confidence_level']})")
        print(f"   Symptoms Matched: {result['symptoms_matched']}/{len(symptoms)}")
        
        print(f"\n   üè• Top Predictions:")
        for i, (disease, prob) in enumerate(result['all_predictions'], 1):
            print(f"      {i}. {disease}: {prob:.2%}")
        
        print("-" * 60)

def interactive_final_system():
    """Prompt for comma-separated symptoms in a loop and print a prediction.

    Loads the predictor from ``disease_predictor_model.pkl``,
    ``label_encoder.pkl`` and ``symptom_names.pkl`` in the working directory.
    The loop ends when the user types ``quit`` or when standard input is
    closed.
    """
    predictor = WeightedDiseasePredictor(
        'disease_predictor_model.pkl',
        'label_encoder.pkl',
        'symptom_names.pkl'
    )

    print("🎓 DISEASE PREDICTION SYSTEM")
    print("=" * 50)

    while True:
        print("\nEnter symptoms (comma-separated)")
        print("Examples: itching, skin_rash, watering_from_eyes")
        print("Type 'quit' to exit")
        print("-" * 40)

        try:
            user_input = input("Symptoms: ").strip()
        except EOFError:
            # stdin was closed: leave the loop instead of raising.
            break

        if user_input.lower() == 'quit':
            break

        # Convert spaces to underscores automatically, and discard empty
        # fragments produced by blank input or stray commas ("fever, ,cough,")
        # so they are never sent to the matcher as symptoms.
        symptoms = [s.strip().replace(' ', '_') for s in user_input.split(',')]
        symptoms = [s for s in symptoms if s]

        if not symptoms:
            print("❌ No symptoms entered. Please type at least one symptom.")
            continue

        print(f"Analyzing: {', '.join(symptoms)}")
        result = predictor.predict_with_confidence_boost(symptoms)

        if result and result['symptoms_matched'] > 0:
            print(f"\nPrediction: {result['primary_prediction']}")
            print(f"Confidence: {result['confidence']:.2%}")
            print(f"Matched: {result['symptoms_matched']} symptoms")
        else:
            print("❌ No symptoms matched. Try using underscores (e.g., watering_from_eyes)")

        print("\n⚠️  Educational use only - consult doctors for medical advice.")


In [20]:
if __name__ == "__main__":
    import os

    # The dataset location can be overridden with the DISEASE_DATASET_PATH
    # environment variable; the original local path remains the default so
    # existing runs behave as before.
    dataset_path = os.environ.get(
        "DISEASE_DATASET_PATH",
        "C:/Users/kendr/Downloads/archive/dataset.csv",
    )

    # Load and prepare data
    df = load_and_analyze_data(dataset_path)

    # Create features
    X, y, symptom_names, symptom_patterns = create_robust_features(df)

    # Analyze symptom patterns
    pattern_counts = Counter(symptom_patterns)
    print(f"\nUnique symptom patterns: {len(pattern_counts)}")
    print(f"Total samples: {len(X)}")

    # Train model
    ada_classifier, label_encoder = train_and_evaluate_model(X, y)

    # Save model components
    save_model_components(ada_classifier, label_encoder, symptom_names)

    # Test the prediction system by reloading the artifacts just written
    test_cases = [
        ['itching', 'skin_rash', 'nodal_skin_eruptions', 'dischromic_patches'],
        ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes'],
        ['stomach_pain', 'acidity', 'ulcers_on_tongue', 'vomiting', 'cough']
    ]

    predictor = WeightedDiseasePredictor(
        'disease_predictor_model.pkl',
        'label_encoder.pkl',
        'symptom_names.pkl'
    )

    test_prediction_system(predictor, test_cases)
    

=== DATA QUALITY CHECK ===
Dataset shape: (4920, 18)
Unique diseases: 41
Duplicate rows: 4616

Unique symptom patterns: 304
Total samples: 4920

=== MODEL TRAINING ===
Test Accuracy: 0.9966

=== CROSS-VALIDATION ===
Cross-validation scores: [1.         0.99695122 1.         1.         1.        ]
Mean CV accuracy: 0.9994 (+/- 0.0024)

=== DETAILED ANALYSIS ===
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        36
                                   AIDS       1.00      1.00      1.00        36
                                   Acne       1.00      1.00      1.00        36
                    Alcoholic hepatitis       1.00      0.86      0.93        36
                                Allergy       1.00      1.00      1.00        36
                              Arthritis       1.00      1.00      1.00        36
                       Bronchial Asthma       1.00      1.00      1.0

In [21]:
# Start the interactive prompt; it keeps asking for symptoms until 'quit' is typed.
interactive_final_system()

✅ Weighted predictor loaded with 131 symptoms
🎓 DISEASE PREDICTION SYSTEM

Enter symptoms (comma-separated)
Examples: itching, skin_rash, watering_from_eyes
Type 'quit' to exit
----------------------------------------


Symptoms:  fever, joint pain, skin rash, muscle pain


Analyzing: fever, joint_pain, skin_rash, muscle_pain
📋 Symptom mapping: ["'fever' → 'high_fever'", "'joint_pain' → 'joint_pain'", "'skin_rash' → 'skin_rash'", "'muscle_pain' → 'muscle_pain'"]

Prediction: Hepatitis D
Confidence: 4.44%
Matched: 4 symptoms

⚠️  Educational use only - consult doctors for medical advice.

Enter symptoms (comma-separated)
Examples: itching, skin_rash, watering_from_eyes
Type 'quit' to exit
----------------------------------------


Symptoms:  toxic look, constipation


Analyzing: toxic_look, constipation
📋 Symptom mapping: ["'toxic_look' → 'toxic_look_(typhos)'", "'constipation' → 'constipation'"]

Prediction: Typhoid
Confidence: 18.64%
Matched: 2 symptoms

⚠️  Educational use only - consult doctors for medical advice.

Enter symptoms (comma-separated)
Examples: itching, skin_rash, watering_from_eyes
Type 'quit' to exit
----------------------------------------


Symptoms:  chills, vomiting, fatigue, cough, high_fever, breathlessness


Analyzing: chills, vomiting, fatigue, cough, high_fever, breathlessness
📋 Symptom mapping: ["'chills' → 'chills'", "'vomiting' → 'vomiting'", "'fatigue' → 'fatigue'", "'cough' → 'cough'", "'high_fever' → 'high_fever'", "'breathlessness' → 'breathlessness'"]

Prediction: Heart attack
Confidence: 9.17%
Matched: 6 symptoms

⚠️  Educational use only - consult doctors for medical advice.

Enter symptoms (comma-separated)
Examples: itching, skin_rash, watering_from_eyes
Type 'quit' to exit
----------------------------------------


Symptoms:  malaise, phlegm, chest_pain, blood_in_sputum


Analyzing: malaise, phlegm, chest_pain, blood_in_sputum
📋 Symptom mapping: ["'malaise' → 'malaise'", "'phlegm' → 'phlegm'", "'chest_pain' → 'chest_pain'", "'blood_in_sputum' → 'blood_in_sputum'"]

Prediction: Tuberculosis
Confidence: 21.53%
Matched: 4 symptoms

⚠️  Educational use only - consult doctors for medical advice.

Enter symptoms (comma-separated)
Examples: itching, skin_rash, watering_from_eyes
Type 'quit' to exit
----------------------------------------


Symptoms:  quit
