In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import pickle
import json

# Load dataset
print("Loading datasets...")
df = pd.read_csv('/kaggle/input/aiprojectdatasest/ChronicIllnessDataset_Modified.csv')

# Load precautions dataset
precaution_df = pd.read_csv('/kaggle/input/aiprojectdatasest/precaution_updated_with_lifestyle.csv')
precaution_df.columns = precaution_df.columns.str.strip()
precaution_df['Disease'] = precaution_df['Disease'].str.strip().str.lower()
precaution_df['All_Precautions'] = precaution_df['All_Precautions'].fillna('')

# Load description dataset
desc_df = pd.read_csv("/kaggle/input/aiprojectdatasest/Modified_Symptom_Description_with_PatientInfo.csv")
desc_df['Disease'] = desc_df['Disease'].str.lower().str.strip()
description_map = dict(zip(desc_df['Disease'], desc_df['Description']))
print("Dataset Loaded succesfully");

Loading datasets...
Dataset Loaded succesfully


In [2]:
print("Preprocessing data...")

# Combine all symptom columns
symptom_cols = [col for col in df.columns if col.startswith('Symptom ')]
df['All_Symptoms'] = df[symptom_cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

# Add or placeholder columns for smoker status and medical conditions
# (If these aren't in your original dataset, we'll add them with placeholder values)
if 'Smoker' not in df.columns:
    df['Smoker'] = np.random.choice([0, 1], size=len(df))  # 0=non-smoker, 1=smoker
if 'Diabetes' not in df.columns:
    df['Diabetes'] = np.random.choice([0, 1], size=len(df))  # 0=no, 1=yes
if 'Hypertension' not in df.columns:
    df['Hypertension'] = np.random.choice([0, 1], size=len(df))  # 0=no, 1=yes
if 'HeartDisease' not in df.columns:
    df['HeartDisease'] = np.random.choice([0, 1], size=len(df))  # 0=no, 1=yes

# Encode categorical data
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

le_disease = LabelEncoder()
df['Disease_encoded'] = le_disease.fit_transform(df['Disease'])

# Save encoders for frontend use
with open('gender_encoder.pkl', 'wb') as f:
    pickle.dump(le_gender, f)
with open('disease_encoder.pkl', 'wb') as f:
    pickle.dump(le_disease, f)


disease_mapping = dict(zip(le_disease.transform(le_disease.classes_), le_disease.classes_))
disease_mapping = {int(k): v for k, v in disease_mapping.items()}

with open('disease_mapping.json', 'w') as f:
    json.dump(disease_mapping, f)


Preprocessing data...


In [3]:
print("Feature engineering...")

# TF-IDF Vectorizer for symptoms
tfidf = TfidfVectorizer()
symptom_vectors = tfidf.fit_transform(df['All_Symptoms'])

# Save TF-IDF vectorizer for frontend use
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Combine features - now including the additional health condition columns
X_numerical = df[['Age', 'Gender', 'Stage', 'Smoker', 'Diabetes', 'Hypertension', 'HeartDisease']].values
X = hstack([symptom_vectors, X_numerical])

# Target variable
y = df['Disease_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Feature engineering...


In [4]:
print("Training Random Forest model...")

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le_disease.classes_))

# Save model for frontend use
with open('disease_prediction_model.pkl', 'wb') as f:
    pickle.dump(model, f)


Training Random Forest model...
Accuracy: 1.0
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal Positional Vertigo       1.00      1.00      1.00        18
                                  AIDS       1.00      1.00      1.00        30
                                  Acne       1.00      1.00      1.00        24
                   Alcoholic hepatitis       1.00      1.00      1.00        25
                               Allergy       1.00      1.00      1.00        24
                             Arthritis       1.00      1.00      1.00        23
                      Bronchial Asthma       1.00      1.00      1.00        33
                  Cervical spondylosis       1.00      1.00      1.00        23
                           Chicken pox       1.00      1.00      1.00        21
                   Chronic cholestasis       1.00      1.00      1.00        15
                           Common Cold       1.00

In [5]:
# ----------------------
# Utility Functions
# ----------------------

# Function to get precautions
def get_precautions(disease_name):
    disease_name = disease_name.strip().lower()
    row = precaution_df[precaution_df['Disease'] == disease_name]
    if row.empty:
        return ["No precautions found."]

    precaution_text = row.iloc[0]['All_Precautions']
    precautions = [p.strip() for p in precaution_text.split(',') if p.strip()]
    return precautions

# Group symptoms by disease
disease_symptom_map = df.groupby('Disease')['All_Symptoms'].apply(
    lambda x: x.mode().iloc[0] if not x.mode().empty else "").to_dict()

# Function to get symptoms by disease name
def get_symptoms_for_disease(disease_name):
    symptoms = disease_symptom_map.get(disease_name)
    if symptoms:
        return f"Most common symptoms for '{disease_name}':\n→ {symptoms}"
    else:
        return f"Disease '{disease_name}' not found in dataset."

# Function to predict disease with probabilities
def predict_disease(symptoms, age, gender, stage, smoker=0, diabetes=0, hypertension=0, heart_disease=0):
    """
    Predict disease based on symptoms and patient information
    
    Parameters:
    symptoms (str): Text description of symptoms
    age (int): Patient age
    gender (int): Encoded gender (0=male, 1=female)
    stage (int): Disease stage
    smoker (int): Smoking status (0=non-smoker, 1=smoker)
    diabetes (int): Diabetes status (0=no, 1=yes)
    hypertension (int): Hypertension status (0=no, 1=yes)
    heart_disease (int): Heart disease status (0=no, 1=yes)
    
    Returns:
    dict: Prediction results with diseases, confidence scores, and recommendations
    """
    # Process symptom text
    symptom_vector = tfidf.transform([symptoms])
    
    # Combine with numerical features
    num_features = np.array([[age, gender, stage, smoker, diabetes, hypertension, heart_disease]])
    X_pred = hstack([symptom_vector, num_features])
    
    # Get probability distribution
    probs = model.predict_proba(X_pred)[0]
    
    # Get top 3 predictions with confidence
    top_indices = probs.argsort()[-3:][::-1]
    predictions = []
    
    for idx in top_indices:
        disease = le_disease.inverse_transform([idx])[0]
        confidence = probs[idx] * 100
        description = description_map.get(disease.lower(), "No description available.")
        precautions = get_precautions(disease)
        
        # Custom recommendations based on medical conditions
        custom_recs = []
        if smoker == 1:
            custom_recs.append("Consider a smoking cessation program to improve overall health")
        if diabetes == 1:
            custom_recs.append("Monitor blood sugar levels regularly")
        if hypertension == 1:
            custom_recs.append("Follow a low-sodium diet and take prescribed blood pressure medications")
        if heart_disease == 1:
            custom_recs.append("Regular cardiac check-ups and follow heart-healthy diet")
        
        predictions.append({
            'disease': disease,
            'confidence': float(confidence),  # Convert numpy float to Python float for JSON serialization
            'description': description,
            'precautions': precautions,
            'custom_recommendations': custom_recs
        })
    
    return {
        'predictions': predictions,
        'input': {
            'symptoms': symptoms,
            'age': int(age),
            'gender': 'Female' if gender == 1 else 'Male',
            'stage': int(stage),
            'smoker': bool(smoker),
            'diabetes': bool(diabetes),
            'hypertension': bool(hypertension),
            'heart_disease': bool(heart_disease)
        }
    }


In [6]:
if __name__ == "__main__":
    # Test with sample input
    sample_result = predict_disease(
        symptoms="persistent cough, fever, fatigue",
        age=45,
        gender=0,  # Male
        stage=2,
        smoker=1,  # Smoker
        diabetes=0,  # No diabetes
        hypertension=1,  # Has hypertension
        heart_disease=0  # No heart disease
    )
    
    print("\nSample Prediction Result:")
    print(f"Patient Profile: {sample_result['input']}")
    
    print("\nTop Disease Predictions:")
    for i, pred in enumerate(sample_result['predictions']):
        print(f"\n{i+1}. {pred['disease']} - {pred['confidence']:.2f}% confidence")
        print(f"Description: {pred['description'][:100]}...")
        
        print("Precautions:")
        for p in pred['precautions'][:3]:
            print(f"  • {p}")
            
        if pred['custom_recommendations']:
            print("Custom Recommendations (based on medical history):")
            for r in pred['custom_recommendations']:
                print(f"  • {r}")
    
    print("\nDISCLAIMER: This prediction is for informational purposes only and not a medical diagnosis.")


Sample Prediction Result:
Patient Profile: {'symptoms': 'persistent cough, fever, fatigue', 'age': 45, 'gender': 'Male', 'stage': 2, 'smoker': True, 'diabetes': False, 'hypertension': True, 'heart_disease': False}

Top Disease Predictions:

1. Bronchial Asthma - 38.00% confidence
Description: Bronchial asthma is a medical condition which causes the airway path of the lungs to swell and narro...
Precautions:
  • switch to loose cloothing
  • take deep breaths
  • get away from trigger
Custom Recommendations (based on medical history):
  • Consider a smoking cessation program to improve overall health
  • Follow a low-sodium diet and take prescribed blood pressure medications

2. GERD - 9.00% confidence
Description: Gastroesophageal reflux disease, or GERD, is a digestive disorder that affects the lower esophageal ...
Precautions:
  • avoid fatty spicy food
  • avoid lying down after eating
  • maintain healthy weight
Custom Recommendations (based on medical history):
  • Consider a smo