In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

# Disclaimer
print("WARNING: This is an educational tool only. Not for real medical use. Consult a doctor!")

# Step 1: Load and Preprocess Data
df = pd.read_csv('archive/dataset.csv')  # Ensure dataset.csv is in your directory
print(df.head())

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

diseases = {
    "Common Cold": ["runny nose", "sore throat", "cough", "sneezing", "fatigue"],
    # ... more diseases
}

In [4]:
# Get all unique symptoms (excluding NaN)
symptom_columns = [col for col in df.columns if col.startswith('Symptom_')]
all_symptoms = set()
for col in symptom_columns:
    all_symptoms.update(df[col].dropna().unique())
all_symptoms = sorted(list(all_symptoms))  # Sorted for consistency
print(f"Found {len(all_symptoms)} unique symptoms: {all_symptoms}")

# Create binary feature matrix
X = pd.DataFrame(0, index=df.index, columns=all_symptoms)
for idx, row in df.iterrows():
    for col in symptom_columns:
        symptom = row[col]
        if pd.notna(symptom):
            X.loc[idx, symptom] = 1

y = df['Disease']  # Target: disease names

# Encode diseases
le = LabelEncoder()
y_encoded = le.fit_transform(y)
disease_names = le.classes_  # For decoding predictions

# Step 2: Train Model
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Quick accuracy check
y_pred = model.predict(X_test)
print(f"Model Accuracy on Test Set: {accuracy_score(y_test, y_pred):.2f}")

# Step 3: User Input Mapping
# Map user-friendly symptom names to dataset symptoms (expand as needed)
symptom_mapping = {
    'itchy skin': 'itching', 'itching': 'itching', 'skin rash': 'skin_rash', 'rash': 'skin_rash',
    'nodal skin eruptions': 'nodal_skin_eruptions', 'discolored patches': 'dischromic _patches',
    'fever': 'fever', 'cough': 'cough', 'fatigue': 'fatigue', 'sore throat': 'sore_throat',
    'headache': 'headache', 'runny nose': 'runny_nose', 'sneezing': 'sneezing',
    'body aches': 'muscle_pain', 'shortness of breath': 'breathlessness',
    'nausea': 'nausea', 'chest pain': 'chest_pain'
    # Add more mappings based on all_symptoms list printed above
}

user_input = input("Enter your symptoms separated by commas (e.g., itching, skin rash, fatigue): ")
user_symptoms = [symptom.strip().lower() for symptom in user_input.split(",")]

# Create feature vector
user_vector = np.zeros(len(all_symptoms))
matched_symptoms = []

for symptom in user_symptoms:
    if symptom in symptom_mapping and symptom_mapping[symptom] in all_symptoms:
        col_idx = all_symptoms.index(symptom_mapping[symptom])
        user_vector[col_idx] = 1
        matched_symptoms.append(symptom)
    else:
        print(f"Warning: '{symptom}' not found in dataset. Try similar terms or check spelling.")

if not matched_symptoms:
    print("No matching symptoms found. Try different terms or consult a doctor.")
else:
    # Step 4: Predict
    user_vector = user_vector.reshape(1, -1)  # Reshape for prediction
    prediction_encoded = model.predict(user_vector)[0]
    probabilities = model.predict_proba(user_vector)[0]
    
    predicted_disease = le.inverse_transform([prediction_encoded])[0]
    top_prob = np.max(probabilities)
    
    print(f"\nPredicted Disease: {predicted_disease} (Confidence: {top_prob:.1%})")
    print(f"Based on matched symptoms: {', '.join(matched_symptoms)}")
    
    # Show top 3 alternatives
    top_indices = np.argsort(probabilities)[-3:][::-1]
    print("\nTop Alternatives:")
    for i, idx in enumerate(top_indices):
        alt_disease = le.inverse_transform([idx])[0]
        alt_prob = probabilities[idx]
        if alt_prob > 0.05:  # Only show if somewhat likely
            print(f"{i+1}. {alt_disease}: {alt_prob:.1%}")

Found 131 unique symptoms: [' abdominal_pain', ' abnormal_menstruation', ' acidity', ' acute_liver_failure', ' altered_sensorium', ' anxiety', ' back_pain', ' belly_pain', ' blackheads', ' bladder_discomfort', ' blister', ' blood_in_sputum', ' bloody_stool', ' blurred_and_distorted_vision', ' breathlessness', ' brittle_nails', ' bruising', ' burning_micturition', ' chest_pain', ' chills', ' cold_hands_and_feets', ' coma', ' congestion', ' constipation', ' continuous_feel_of_urine', ' continuous_sneezing', ' cough', ' cramps', ' dark_urine', ' dehydration', ' depression', ' diarrhoea', ' dischromic _patches', ' distention_of_abdomen', ' dizziness', ' drying_and_tingling_lips', ' enlarged_thyroid', ' excessive_hunger', ' extra_marital_contacts', ' family_history', ' fast_heart_rate', ' fatigue', ' fluid_overload', ' foul_smell_of urine', ' headache', ' high_fever', ' hip_joint_pain', ' history_of_alcohol_consumption', ' increased_appetite', ' indigestion', ' inflammatory_nails', ' intern

