In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the preprocessed dataset; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer="../data/processed.csv")

In [3]:
# Every column whose name contains 'Symptoms'.
symptom_columns = list(filter(lambda col: 'Symptoms' in col, df.columns))

# Flatten those columns into a single array and de-duplicate the values.
# NOTE: all_symptoms and unique_symptoms are both reassigned in a later cell.
all_symptoms = pd.unique(df[symptom_columns].to_numpy().ravel())
# Drop missing entries (NaN) from the distinct values.
unique_symptoms = list(filter(pd.notna, all_symptoms))

In [4]:
# Normalise each comma-separated symptom string: for every entry, trim the
# surrounding whitespace, lower-case it and remove any inner spaces, then
# re-join the entries with ', '.
df['Symptoms'] = [
    ', '.join(
        token.strip().lower().replace(' ', '')
        for token in raw_entry.split(',')
    )
    for raw_entry in df['Symptoms']
]

In [5]:
# Turn each comma-separated symptom string into a list of trimmed symptom names.
df['Symptoms'] = df['Symptoms'].map(
    lambda entry: [part.strip() for part in entry.split(',')]
)

### Extract all unique symptoms for one-hot encoding

In [6]:
# Gather every distinct symptom that appears in any row's symptom list.
all_symptoms = {symptom for row_symptoms in df['Symptoms'] for symptom in row_symptoms}

# Alphabetical order gives a stable, repeatable listing.
unique_symptoms = sorted(all_symptoms)
print(f"Total unique symptoms: {len(unique_symptoms)}")

Total unique symptoms: 131


### Create One-Hot Encoded Features

In [7]:
# One-hot encode the per-row symptom lists: one 0/1 column per distinct symptom,
# named after the classes learned by the binarizer.
mlb = MultiLabelBinarizer()
one_hot = pd.DataFrame(
    mlb.fit_transform(df['Symptoms']),
    columns=mlb.classes_,
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df's
    # index is not 0..n-1, e.g. after filtering rows.
    index=df.index,
)

# Combine with original dataframe, replacing the list-valued 'Symptoms' column
# with the binary indicator columns.
df = pd.concat([df.drop(columns=['Symptoms']), one_hot], axis=1)


### Encode Disease Labels as Target `y`

In [8]:
# Learn the mapping from disease name to integer class id.
le = LabelEncoder()
le.fit(df['Disease'])

# Apply that mapping to every row and keep the result alongside the original names.
df['Disease_Label'] = le.transform(df['Disease'])

# The integer-encoded disease column is the prediction target.
y = df['Disease_Label']

### Store `X` as Binary Symptom Vectors


In [9]:
# Create X by selecting only symptom one-hot columns.
# Use exactly the columns produced by the MultiLabelBinarizer instead of
# "everything except a few known columns": an exclusion list would silently
# turn any other column present in the CSV into a model feature (and into an
# entry of the saved symptom list).
symptom_cols = list(mlb.classes_)

X = df[symptom_cols]

#### Split the data into train and test sets 

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

#### Initialize and train the model

In [11]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [12]:
# Predict on the held-out split and report the fraction of correct
# predictions, so the model is actually evaluated before it is saved.
y_pred = model.predict(X_test)
print(f"Test accuracy: {(y_pred == y_test).mean():.3f}")

In [13]:
# Read the user's symptoms from a comma-separated input string

def get_user_symptoms():
    """Prompt for a comma-separated list of symptoms and return them normalised.

    Each comma-separated piece is trimmed, lower-cased and has its remaining
    spaces replaced by underscores.

    Returns:
        list[str]: one normalised entry per comma-separated piece, in input
        order (empty pieces are kept as empty strings).
    """
    raw_entry = input("Enter your symptoms separated by commas: ")
    # NOTE(review): the training data is normalised by *removing* spaces, while
    # this replaces them with underscores - confirm both produce the same names.
    cleaned = []
    for piece in raw_entry.split(","):
        cleaned.append(piece.strip().lower().replace(" ", "_"))
    return cleaned

In [14]:
#  Encode user input into binary vector

def encode_symptoms(user_symptoms, symptom_list):
    """Encode reported symptoms as a one-row binary feature matrix.

    Args:
        user_symptoms: Iterable of symptom names reported by the user. A bare
            string is treated as a comma-separated list of names.
        symptom_list: Ordered symptom names; position ``i`` of the output
            corresponds to ``symptom_list[i]``.

    Returns:
        list[list[int]]: a single-row 2D list (the model expects a batch) with
        1 where the symptom at that position was reported and 0 elsewhere.
        Reported symptoms that are not in ``symptom_list`` are ignored.
    """
    if isinstance(user_symptoms, str):
        # Avoid substring matching against a raw string: split it into names.
        user_symptoms = [part.strip() for part in user_symptoms.split(',')]

    # Materialise once into a set: membership tests become O(1), and a one-shot
    # iterable (e.g. a generator) is not exhausted by the first lookup.
    reported = set(user_symptoms)

    input_vector = [1 if symptom in reported else 0 for symptom in symptom_list]
    return [input_vector]  # Return 2D array (model expects batch)


In [15]:
# Predict the disease using the trained model

def predict_disease(input_vector, model, label_encoder):
    """Predict the disease name for a single encoded symptom vector.

    Args:
        input_vector: 2D batch passed straight to ``model.predict``; only the
            first prediction is used.
        model: Object exposing ``predict``.
        label_encoder: Object exposing ``inverse_transform`` to map an encoded
            label back to its original value.

    Returns:
        The decoded label for the first row of the batch.
    """
    encoded_predictions = model.predict(input_vector)
    first_label = encoded_predictions[0]
    decoded = label_encoder.inverse_transform([first_label])
    return decoded[0]


In [16]:
# Retrieve and display additional information

def get_disease_info(disease, df):
    """Look up the description and precautions for a disease.

    Args:
        disease: Value to match against the 'Disease' column.
        df: DataFrame with 'Disease', 'Description' and 'Precautions' columns.

    Returns:
        tuple: ``(description, precautions)`` taken from the first matching row.

    Raises:
        IndexError: if no row matches ``disease``.
    """
    matches = df.loc[df['Disease'] == disease]
    first_match = matches.iloc[0]
    description = first_match['Description']
    precautions = first_match['Precautions']
    return description, precautions


In [17]:
def disease_prediction_interface(df, model, label_encoder):
    """Interactively predict a disease from user-entered symptoms and print details.

    Prompts for symptoms, encodes them against the model's feature order,
    predicts a disease and prints its description and precautions.

    Args:
        df: DataFrame with 'Disease', 'Description' and 'Precautions' columns,
            used to look up information about the predicted disease.
        model: Fitted classifier exposing ``predict``.
        label_encoder: Encoder exposing ``inverse_transform`` for the labels.

    Returns:
        None. Results are printed. If none of the entered symptoms is
        recognised, a message is printed and no prediction is made.
    """
    # Prefer the feature names stored on the fitted model so the encoding order
    # always matches what the model was trained on; fall back to the
    # notebook-level X for models that do not record feature names.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        symptom_list = list(feature_names)
    else:
        symptom_list = list(X.columns)  # Your one-hot symptom feature list

    user_symptoms = get_user_symptoms()

    # Tell the user about entries that cannot influence the prediction.
    known_symptoms = set(symptom_list)
    unrecognized = [s for s in user_symptoms if s and s not in known_symptoms]
    if unrecognized:
        print("\nUnrecognized symptoms (ignored):", ", ".join(unrecognized))

    # An all-zero vector would still yield a prediction, which would be
    # misleading, so stop here when nothing matched.
    if not any(s in known_symptoms for s in user_symptoms):
        print("\nNo recognized symptoms were entered; cannot make a prediction.")
        return

    input_vector = encode_symptoms(user_symptoms, symptom_list)
    predicted_disease = predict_disease(input_vector, model, label_encoder)

    description, precautions = get_disease_info(predicted_disease, df)

    print("\n🧾 Predicted Disease:", predicted_disease)
    print("\n📖 Description:\n", description)
    print("\n🩺 Precautions:")
    # Precautions are stored as one comma-separated string; print one per line.
    for p in precautions.split(','):
        print("-", p.strip())


In [18]:
# Persist the trained model together with everything needed to reuse it:
# the feature order, per-disease metadata and the label encoder.

# joblib.dump does not create missing directories, so make sure the target
# folders exist first (this also creates '../model').
os.makedirs('../model/meta', exist_ok=True)

# save the model
joblib.dump(model, '../model/disease_predictor_model.joblib')

# Save the list of all symptoms (column names used for encoding)
all_symptoms = list(X.columns)
joblib.dump(all_symptoms, '../model/meta/symptom_list.pkl')

# Create a dictionary: disease -> {description, precautions}
# Rows for the same disease overwrite each other, so the last row wins.
metadata = {}
for _, row in df.iterrows():
    metadata[row['Disease']] = {
        "description": row['Description'],
        # Precautions are stored as one comma-separated string.
        "precautions": [p.strip() for p in row['Precautions'].split(',')]
    }

# Save this metadata dictionary
joblib.dump(metadata, '../model/meta/disease_metadata.pkl')

# Save this Label Encoder
joblib.dump(le, '../model/meta/label_encoder.pkl')


['../model/meta/label_encoder.pkl']

In [19]:
# Reload the saved artifacts (model, feature order, disease metadata and label
# encoder) in that order. joblib files are pickle-based, so only load files
# from a trusted source.
model, symptom_list, metadata, le = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../model/disease_predictor_model.joblib',
        '../model/meta/symptom_list.pkl',
        '../model/meta/disease_metadata.pkl',
        '../model/meta/label_encoder.pkl',
    )
)
