In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import os

In [4]:
os.environ['SCIPY_ARRAY_API'] = '1'

In [5]:
df = pd.read_csv('Final_Augmented_dataset_Diseases_and_Symptoms.csv')

In [6]:
# The 52 diseases you want to include in your model
diseases_to_keep = [
    'acute bronchiolitis', 'acute bronchitis', 'acute pancreatitis',
    'acute stress reaction', 'anxiety', 'arthritis of the hip',
    'benign prostatic hyperplasia (bph)', 'bursitis', 'cholecystitis',
    'chronic constipation', 'complex regional pain syndrome', 'concussion',
    'conjunctivitis due to allergy', 'cystitis', 'dental caries', 'depression',
    'developmental disability', 'diverticulitis', 'eczema', 'esophagitis',
    'fungal infection of the hair', 'gastrointestinal hemorrhage', 'gout',
    'hyperemesis gravidarum', 'hypoglycemia', 'infectious gastroenteritis',
    'injury to the arm', 'injury to the leg', 'liver disease', 'marijuana abuse',
    'multiple sclerosis', 'noninfectious gastroenteritis', 'nose disorder',
    'obstructive sleep apnea (osa)', 'otitis media',
    'peripheral nerve disorder', 'personality disorder', 'pneumonia',
    'problem during pregnancy', 'psoriasis', 'pyogenic skin infection',
    'seasonal allergies (hay fever)', 'sebaceous cyst', 'sickle cell crisis',
    'spinal stenosis', 'spondylosis', 'spontaneous abortion', 'sprain or strain',
    'strep throat', 'urinary tract infection', 'vaginal cyst', 'vulvodynia'
]

In [2]:
# The 30 feature columns you want to use
features_to_keep = [
    'headache', 'burning abdominal pain', 'vomiting', 'pelvic pain',
    'back pain', 'disturbance of memory', 'vomiting blood',
    'sharp abdominal pain', 'nausea', 'ache all over', 'weakness',
    'abusing alcohol', 'problems with movement', 'sharp chest pain',
    'joint pain', 'chills', 'foot or toe pain',
    'abnormal involuntary movements', 'suprapubic pain', 'knee pain',
    'fever', 'rectal bleeding', 'lacrimation', 'decreased appetite',
    'leg pain', 'restlessness', 'constipation', 'vaginal discharge',
    'side pain', 'skin moles'
]

In [3]:
len(features_to_keep)

30

# --- Step 2: Filter and Select Data to Create df_clean ---

In [8]:
## Rows
df_filtered_rows = df[df['diseases'].isin(diseases_to_keep)].copy()

In [9]:
df_filtered_rows.shape

(58419, 378)

In [10]:
columns_to_select = features_to_keep + ['diseases']

In [11]:
columns_to_select

['headache',
 'burning abdominal pain',
 'vomiting',
 'pelvic pain',
 'back pain',
 'disturbance of memory',
 'vomiting blood',
 'sharp abdominal pain',
 'nausea',
 'ache all over',
 'weakness',
 'abusing alcohol',
 'problems with movement',
 'sharp chest pain',
 'joint pain',
 'chills',
 'foot or toe pain',
 'abnormal involuntary movements',
 'suprapubic pain',
 'knee pain',
 'fever',
 'rectal bleeding',
 'lacrimation',
 'decreased appetite',
 'leg pain',
 'restlessness',
 'constipation',
 'vaginal discharge',
 'side pain',
 'skin moles',
 'diseases']

In [12]:
df_clean = df_filtered_rows[columns_to_select]

In [13]:
df_clean.reset_index(drop=True, inplace=True)

In [14]:
df_clean.head()

Unnamed: 0,headache,burning abdominal pain,vomiting,pelvic pain,back pain,disturbance of memory,vomiting blood,sharp abdominal pain,nausea,ache all over,...,rectal bleeding,lacrimation,decreased appetite,leg pain,restlessness,constipation,vaginal discharge,side pain,skin moles,diseases
0,1,0,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,problem during pregnancy
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,problem during pregnancy
2,0,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,problem during pregnancy
3,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,problem during pregnancy
4,0,0,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,problem during pregnancy


In [15]:
df_clean.shape

(58419, 31)

In [16]:
df_clean.isnull().sum()   

headache                          0
burning abdominal pain            0
vomiting                          0
pelvic pain                       0
back pain                         0
disturbance of memory             0
vomiting blood                    0
sharp abdominal pain              0
nausea                            0
ache all over                     0
weakness                          0
abusing alcohol                   0
problems with movement            0
sharp chest pain                  0
joint pain                        0
chills                            0
foot or toe pain                  0
abnormal involuntary movements    0
suprapubic pain                   0
knee pain                         0
fever                             0
rectal bleeding                   0
lacrimation                       0
decreased appetite                0
leg pain                          0
restlessness                      0
constipation                      0
vaginal discharge           

In [17]:
df_clean['diseases'].value_counts()

diseases
cystitis                              1219
nose disorder                         1218
vulvodynia                            1218
complex regional pain syndrome        1217
spondylosis                           1216
conjunctivitis due to allergy         1215
hypoglycemia                          1215
vaginal cyst                          1215
esophagitis                           1215
peripheral nerve disorder             1215
diverticulitis                        1214
gastrointestinal hemorrhage           1214
acute bronchitis                      1213
spontaneous abortion                  1212
sprain or strain                      1212
fungal infection of the hair          1212
pneumonia                             1212
infectious gastroenteritis            1212
gout                                  1211
marijuana abuse                       1210
strep throat                          1210
arthritis of the hip                  1210
liver disease                         1209
in

In [18]:
print(f"Shape of the clean DataFrame: {df_clean.shape}")
print(f"Number of unique diseases in clean DataFrame: {df_clean['diseases'].nunique()}")

Shape of the clean DataFrame: (58419, 31)
Number of unique diseases in clean DataFrame: 52


# Label Encoding

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
le = LabelEncoder()


In [21]:
X = df_clean.drop('diseases', axis=1)
y_text = df_clean['diseases']

In [22]:
len(y_text.unique())

52

In [23]:
le = LabelEncoder()
y_encoded = le.fit_transform(y_text)

In [24]:
np.unique(y_encoded)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51])

# --- Step 3: Prepare Data for SMOTE ---


In [25]:
smote = SMOTE(random_state=42)

In [26]:
X_balanced, y_balanced_encoded = smote.fit_resample(X, y_encoded)

In [27]:
y_balanced_series = pd.Series(y_balanced_encoded)

In [28]:
print(f"\nShape of balanced features (X_balanced): {X_balanced.shape}")
print(f"Length of balanced target (y_balanced_encoded): {len(y_balanced_encoded)}")



Shape of balanced features (X_balanced): (63388, 30)
Length of balanced target (y_balanced_encoded): 63388


# Model Training Phase

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced_encoded, test_size=0.25, random_state=42
)

In [31]:
X_train.shape , y_train.shape

((47541, 30), (47541,))

In [32]:
X_test.shape , y_test.shape

((15847, 30), (15847,))

In [33]:
## Training Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [34]:
rf_params = {
    "max_depth": [5, 8, 15, 20, None], # Added 20 for more variety
    "max_features": [5, 7, 'sqrt', 8], # Changed 'auto' to 'sqrt' which is the modern equivalent
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500, 1000]
}

In [35]:
rf_model = RandomForestClassifier(random_state=42)

In [36]:
rs_cv = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_params,
    n_iter=50, # You can increase this for a more exhaustive search
    cv=5,
    verbose=2, # This will print progress updates
    random_state=42,
    n_jobs=-1 # Use all available CPU cores
)

In [37]:
rs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [38]:
print(rs_cv.best_params_)

{'n_estimators': 100, 'min_samples_split': 20, 'max_features': 'sqrt', 'max_depth': None}


In [39]:
### Train on best params
model = RandomForestClassifier(
    n_estimators=rs_cv.best_params_['n_estimators'],
    max_depth=rs_cv.best_params_['max_depth'],
    max_features=rs_cv.best_params_['max_features'],
    min_samples_split=rs_cv.best_params_['min_samples_split'],
    random_state=42
)

model.fit(X_train, y_train)

### Evaluation

In [40]:
from sklearn.metrics import accuracy_score

In [41]:
y_pred_on_test = model.predict(X_test)
y_pred_on_train = model.predict(X_train)

In [42]:
accuracy_on_test = accuracy_score(y_test, y_pred_on_test)
accuracy_on_train = accuracy_score(y_train, y_pred_on_train)
print(f"Test Accuracy: {accuracy_on_test:.2%}")
print(f"train Accuracy: {accuracy_on_train:.2%}")

Test Accuracy: 58.40%
train Accuracy: 60.31%


# Deployment

In [43]:
import pickle

# Assume 'best_model' is the trained model from RandomizedSearchCV (rs_cv.best_estimator_)
# Assume 'le' is the LabelEncoder you fit on the disease names
# Assume 'features_to_keep' is the list of your 30 feature column names

# --- Step 1: Save the Model and the Label Encoder ---

# Save the trained model to a file
with open('disease_prediction_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the label encoder to a file
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)

print("Model and Label Encoder have been saved successfully.")


# --- Step 2: Create Dummy Data for a New Prediction ---

# This represents a new, unseen patient record.
# It MUST have the same columns the model was trained on.
# We'll create a dictionary and then convert it to a DataFrame.
# Let's say the patient has a headache, nausea, and weakness.

dummy_data_dict = {feature: [0] for feature in features_to_keep}
dummy_data_dict['headache'] = [1]
dummy_data_dict['nausea'] = [1]
dummy_data_dict['weakness'] = [1]
dummy_data_dict['side pain'] = [1]
dummy_data_dict['fever'] = [1]

dummy_df = pd.DataFrame(dummy_data_dict)

print("\n--- Created Dummy Patient Data ---")
print(dummy_df)


# --- Step 3: Create a Prediction Pipeline ---

print("\n--- Running Prediction Pipeline ---")

# In a real application, you would load these files, not use the variables.
# 1. Load the saved model
with open('disease_prediction_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# 2. Load the saved label encoder
with open('label_encoder.pkl', 'rb') as le_file:
    loaded_le = pickle.load(le_file)

# 3. Make a prediction on the new data
# The model predicts the ENCODED label (a number)
predicted_label_encoded = loaded_model.predict(dummy_df)
print(f"Model predicted encoded label: {predicted_label_encoded[0]}")

# 4. Decode the prediction back to the original disease name
predicted_disease_name = loaded_le.inverse_transform(predicted_label_encoded)
print(f"\nFinal Predicted Disease: {predicted_disease_name[0]}")

Model and Label Encoder have been saved successfully.

--- Created Dummy Patient Data ---
   headache  burning abdominal pain  vomiting  pelvic pain  back pain  \
0         1                       0         0            0          0   

   disturbance of memory  vomiting blood  sharp abdominal pain  nausea  \
0                      0               0                     0       1   

   ache all over  ...  fever  rectal bleeding  lacrimation  \
0              0  ...      1                0            0   

   decreased appetite  leg pain  restlessness  constipation  \
0                   0         0             0             0   

   vaginal discharge  side pain  skin moles  
0                  0          1           0  

[1 rows x 30 columns]

--- Running Prediction Pipeline ---
Model predicted encoded label: 28

Final Predicted Disease: liver disease
