In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


In [49]:
import os

# List everything in the input directory
print("Contents of /kaggle/input/:")
print(os.listdir('/kaggle/input/'))

# If you see a subfolder (e.g. 'hospital-data' or similar), list its contents too
# Example — replace 'your-dataset-slug' with what you see above
if os.path.exists('/kaggle/input/your-dataset-slug'):
    print("\nFiles inside that folder:")
    print(os.listdir('/kaggle/input/your-dataset-slug'))

Contents of /kaggle/input/:
['datasets']


In [50]:
df = pd.read_csv('/kaggle/input/datasets/subha018/hospital-data/hospital_dataset_final.csv')

In [51]:
df.head()

Unnamed: 0,Patient_ID,Patient_Name,Age,Gender,Height_cm,Weight_kg,City,Disease,Pulse_Rate,Oxygen_Level,...,Doctor_Specialization,Appointment_Type,Admit_Type,Days,Visits,Cost,Insurance,Outcome,Readmitted,Status
0,1,Rahul Das,45,Male,184,58,Kolkata,Diabetes,69,98,...,Endocrinologist,Follow-up,Emergency,7,4,45000.0,Yes,Recovered,No,Alive
1,2,Ananya Roy,30,Female,163,67,Delhi,Flu,87,99,...,General Physician,Follow-up,Normal,3,1,8000.0,No,Recovered,No,Alive
2,3,Amit Sharma,60,Male,146,94,Mumbai,Heart,125,92,...,Cardiologist,Emergency,Emergency,10,6,95000.0,Yes,Not Recovered,Yes,Deceased
3,4,Sneha Paul,50,Female,154,80,Chennai,Diabetes,63,96,...,Endocrinologist,Follow-up,Normal,5,3,30000.0,Yes,Recovered,No,Alive
4,5,Rohan Mehta,40,Male,156,59,Pune,Asthma,77,95,...,Pulmonologist,Follow-up,Normal,4,2,20000.0,No,Recovered,No,Alive


In [52]:
print("Shape:-",df.shape)
print("Size:-",df.size)
print("Dimensions:-",df.ndim)
print("Column Names:-",df.columns.to_list())
# print("Index:-",df.index.to_list())
print("Data Types:-\n",df.dtypes)
print("\nInfo:-")
print(df.info())

Shape:- (10000, 28)
Size:- 280000
Dimensions:- 2
Column Names:- ['Patient_ID', 'Patient_Name', 'Age', 'Gender', 'Height_cm', 'Weight_kg', 'City', 'Disease', 'Pulse_Rate', 'Oxygen_Level', 'BP', 'Sugar', 'Chol', 'Smoke', 'Alcohol', 'Chronic_Disease', 'Stress_Level', 'ICU_Required', 'Doctor_Specialization', 'Appointment_Type', 'Admit_Type', 'Days', 'Visits', 'Cost', 'Insurance', 'Outcome', 'Readmitted', 'Status']
Data Types:-
 Patient_ID                 int64
Patient_Name              object
Age                        int64
Gender                    object
Height_cm                  int64
Weight_kg                  int64
City                      object
Disease                   object
Pulse_Rate                 int64
Oxygen_Level               int64
BP                         int64
Sugar                      int64
Chol                       int64
Smoke                     object
Alcohol                   object
Chronic_Disease           object
Stress_Level              object
ICU_Require

In [53]:
df["Disease"].unique()


array(['Diabetes', 'Flu', 'Heart', 'Asthma', 'Thyroid', 'Dengue',
       'Chicken Pox', nan, 'Tuberculosis', 'Malaria', 'Conjunctivitis',
       'Cholera', 'Pneumonia', 'Covid-19', 'Typhoid', 'Hepatitis A',
       'Appendicitis', 'Leukemia', 'Hepatitis B', 'Arthritis',
       'Diptheria', 'Breast Cancer', 'Hypertension', 'Jaundice',
       'Hepatitis C', 'Gastroenteritis', 'UTI', 'Food Poisoning',
       'Migraine', 'Anemia', 'Viral Fever', 'Kidney Stone', 'Parkinson’s',
       'Epilepsy', 'Bronchitis', 'Skin Infection', 'Pancreatitis',
       'Stroke', 'Sinusitis', 'COPD', 'Osteoporosis', 'Cirrhosis',
       'Hepatitis', 'Measles', 'Eczema', 'Gastritis', 'Psoriasis',
       'Malnutrition', 'Coronary Artery Disease', 'Vitamin D Deficiency',
       'Alzheimer’s', 'Chronic Kidney Disease', 'Thyroid Disorder',
       'Heart Disease'], dtype=object)

In [54]:
# make sure disease column is cleaned
df["Disease"] = (
    df["Disease"]
    .astype(str)
    .str.strip()
    .str.lower()
)

# define keyword clusters
cluster_keywords = {

    # Metabolic & Nutritional
    "Metabolic": [
        "diabetes", "thyroid", "vitamin", "malnutrition",
        "anemia", "osteoporosis"
    ],

    # Cardiac & Vascular
    "Cardiac": [
        "heart", "cardiac", "coronary", "hypertension",
        "stroke"
    ],

    # Respiratory
    "Respiratory": [
        "asthma", "copd", "pneumonia", "bronchitis",
        "tuberculosis", "covid", "sinusitis"
    ],

    # Neurological
    "Neurological": [
        "epilepsy", "parkinson", "alzheimer",
        "migraine"
    ],

    # Infectious Diseases
    "Infection": [
        "flu", "dengue", "cholera", "typhoid",
        "malaria", "viral", "chicken pox",
        "diptheria", "measles", "conjunctivitis",
        "skin infection", "uti"
    ],

    # Gastrointestinal & Liver
    "Gastro": [
        "gastritis", "gastroenteritis", "appendicitis",
        "pancreatitis", "food poisoning",
        "hepatitis", "jaundice", "cirrhosis"
    ],

    # Cancer
    "Cancer": [
        "cancer", "leukemia"
    ],

    # Renal
    "Renal": [
        "kidney", "renal"
    ],

    # Autoimmune / Skin
    "Autoimmune/Skin": [
        "arthritis", "eczema", "psoriasis"
    ]
}


def assign_cluster(disease):
    for cluster, keywords in cluster_keywords.items():
        for keyword in keywords:
            if keyword in disease:
                return cluster
    return "Other"

df["Disease_Cluster"] = df["Disease"].apply(assign_cluster)

df["Disease_Cluster"].value_counts()


Disease_Cluster
Infection          2605
Gastro             1886
Respiratory        1680
Metabolic          1276
Cardiac             823
Neurological        613
Autoimmune/Skin     499
Renal               308
Cancer              307
Other                 3
Name: count, dtype: int64

In [55]:
missing_values = df.isnull().sum().sum()
print("Total Missing values:-",missing_values)
print("Total Cells :-",df.size)
print("Missing Cells Percentage:-",(missing_values/df.size *100).round(4))

Total Missing values:- 288
Total Cells :- 290000
Missing Cells Percentage:- 0.0993


In [56]:
print("\\"+"="*80)
print(" Missing Values by Columns :-")
print("="*80)

missing_by_columns = pd.DataFrame({
    'column': df.columns,
    'missing_count': df.isnull().sum().values,
    'missing_percentage': (df.isnull().sum().values/len(df)*100).round(4),
    'data_type': df.dtypes.values,
    'non_null_count': df.notnull().sum().values
})
print(missing_by_columns)

 Missing Values by Columns :-
                   column  missing_count  missing_percentage data_type  \
0              Patient_ID              0                0.00     int64   
1            Patient_Name              3                0.03    object   
2                     Age              0                0.00     int64   
3                  Gender              4                0.04    object   
4               Height_cm              0                0.00     int64   
5               Weight_kg              0                0.00     int64   
6                    City              5                0.05    object   
7                 Disease              0                0.00    object   
8              Pulse_Rate              0                0.00     int64   
9            Oxygen_Level              0                0.00     int64   
10                     BP              0                0.00     int64   
11                  Sugar              0                0.00     int64   
12      

In [57]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

cat_feature_cols = df.select_dtypes(include='object').columns
for col in cat_feature_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)
print("After Filling The Missing values :--")
print(df.isnull().sum())

After Filling The Missing values :--
Patient_ID               0
Patient_Name             0
Age                      0
Gender                   0
Height_cm                0
Weight_kg                0
City                     0
Disease                  0
Pulse_Rate               0
Oxygen_Level             0
BP                       0
Sugar                    0
Chol                     0
Smoke                    0
Alcohol                  0
Chronic_Disease          0
Stress_Level             0
ICU_Required             0
Doctor_Specialization    0
Appointment_Type         0
Admit_Type               0
Days                     0
Visits                   0
Cost                     0
Insurance                0
Outcome                  0
Readmitted               0
Status                   0
Disease_Cluster          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [58]:
features = [
    "Age", "Height_cm", "Weight_kg",
    "BP", "Sugar", "Chol",
    "Pulse_Rate", "Oxygen_Level",
    "Days", "Visits", "Cost",
    "Chronic_Disease", "Stress_Level"
]


In [59]:
target_cols = [
    "ICU_Required",
    "Stress_Level",
    "Chronic_Disease",
    "Doctor_Specialization",
    "Appointment_Type"
]


In [60]:
label_encoders = {}

for col in cat_feature_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

for col in target_cols:
    if col not in label_encoders:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le


In [61]:
X = df[features]
y_icu = df['ICU_Required']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_icu, test_size=0.2, stratify= y_icu, random_state=42
)

model_icu = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)

model_icu.fit(X_train, y_train)

print("ICU Prediction Accuracy :-",
      accuracy_score(y_test,model_icu.predict(X_test)))

ICU Prediction Accuracy :- 1.0


In [62]:
y_risk = df["Stress_Level"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_risk, test_size=0.2, stratify=y_risk, random_state=42
)

model_risk = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42
)

model_risk.fit(X_train, y_train)

print("Risk Level Accuracy:",
      accuracy_score(y_test, model_risk.predict(X_test)))


Risk Level Accuracy: 1.0


In [63]:
y_chronic = df["Chronic_Disease"]

model_chronic = RandomForestClassifier(
    n_estimators=150,
    random_state=42
)

model_chronic.fit(X, y_chronic)

print("Chronic Disease Model Trained")


Chronic Disease Model Trained


In [64]:
y_doctor = df["Doctor_Specialization"]

model_doctor = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    random_state=42
)

model_doctor.fit(X, y_doctor)

print("Doctor Recommendation Model Trained")


Doctor Recommendation Model Trained


In [65]:
features_appointment = features + ["ICU_Required"]

X_app = df[features_appointment]
y_appointment = df["Appointment_Type"]

model_appointment = RandomForestClassifier(
    n_estimators=150,
    random_state=42
)

model_appointment.fit(X_app, y_appointment)

print("Appointment Type Model Trained")


Appointment Type Model Trained


In [66]:
X = df[features]
y_outcome = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_outcome, test_size=0.2, stratify=y_outcome, random_state=42
)

model_outcome = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42
)
model_outcome.fit(X_train, y_train)

print("Outcome Model Accuracy :--",accuracy_score(y_test, model_outcome.predict(X_test)))

Outcome Model Accuracy :-- 0.7195


In [67]:
y_status = df['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_status, test_size=0.2, stratify=y_status, random_state=42
)

model_status = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
model_status.fit(X_train, y_train)

print("Status Model Accuracy :--",accuracy_score(y_test, model_status.predict(X_test)))

Status Model Accuracy :-- 0.663


In [68]:
y_readmit = df['Readmitted']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_readmit, test_size=0.2, stratify=y_readmit, random_state=42
)

model_readmit = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

model_readmit.fit(X_train, y_train)

print("Readmission Model Accuracy:",
      accuracy_score(y_test, model_readmit.predict(X_test)))


Readmission Model Accuracy: 0.6435


In [69]:
y_cluster = df['Disease_Cluster']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cluster, test_size=0.2, stratify=y_cluster, random_state=42
)
cluster_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
    n_jobs = -1
)

cluster_model.fit(X, y_cluster)

print("Random Forest Disease Classification Model Trained")
print("Random Forest Disease Model Accuracy:",
      accuracy_score(y_test, cluster_model.predict(X_test)))


Random Forest Disease Classification Model Trained
Random Forest Disease Model Accuracy: 0.7275


In [70]:
cluster_disease_models = {}

for cluster in df["Disease_Cluster"].unique():
    sub = df[df["Disease_Cluster"] == cluster]

    if sub["Disease"].nunique() < 2:
        continue

    X_c = sub[features]
    y_c = sub["Disease"]

    model = RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_c, y_c)

    cluster_disease_models[cluster] = model

print("Trained disease models for clusters:",
      list(cluster_disease_models.keys()))


Trained disease models for clusters: [np.int64(5), np.int64(4), np.int64(2), np.int64(9), np.int64(3), np.int64(1), np.int64(0), np.int64(6), np.int64(8)]


In [71]:
from sklearn.ensemble import RandomForestRegressor

y_recovery = df['Days']

model_recovery_days = RandomForestRegressor(
    n_estimators=150,
    random_state=42
)

model_recovery_days.fit(X, y_recovery)

print("Recovery Days Regression Model Trained")


Recovery Days Regression Model Trained


In [72]:
def get_user_input():

    print("\nEnter Patient Information:")

    age = int(input("Age: "))
    height = float(input("Height (cm): "))
    weight = float(input("Weight (kg): "))
    bp = float(input("Blood Pressure: "))
    sugar = float(input("Sugar Level: "))
    chol = float(input("Cholesterol Level: "))
    days = int(input("Days in Hospital: "))
    visits = int(input("Number of Visits: "))
    cost = float(input("Cost of Treatment: "))


    height_m = height / 100
    bmi = round(weight / (height_m ** 2), 2)

    if bp > 140 or sugar > 140:
        pulse_rate = random.randint(90, 120)
    else:
        pulse_rate = random.randint(60, 90)

    if bp > 150:
        oxygen = random.randint(88, 94)
    else:
        oxygen = random.randint(95, 100)


    chronic = 1 if (sugar > 140 or bp > 140) else 0


    if oxygen < 92 or chronic == 1:
        stress = 2
    elif bp > 130:
        stress = 1
    else:
        stress = 0

    input_df = pd.DataFrame([{
        "Age": age,
        "Height_cm": height,
        "Weight_kg": weight,
        "BP": bp,
        "Sugar": sugar,

        "Chol": chol,
        "Pulse_Rate": pulse_rate,
        "Oxygen_Level": oxygen,
        "Days": days,
        "Visits": visits,
        "Cost": cost,
        "Chronic_Disease": chronic,
        "Stress_Level": stress
    }])

    return input_df


In [73]:
def health_risk_score(bp, sugar, chol, age, pulse, oxygen, chronic):
    score = 0

    # Blood pressure
    if bp > 140:
        score += 2

    # Sugar
    if sugar > 140:
        score += 2

    # Cholesterol
    if chol > 200:
        score += 1

    # Age factor
    if age > 60:
        score += 2

    # Pulse rate
    if pulse > 100:
        score += 1

    # Oxygen saturation
    if oxygen < 92:
        score += 2
    elif oxygen < 95:
        score += 1

    # Chronic disease
    if chronic == 1:
        score += 2

    # Final Risk
    if score <= 3:
        return "Low Risk"
    elif score <= 7:
        return "Medium Risk"
    else:
        return "High Risk"


In [74]:
def ensemble_disease_prediction(user_df):
    cluster = cluster_model.predict(user_df)[0]
    cluster_prob = float(max(cluster_model.predict_proba(user_df)[0]))

    disease_model = cluster_disease_models.get(cluster)

    if disease_model:
        probs = disease_model.predict_proba(user_df)[0]
        idx = np.argsort(probs)[-3:][::-1]
        top3 = [
            {
                "disease": int(disease_model.classes_[i]),
                "probability": round(probs[i]*100, 2)
            }
            for i in idx
        ]
        final_disease = int(top3[0]["disease"])
    else:
        final_disease = None
        top3 = []

    return final_disease, cluster, cluster_prob, top3


In [75]:
def predict_all():

    # -------------------------
    # GET USER INPUT
    # -------------------------
    user_data = get_user_input()

    # -------------------------
    # MODEL PREDICTIONS
    # -------------------------
    outcome_pred = model_outcome.predict(user_data)[0]
    outcome_prob = model_outcome.predict_proba(user_data)[0]

    status_pred = model_status.predict(user_data)[0]
    status_prob = model_status.predict_proba(user_data)[0]

    readmit_pred = model_readmit.predict(user_data)[0]

    final_disease, cluster, cluster_prob, top3_diseases =(
        ensemble_disease_prediction(user_data)
    )

    recovery_pred = model_recovery_days.predict(user_data)[0]

    # -------------------------
    # HEALTH RISK (UPDATED)
    # -------------------------
    risk = health_risk_score(
        bp=user_data["BP"].iloc[0],
        sugar=user_data["Sugar"].iloc[0],
        chol=user_data["Chol"].iloc[0],
        age=user_data["Age"].iloc[0],
        pulse=user_data["Pulse_Rate"].iloc[0],
        oxygen=user_data["Oxygen_Level"].iloc[0],
        chronic=user_data["Chronic_Disease"].iloc[0]
    )

    # -------------------------
    # OUTPUT
    # -------------------------
    print("\n===== HEALTH PREDICTION REPORT =====")
    print("Overall Health Risk:", risk)

    outcome_res = label_encoders["Outcome"].inverse_transform([outcome_pred])[0]
    print("\nOutcome:",
          outcome_res,
          f"(Confidence: {round(max(outcome_prob)*100, 2)}%)")


    status_res = label_encoders["Status"].inverse_transform([status_pred])[0]
    print("Status:",
          status_res,
          f"(Confidence: {round(max(status_prob)*100, 2)}%)")


    readmit_res = label_encoders["Readmitted"].inverse_transform([readmit_pred])[0]
    print("Readmitted:",readmit_res)

    cluster_name = label_encoders["Disease_Cluster"].inverse_transform([cluster])[0]
    print(f"\nPredicted Disease Cluster: {cluster_name} "
          f"({round(cluster_prob*100,2)}%)")

    final_disease_name = label_encoders["Disease"].inverse_transform(
        [final_disease]
    )[0]
    print("\nFinal Predicted Disease:", final_disease_name)

    print("\nTop 3 Possible Diseases:")
    for item in top3_diseases:
        disease_name = label_encoders["Disease"].inverse_transform(
            [item["disease"]]
        )[0]
        print(f"- {disease_name}: {item['probability']}%")

    print("\nExpected Days of Recovery:",
          round(float(recovery_pred), 1), "days")



In [76]:
predict_all()


Enter Patient Information:


Age:  23
Height (cm):  170
Weight (kg):  76
Blood Pressure:  103
Sugar Level:  126
Cholesterol Level:  178
Days in Hospital:  3
Number of Visits:  


ValueError: invalid literal for int() with base 10: ''

In [77]:
all_models = {
    "outcome_model": model_outcome,
    "status_model": model_status,
    "readmit_model": model_readmit,
    "cluster_model": cluster_model,
    "cluster_disease_models": cluster_disease_models,
    "recovery_model": model_recovery_days,
    "label_encoders": label_encoders
}

joblib.dump(all_models, "health_models.pkl", compress=3)

print("All models saved successfully in compressed format!")


All models saved successfully in compressed format!


In [78]:
import time
start = time.time()
models = joblib.load("health_models.pkl")
print("Load time:", time.time() - start)


Load time: 1.17435622215271
