In [37]:
# ===============================================================
# 1. IMPORTS
# ===============================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# ===============================================================
# 2. LOAD DATA
# ===============================================================
df = pd.read_csv("healthcare_dataset_cleaned.csv")
print("✅ Data loaded:", df.shape)


# ===============================================================
# 3. DATE FEATURE ENGINEERING
# ===============================================================
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Day"] = df["Date of Admission"].dt.day
df["Admission Weekday"] = df["Date of Admission"].dt.weekday
df["Discharge Month"] = df["Discharge Date"].dt.month
df["Discharge Weekday"] = df["Discharge Date"].dt.weekday

print("✅ Date features added.")


# ===============================================================
# 4. DROP RAW TEXT COLUMNS
# ===============================================================
drop_cols = [
    "Name", "Doctor", "Hospital", "Medication",
    "Test Results", "Date of Admission", "Discharge Date"
]

df_model = df.drop(columns=[c for c in drop_cols if c in df.columns])
print("✅ Clean dataset:", df_model.shape)


# ===============================================================
# 5. IDENTIFY CATEGORICAL COLUMNS
# ===============================================================
cat_cols = df_model.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)


# ===============================================================
# 6. LABEL ENCODE LOW-CARDINALITY COLUMNS
# ===============================================================
label_encode_cols = [col for col in cat_cols if df_model[col].nunique() <= 10]
target_encode_cols = [col for col in cat_cols if col not in label_encode_cols]

le_map = {}  # To store label encoders
for col in label_encode_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_map[col] = le

print("✅ Label encoded:", label_encode_cols)


# ===============================================================
# 7. TARGET ENCODE HIGH-CARDINALITY COLUMNS
# ===============================================================
target_maps = {}

for col in target_encode_cols:
    means = df_model.groupby(col)["Billing Amount"].mean()
    df_model[col] = df_model[col].map(means)
    target_maps[col] = means

print("✅ Target encoded:", target_encode_cols)


# ===============================================================
# 8. FEATURES AND TARGET
# ===============================================================
y = df_model["Billing Amount"]
X = df_model.drop(columns=["Billing Amount"])

print("✅ Training dataset ready:", X.shape)


# ===============================================================
# 9. TRAIN MODEL
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)
print("✅ Model trained successfully!")


# ===============================================================
# 10. FIXED NEW-PATIENT PREDICTION FUNCTION
# ===============================================================
def predict_bill(
    age, gender, blood_type, medical_condition,
    admission_type, insurance_provider,
    admission_date, discharge_date
):
    ad = pd.to_datetime(admission_date)
    dd = pd.to_datetime(discharge_date)

    los = (dd - ad).days
    adm_month = ad.month
    adm_day = ad.day
    adm_weekday = ad.weekday()
    dis_month = dd.month
    dis_weekday = dd.weekday()

    patient = {
        "Age": age,
        "Length of Stay": los,
        "Admission Month": adm_month,
        "Admission Day": adm_day,
        "Admission Weekday": adm_weekday,
        "Discharge Month": dis_month,
        "Discharge Weekday": dis_weekday,
        
    }

    # Process categorical columns safely
    for col, value in {
        "Gender": gender,
        "Blood Type": blood_type,
        "Admission Type": admission_type,
        "Medical Condition": medical_condition,
        "Insurance Provider": insurance_provider
    }.items():

        # If label encoded → use its encoder
        if col in label_encode_cols:
            patient[col] = le_map[col].transform([value])[0]

        # If target encoded → map using means
        elif col in target_encode_cols:
            patient[col] = target_maps[col].get(value, target_maps[col].mean())

        else:
            patient[col] = 0  # fallback

    new_df = pd.DataFrame([patient])
    new_df = new_df.reindex(columns=X.columns, fill_value=0)

    return model.predict(new_df)[0]


print("✅ Billing prediction system ready!")




✅ Data loaded: (54966, 15)
✅ Date features added.
✅ Clean dataset: (54966, 14)
Categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Label encoded: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Target encoded: []
✅ Training dataset ready: (54966, 13)
✅ Model trained successfully!
✅ Billing prediction system ready!


In [38]:
# ===============================================================
# 11. TEST WITH SAMPLE NEW PATIENT
# ===============================================================
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-02",
    
)

print("\n✅ Estimated Final Bill:", bill)



✅ Estimated Final Bill: 30955.727


In [40]:
# ===============================================================
# 11. TEST WITH SAMPLE NEW PATIENT
# ===============================================================
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2026-08-28",
    
)

print("\n✅ Estimated Final Bill:", bill)



✅ Estimated Final Bill: 22720.904
