In [3]:
import pandas as pd

df = pd.read_csv("healthcare_dataset_cleaned.csv")

print("✅ Data loaded:")
print(df.shape)
print(df.head())


✅ Data loaded:
(54966, 15)
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medicati

In [4]:
# ==========================================================
# CLEAN FEATURE SET FOR BILLING PREDICTION
# ==========================================================

# Drop columns that should NOT be used for ML
drop_cols = [
    "Name",
    "Doctor",
    "Hospital",
    "Medication",
    "Test Results",
    "Date of Admission",
    "Discharge Date"
]

df_model = df.drop(columns=[col for col in drop_cols if col in df.columns])

print("✅ Clean model dataset shape:", df_model.shape)
print(df_model.head())


✅ Clean model dataset shape: (54966, 8)
   Age  Gender Blood Type Medical Condition Insurance Provider  \
0   30    Male         B-            Cancer         Blue Cross   
1   62    Male         A+           Obesity           Medicare   
2   76  Female         A-           Obesity              Aetna   
3   28  Female         O+          Diabetes           Medicare   
4   43  Female        AB+            Cancer              Aetna   

   Billing Amount  Room Number Admission Type  
0    18856.281306          328         Urgent  
1    33643.327287          265      Emergency  
2    27955.096079          205      Emergency  
3    37909.782410          450       Elective  
4    14238.317814          458         Urgent  


In [7]:
cat_cols = df_model.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)


Categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']


In [8]:
from sklearn.preprocessing import LabelEncoder

label_encode_cols = [col for col in cat_cols if df_model[col].nunique() <= 10]

le_map = {}  # To reuse during prediction

for col in label_encode_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_map[col] = le

print("✅ Label Encoded:", label_encode_cols)


✅ Label Encoded: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']


In [9]:
target_encode_cols = [col for col in cat_cols if col not in label_encode_cols]

for col in target_encode_cols:
    means = df_model.groupby(col)["Billing Amount"].mean()
    df_model[col] = df_model[col].map(means)

print("✅ Target Encoded:", target_encode_cols)


✅ Target Encoded: []


In [10]:
y = df_model["Billing Amount"]
X = df_model.drop(columns=["Billing Amount"])

print("✅ Final dataset ready for training:", X.shape)


✅ Final dataset ready for training: (54966, 7)


In [5]:
# Target
y = df_model["Billing Amount"]

# Features
X = df_model.drop(columns=["Billing Amount"])

print("✅ X and y created:", X.shape, y.shape)


✅ X and y created: (54966, 7) (54966,)


In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

print("✅ MODEL TRAINED SUCCESSFULLY ✅")


✅ MODEL TRAINED SUCCESSFULLY ✅


In [12]:
# ================================================================
# BUILD SAFE TARGET ENCODING MAPS
# ================================================================

# For Medical Condition (target encoding)
mc_mean_map = df_model.groupby("Medical Condition")["Billing Amount"].mean()

# For Insurance Provider (target encoding)
ip_mean_map = df_model.groupby("Insurance Provider")["Billing Amount"].mean()

print("✅ Target encoding maps ready.")


✅ Target encoding maps ready.


In [13]:
# ================================================================
# ✅ FINAL NEW PATIENT BILL PREDICTION FUNCTION
# ================================================================

def predict_bill(
    age,
    gender,              # string e.g. "Male"
    blood_type,          # string e.g. "B-"
    medical_condition,   # string e.g. "Cancer"
    admission_type,      # string e.g. "Urgent"
    insurance_provider,  # string e.g. "Blue Cross"
    admission_date,      # "YYYY-MM-DD"
    discharge_date,      # "YYYY-MM-DD"
    room_number
):
    # ----------------------------------------------------------
    # Calculate Length of Stay
    # ----------------------------------------------------------
    los = (
        pd.to_datetime(discharge_date) -
        pd.to_datetime(admission_date)
    ).days

    # ----------------------------------------------------------
    # Handle label-encoded columns
    # (gender, blood_type, admission_type)
    # ----------------------------------------------------------
    encoded_gender = le_map["Gender"].transform([gender])[0]
    encoded_blood = le_map["Blood Type"].transform([blood_type])[0]
    encoded_adm   = le_map["Admission Type"].transform([admission_type])[0]

    # ----------------------------------------------------------
    # Handle target-encoded columns
    # (medical_condition, insurance_provider)
    # ----------------------------------------------------------
    mc_encoded = mc_mean_map.get(medical_condition, mc_mean_map.mean())
    ip_encoded = ip_mean_map.get(insurance_provider, ip_mean_map.mean())

    # ----------------------------------------------------------
    # Build final input row (MUST MATCH X COLUMNS)
    # ----------------------------------------------------------
    patient = {
        "Age": age,
        "Gender": encoded_gender,
        "Blood Type": encoded_blood,
        "Medical Condition": mc_encoded,
        "Insurance Provider": ip_encoded,
        "Admission Type": encoded_adm,
        "Length of Stay": los,
        "Room Number": room_number
    }

    new_df = pd.DataFrame([patient])
    new_df = new_df.reindex(columns=X.columns, fill_value=0)

    # ----------------------------------------------------------
    # Predict using trained model
    # ----------------------------------------------------------
    prediction = model.predict(new_df)[0]

    return prediction


In [16]:
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Obesity",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-02",
    room_number=328
)

print("✅ Estimated Final Bill:", bill)


✅ Estimated Final Bill: 25240.518


In [18]:
bill = predict_bill(
    age=30,
    gender="Female",
    blood_type="A+",
    medical_condition="Obesity",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-02",
    room_number=328
)

print("✅ Estimated Final Bill:", bill)


✅ Estimated Final Bill: 30045.7


In [38]:

bill = predict_bill(
    age=60,
    gender="Male",
    blood_type="AB+",
    medical_condition="Diabetes",
    admission_type="Emergency",
    insurance_provider="Medicare",
    admission_date="2024-01-31",
    discharge_date="2024-02-28",
    room_number=328
)

print("✅ Estimated Final Bill:", bill)

✅ Estimated Final Bill: 25947.055


In [41]:
bill = predict_bill(
    age=65,
    gender="Male",
    blood_type="O+",
    medical_condition="Diabetes",
    admission_type="Emergency",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-01",
    room_number=328
)

print("✅ Estimated Final Bill:", bill)

✅ Estimated Final Bill: 23665.318
