In [1]:
import pandas as pd

df = pd.read_csv("healthcare_dataset_cleaned.csv")

print("✅ Data loaded:")
print(df.shape)
print(df.head())


✅ Data loaded:
(54966, 15)
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medicati

In [2]:
# ============================================================
# ADD DATE FEATURES BEFORE DROPPING RAW DATE COLUMNS
# ============================================================

df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

# Length of Stay
df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days

# Admission Features
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Day"] = df["Date of Admission"].dt.day
df["Admission Weekday"] = df["Date of Admission"].dt.weekday  # Mon=0

# Discharge Features
df["Discharge Month"] = df["Discharge Date"].dt.month
df["Discharge Weekday"] = df["Discharge Date"].dt.weekday

print("✅ Date features added.")


✅ Date features added.


In [3]:
drop_cols = [
    "Name", "Doctor", "Hospital", "Medication", "Test Results",
    "Date of Admission", "Discharge Date"
]

df_model = df.drop(columns=drop_cols)


In [5]:
# ============================================================
# ADD DATE FEATURES BEFORE DROPPING RAW DATE COLUMNS
# ============================================================

df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

# Length of Stay
df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days

# Admission Features
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Day"] = df["Date of Admission"].dt.day
df["Admission Weekday"] = df["Date of Admission"].dt.weekday  # Mon=0

# Discharge Features
df["Discharge Month"] = df["Discharge Date"].dt.month
df["Discharge Weekday"] = df["Discharge Date"].dt.weekday

print("✅ Date features added.")


✅ Date features added.


In [6]:
cat_cols = df_model.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)


Categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']


In [8]:
# ===============================================================
# 1. IMPORTS
# ===============================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# ===============================================================
# 2. LOAD DATA
# ===============================================================
df = pd.read_csv("healthcare_dataset_cleaned.csv")
print("✅ Data loaded:", df.shape)


# ===============================================================
# 3. DATE FEATURE ENGINEERING
# ===============================================================
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

# Length of stay
df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days

# Admission date features
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Day"] = df["Date of Admission"].dt.day
df["Admission Weekday"] = df["Date of Admission"].dt.weekday

# Discharge date features
df["Discharge Month"] = df["Discharge Date"].dt.month
df["Discharge Weekday"] = df["Discharge Date"].dt.weekday

print("✅ Date features added.")


# ===============================================================
# 4. DROP RAW TEXT COLUMNS (UNUSABLE FOR ML)
# ===============================================================
drop_cols = [
    "Name",
    "Doctor",
    "Hospital",
    "Medication",
    "Test Results",
    "Date of Admission",
    "Discharge Date"
]

df_model = df.drop(columns=[c for c in drop_cols if c in df.columns])
print("✅ Columns dropped. New shape:", df_model.shape)


# ===============================================================
# 5. IDENTIFY CATEGORICAL COLUMNS
# ===============================================================
cat_cols = df_model.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)


# ===============================================================
# 6. LABEL ENCODE LOW-CARDINALITY CATEGORIES
# ===============================================================
label_encode_cols = [col for col in cat_cols if df_model[col].nunique() <= 10]
le_map = {}

for col in label_encode_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_map[col] = le

print("✅ Label encoded:", label_encode_cols)


# ===============================================================
# 7. TARGET ENCODE HIGH-CARDINALITY CATEGORIES
# ===============================================================
target_encode_cols = [col for col in cat_cols if col not in label_encode_cols]

mc_mean_map = None
ip_mean_map = None

for col in target_encode_cols:
    means = df_model.groupby(col)["Billing Amount"].mean()
    df_model[col] = df_model[col].map(means)

    if col == "Medical Condition":
        mc_mean_map = means

    if col == "Insurance Provider":
        ip_mean_map = means

print("✅ Target encoded:", target_encode_cols)


# ===============================================================
# 8. BUILD FEATURES AND TARGET
# ===============================================================
y = df_model["Billing Amount"]
X = df_model.drop(columns=["Billing Amount"])

print("✅ Final ML dataset:", X.shape, y.shape)


# ===============================================================
# 9. TRAIN XGBOOST MODEL
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)
print("✅ Model trained successfully!")


# ===============================================================
# 10. FINAL NEW-PATIENT BILLING PREDICTION FUNCTION
# ===============================================================
def predict_bill(
    age,
    gender,              # "Male", "Female"
    blood_type,          # "B-", "A+", etc.
    medical_condition,   # "Cancer", "Diabetes", etc.
    admission_type,      # "Emergency", "Urgent"
    insurance_provider,  # "Blue Cross", etc.
    admission_date,      # "YYYY-MM-DD"
    discharge_date,      # "YYYY-MM-DD"
    room_number
):
    # Convert dates
    ad = pd.to_datetime(admission_date)
    dd = pd.to_datetime(discharge_date)

    # Date features
    los = (dd - ad).days
    admission_month = ad.month
    admission_day = ad.day
    admission_weekday = ad.weekday()
    discharge_month = dd.month
    discharge_weekday = dd.weekday()

    # Label encoding
    encoded_gender = le_map["Gender"].transform([gender])[0]
    encoded_blood = le_map["Blood Type"].transform([blood_type])[0]
    encoded_adm = le_map["Admission Type"].transform([admission_type])[0]

    # Target encoding for high-cardinality
    mc_value = mc_mean_map.get(medical_condition, mc_mean_map.mean())
    ip_value = ip_mean_map.get(insurance_provider, ip_mean_map.mean())

    # Final patient row
    patient = {
        "Age": age,
        "Gender": encoded_gender,
        "Blood Type": encoded_blood,
        "Medical Condition": mc_value,
        "Insurance Provider": ip_value,
        "Admission Type": encoded_adm,
        "Length of Stay": los,
        "Room Number": room_number,
        "Admission Month": admission_month,
        "Admission Day": admission_day,
        "Admission Weekday": admission_weekday,
        "Discharge Month": discharge_month,
        "Discharge Weekday": discharge_weekday
    }

    new_df = pd.DataFrame([patient])
    new_df = new_df.reindex(columns=X.columns, fill_value=0)

    return model.predict(new_df)[0]


print("✅ Billing prediction system ready!")


# ===============================================================
# 11. TEST WITH SAMPLE PATIENT
# ===============================================================
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-02",
    room_number=328
)

print("✅ Estimated Final Bill:", bill)


✅ Data loaded: (54966, 15)
✅ Date features added.
✅ Columns dropped. New shape: (54966, 14)
Categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Label encoded: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Target encoded: []
✅ Final ML dataset: (54966, 13) (54966,)
✅ Model trained successfully!
✅ Billing prediction system ready!


AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
# ===============================================================
# 1. IMPORTS
# ===============================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# ===============================================================
# 2. LOAD DATA
# ===============================================================
df = pd.read_csv("healthcare_dataset_cleaned.csv")
print("✅ Data loaded:", df.shape)


# ===============================================================
# 3. DATE FEATURE ENGINEERING
# ===============================================================
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Day"] = df["Date of Admission"].dt.day
df["Admission Weekday"] = df["Date of Admission"].dt.weekday
df["Discharge Month"] = df["Discharge Date"].dt.month
df["Discharge Weekday"] = df["Discharge Date"].dt.weekday

print("✅ Date features added.")


# ===============================================================
# 4. DROP RAW TEXT COLUMNS
# ===============================================================
drop_cols = [
    "Name", "Doctor", "Hospital", "Medication",
    "Test Results", "Date of Admission", "Discharge Date"
]

df_model = df.drop(columns=[c for c in drop_cols if c in df.columns])
print("✅ Clean dataset:", df_model.shape)


# ===============================================================
# 5. IDENTIFY CATEGORICAL COLUMNS
# ===============================================================
cat_cols = df_model.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", cat_cols)


# ===============================================================
# 6. LABEL ENCODE LOW-CARDINALITY COLUMNS
# ===============================================================
label_encode_cols = [col for col in cat_cols if df_model[col].nunique() <= 10]
target_encode_cols = [col for col in cat_cols if col not in label_encode_cols]

le_map = {}  # To store label encoders
for col in label_encode_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_map[col] = le

print("✅ Label encoded:", label_encode_cols)


# ===============================================================
# 7. TARGET ENCODE HIGH-CARDINALITY COLUMNS
# ===============================================================
target_maps = {}

for col in target_encode_cols:
    means = df_model.groupby(col)["Billing Amount"].mean()
    df_model[col] = df_model[col].map(means)
    target_maps[col] = means

print("✅ Target encoded:", target_encode_cols)


# ===============================================================
# 8. FEATURES AND TARGET
# ===============================================================
y = df_model["Billing Amount"]
X = df_model.drop(columns=["Billing Amount"])

print("✅ Training dataset ready:", X.shape)


# ===============================================================
# 9. TRAIN MODEL
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)
print("✅ Model trained successfully!")


# ===============================================================
# 10. FIXED NEW-PATIENT PREDICTION FUNCTION
# ===============================================================
def predict_bill(
    age, gender, blood_type, medical_condition,
    admission_type, insurance_provider,
    admission_date, discharge_date, room_number
):
    ad = pd.to_datetime(admission_date)
    dd = pd.to_datetime(discharge_date)

    los = (dd - ad).days
    adm_month = ad.month
    adm_day = ad.day
    adm_weekday = ad.weekday()
    dis_month = dd.month
    dis_weekday = dd.weekday()

    patient = {
        "Age": age,
        "Length of Stay": los,
        "Admission Month": adm_month,
        "Admission Day": adm_day,
        "Admission Weekday": adm_weekday,
        "Discharge Month": dis_month,
        "Discharge Weekday": dis_weekday,
        "Room Number": room_number
    }

    # Process categorical columns safely
    for col, value in {
        "Gender": gender,
        "Blood Type": blood_type,
        "Admission Type": admission_type,
        "Medical Condition": medical_condition,
        "Insurance Provider": insurance_provider
    }.items():

        # If label encoded → use its encoder
        if col in label_encode_cols:
            patient[col] = le_map[col].transform([value])[0]

        # If target encoded → map using means
        elif col in target_encode_cols:
            patient[col] = target_maps[col].get(value, target_maps[col].mean())

        else:
            patient[col] = 0  # fallback

    new_df = pd.DataFrame([patient])
    new_df = new_df.reindex(columns=X.columns, fill_value=0)

    return model.predict(new_df)[0]


print("✅ Billing prediction system ready!")




✅ Data loaded: (54966, 15)
✅ Date features added.
✅ Clean dataset: (54966, 14)
Categorical columns: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Label encoded: ['Gender', 'Blood Type', 'Medical Condition', 'Insurance Provider', 'Admission Type']
✅ Target encoded: []
✅ Training dataset ready: (54966, 13)
✅ Model trained successfully!
✅ Billing prediction system ready!

✅ Estimated Final Bill: 24382.758


In [10]:
# ===============================================================
# 11. TEST WITH SAMPLE NEW PATIENT
# ===============================================================
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2024-02-02",
    room_number=328
)

print("\n✅ Estimated Final Bill:", bill)



✅ Estimated Final Bill: 24382.758


In [21]:
# ===============================================================
# 11. TEST WITH SAMPLE NEW PATIENT
# ===============================================================
bill = predict_bill(
    age=30,
    gender="Male",
    blood_type="B-",
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-31",
    discharge_date="2026-09-28",
    room_number=328
)

print("\n✅ Estimated Final Bill:", bill)



✅ Estimated Final Bill: 30899.62


In [23]:
# ===============================================================
# 1. IMPORTS
# ===============================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# ===============================================================
# 2. LOAD DATA
# ===============================================================
df = pd.read_csv("healthcare_dataset_cleaned.csv")
print("✅ Data loaded:", df.shape)

# ===============================================================
# 3. DATE FEATURE ENGINEERING
# ===============================================================
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

df["Length of Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days
df["Admission Month"] = df["Date of Admission"].dt.month
df["Admission Weekday"] = df["Date of Admission"].dt.weekday

print("✅ Date features added.")

# ===============================================================
# 4. SELECT MEDICALLY IMPORTANT FEATURES
# ===============================================================
df_model = df[[
    "Medical Condition",
    "Admission Type",
    "Insurance Provider",
    "Length of Stay",
    "Admission Month",
    "Admission Weekday",
    "Billing Amount"
]]

print("✅ Selected clean medical feature set:", df_model.shape)

# ===============================================================
# 5. IDENTIFY CATEGORICAL COLUMNS
# ===============================================================
cat_cols = ["Medical Condition", "Admission Type", "Insurance Provider"]

# Split into low/high cardinality
label_encode_cols = [col for col in cat_cols if df_model[col].nunique() <= 10]
target_encode_cols = [col for col in cat_cols if col not in label_encode_cols]

print("✅ Label Encode:", label_encode_cols)
print("✅ Target Encode:", target_encode_cols)

# ===============================================================
# 6. LABEL ENCODE LOW CARDINALITY CATEGORICAL FEATURES
# ===============================================================
le_map = {}

for col in label_encode_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_map[col] = le

print("✅ Label Encoding Completed")

# ===============================================================
# 7. TARGET ENCODE HIGH CARDINALITY CATEGORICAL FEATURES
# ===============================================================
target_maps = {}

for col in target_encode_cols:
    mean_map = df_model.groupby(col)["Billing Amount"].mean()
    df_model[col] = df_model[col].map(mean_map)
    target_maps[col] = mean_map

print("✅ Target Encoding Completed")

# ===============================================================
# 8. BUILD FEATURES AND TARGET
# ===============================================================
y = df_model["Billing Amount"]
X = df_model.drop(columns=["Billing Amount"])

print("✅ X shape:", X.shape)
print("✅ y shape:", y.shape)

# ===============================================================
# 9. TRAIN MODEL
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)
print("✅ Model trained successfully!")

# ===============================================================
# 10. FINAL PREDICTION FUNCTION (USES CLEAN FEATURES)
# ===============================================================
def predict_bill(
    medical_condition,
    admission_type,
    insurance_provider,
    admission_date,
    discharge_date
):
    ad = pd.to_datetime(admission_date)
    dd = pd.to_datetime(discharge_date)

    los = (dd - ad).days
    adm_month = ad.month
    adm_weekday = ad.weekday()

    patient = {
        "Length of Stay": los,
        "Admission Month": adm_month,
        "Admission Weekday": adm_weekday
    }

    # Encode categorical columns
    for col, val in {
        "Medical Condition": medical_condition,
        "Admission Type": admission_type,
        "Insurance Provider": insurance_provider
    }.items():
        if col in label_encode_cols:
            patient[col] = le_map[col].transform([val])[0]
        else:
            patient[col] = target_maps[col].get(val, target_maps[col].mean())

    new_df = pd.DataFrame([patient])
    new_df = new_df.reindex(columns=X.columns, fill_value=0)

    return model.predict(new_df)[0]

print("✅ Billing prediction system ready!")


✅ Data loaded: (54966, 15)
✅ Date features added.
✅ Selected clean medical feature set: (54966, 7)
✅ Label Encode: ['Medical Condition', 'Admission Type', 'Insurance Provider']
✅ Target Encode: []
✅ Label Encoding Completed
✅ Target Encoding Completed
✅ X shape: (54966, 6)
✅ y shape: (54966,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[col] = le.fit_transform(df_model[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[col] = le.fit_transform(df_model[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[col] = le.fit_transform(df_model[col])


✅ Model trained successfully!
✅ Billing prediction system ready!


In [27]:
bill = predict_bill(
    medical_condition="Cancer",
    admission_type="Urgent",
    insurance_provider="Blue Cross",
    admission_date="2024-01-01",
    discharge_date="2026-01-10"
)

print("✅ Predicted Bill:", bill)


✅ Predicted Bill: 23355.072
