Cost-of-Treatment Prediction

In [13]:
import joblib
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt


In [14]:
# Load the inpatient-charges CSV; the path is relative to the notebook's working directory.
raw_csv_path = "../data/inpatientCharges.csv"
df = pd.read_csv(raw_csv_path)

# Show (rows, columns) of the loaded frame.
df.shape


(163065, 12)

In [15]:
# Summary statistics for the numeric columns. The payment columns are absent from
# the output because they are still "$"-prefixed strings at this point.
df.describe()


Unnamed: 0,Provider Id,Provider Zip Code,Total Discharges
count,163065.0,163065.0,163065.0
mean,255569.865428,47938.121908,42.776304
std,151563.671767,27854.32308,51.104042
min,10001.0,1040.0,11.0
25%,110092.0,27261.0,17.0
50%,250007.0,44309.0,27.0
75%,380075.0,72901.0,49.0
max,670077.0,99835.0,3383.0


In [16]:
# Remove leading/trailing whitespace from every column label so that the
# exact-name lookups in later cells match.
df.columns = [label.strip() for label in df.columns]


In [17]:
# Provider identifier and address fields are removed from the working frame.
provider_detail_cols = [
    "Provider Id",
    "Provider Name",
    "Provider Street Address",
    "Provider Zip Code",
]
df = df.drop(columns=provider_detail_cols)

# Preview the remaining columns.
df.head()


Unnamed: 0,DRG Definition,Provider City,Provider State,Hospital Referral Region Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,DOTHAN,AL,AL - Dothan,91,$32963.07,$5777.24,$4763.73
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,BOAZ,AL,AL - Birmingham,14,$15131.85,$5787.57,$4976.71
2,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,FLORENCE,AL,AL - Birmingham,24,$37560.37,$5434.95,$4453.79
3,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,BIRMINGHAM,AL,AL - Birmingham,25,$13998.28,$5417.56,$4129.16
4,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,ALABASTER,AL,AL - Birmingham,18,$31633.27,$5658.33,$4851.44


In [18]:
# The three payment columns are loaded as strings such as "$32963.07";
# strip the currency formatting and convert them to floats.
cost_columns = [
    "Average Covered Charges",
    "Average Total Payments",
    "Average Medicare Payments"
]

# Raw string on purpose: "\$" inside a normal string literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+). Inside a character class "$" is already
# literal, so no backslash is needed; the pattern matches a dollar sign or a comma.
currency_chars = r"[$,]"

for col in cost_columns:
    df[col] = df[col].replace(currency_chars, "", regex=True).astype(float)

# Confirm the columns are now numeric.
df[cost_columns].head()


Unnamed: 0,Average Covered Charges,Average Total Payments,Average Medicare Payments
0,32963.07,5777.24,4763.73
1,15131.85,5787.57,4976.71
2,37560.37,5434.95,4453.79
3,13998.28,5417.56,4129.16
4,31633.27,5658.33,4851.44


In [19]:
# Count missing values per column.
df.isnull().sum()


DRG Definition                          0
Provider City                           0
Provider State                          0
Hospital Referral Region Description    0
Total Discharges                        0
Average Covered Charges                 0
Average Total Payments                  0
Average Medicare Payments               0
dtype: int64

In [20]:
# Separate the regression target from the candidate feature columns.
target_col = "Average Total Payments"

y = df[target_col]
X = df.drop(columns=[target_col])


In [21]:
# Columns that get one-hot encoded.
categorical_cols = [
    "DRG Definition",
    "Provider State",
    "Hospital Referral Region Description",
]

# Columns forwarded to the model unchanged.
# NOTE(review): "Average Covered Charges" and "Average Medicare Payments" are used to
# predict "Average Total Payments" -- confirm these values are really available at
# prediction time; if not, this is target leakage and the RMSE is optimistic.
numerical_cols = [
    "Total Discharges",
    "Average Covered Charges",
    "Average Medicare Payments",
]

# handle_unknown="ignore": categories not seen during fit are encoded as all zeros
# instead of raising at predict time.
category_encoder = OneHotEncoder(handle_unknown="ignore")

# Any column of X not listed above (e.g. "Provider City") is discarded, because
# ColumnTransformer's default is remainder="drop".
column_steps = [
    ("cat", category_encoder, categorical_cols),
    ("num", "passthrough", numerical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [23]:
# Baseline model: ordinary least squares on the encoded features.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the module-level `preprocessor`
# object directly would make every pipeline that uses it share (and refit) the
# same transformer instance.
lr_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression())
])

lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

# Root-mean-squared error on the held-out split (same units as the target).
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
rmse_lr


np.float64(1072.6299030953091)

In [24]:
# Random forest with 100 trees; random_state fixes the seed and n_jobs=-1 uses all cores.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the module-level `preprocessor`
# object directly would make every pipeline that uses it share (and refit) the
# same transformer instance.
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ))
])

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Root-mean-squared error on the held-out split (same units as the target).
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rmse_rf


np.float64(1071.5185641658386)

In [25]:
# Gradient boosting with default hyperparameters and a fixed seed.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the module-level `preprocessor`
# object directly would make every pipeline that uses it share (and refit) the
# same transformer instance. That matters here because this pipeline is the one
# that gets serialized.
gb_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", GradientBoostingRegressor(random_state=42))
])

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

# Root-mean-squared error on the held-out split (same units as the target).
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
rmse_gb


np.float64(1054.8736261840554)

In [26]:
# Record the three hold-out RMSE values, one model per line.
metric_lines = [
    f"Linear Regression RMSE: {rmse_lr}\n",
    f"Random Forest RMSE: {rmse_rf}\n",
    f"Gradient Boosting RMSE: {rmse_gb}\n",
]

with open("../results/metrics.txt", "w") as metrics_file:
    metrics_file.writelines(metric_lines)


In [27]:
# Serialize the fitted gradient-boosting pipeline (preprocessing + regressor) to disk.
# joblib.dump returns the list of file names it wrote, which is displayed as the cell output.
joblib.dump(gb_model, "../backend/model.pkl")


['../backend/model.pkl']