In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import graph_utils

from sklearn import metrics
from catboost import CatBoostRegressor

In [2]:
baseline_frame = pd.read_csv("../raw_data/edstay_encounters.csv")

In [3]:
train_columns = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
]
feature_column = "los"

In [None]:
filtered_frame = baseline_frame[train_columns + [feature_column]]

In [5]:
category_cols = ["pain", "acuity", "gender", "arrival_transport"]

# cast category columns to string
filtered_frame[category_cols] = filtered_frame[category_cols].apply(
    lambda col: col.astype(str)
)

# replace nan in category columns with 'missing'
filtered_frame[category_cols] = filtered_frame[category_cols].fillna("missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame[category_cols] = filtered_frame[category_cols].apply(lambda col: col.astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame[category_cols] = filtered_frame[category_cols].fillna('missing')


In [6]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    filtered_frame[train_columns],
    filtered_frame[feature_column],
    test_size=0.2,
    random_state=42,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

In [7]:
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function="RMSE",
    cat_features=category_cols,
    task_type="GPU",
    early_stopping_rounds=100,
)

In [8]:
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

0:	learn: 396.6171888	test: 395.8401095	best: 395.8401095 (0)	total: 18.2ms	remaining: 18.2s
100:	learn: 380.3216984	test: 380.5065554	best: 380.5065554 (100)	total: 2.03s	remaining: 18.1s
200:	learn: 378.9799633	test: 380.2869882	best: 380.2843278 (193)	total: 4.13s	remaining: 16.4s
300:	learn: 378.0198307	test: 380.3159830	best: 380.2603827 (209)	total: 6.24s	remaining: 14.5s
bestTest = 380.2603827
bestIteration = 209
Shrink model to first 210 iterations.


<catboost.core.CatBoostRegressor at 0x7fcd08d67400>

In [9]:
y_pred = model.predict(X_test)

In [10]:
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
mae = metrics.mean_absolute_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)



In [11]:
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")

RMSE: 366.8117461787408
MAE: 223.86326373801091
R^2: 0.08741707005603339


In [None]:
graph_utils.plot_feature_importances(model, "catboost baseline")

In [None]:
graph_utils.plot_regression_results(y_test, y_pred, "catboost baseline")