In [1]:
import pandas as pd
import numpy as np
import graph_utils as gu

from sklearn.model_selection import train_test_split
from sklearn import metrics
from catboost import CatBoostRegressor

In [2]:
edstays_frame = pd.read_csv("../raw_data/edstay_encounters.csv")

In [3]:
train_columns = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
    "chiefcomplaint",
]
feature_column = "los"

In [4]:
filtered_frame = edstays_frame[train_columns + [feature_column]]

In [5]:
category_cols = ["pain", "acuity", "gender", "arrival_transport"]

# cast category columns to string
filtered_frame[category_cols] = filtered_frame[category_cols].apply(
    lambda col: col.astype(str)
)

# replace nan in category columns with 'missing'
filtered_frame[category_cols] = filtered_frame[category_cols].fillna("missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame[category_cols] = filtered_frame[category_cols].apply(lambda col: col.astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame[category_cols] = filtered_frame[category_cols].fillna('missing')


In [6]:
# cast chiefcomplaint to string
filtered_frame["chiefcomplaint"] = filtered_frame["chiefcomplaint"].astype(str)

# replace nan in chiefcomplaint with 'missing'
filtered_frame["chiefcomplaint"] = filtered_frame["chiefcomplaint"].fillna("missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame['chiefcomplaint'] = filtered_frame['chiefcomplaint'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame['chiefcomplaint'] = filtered_frame['chiefcomplaint'].fillna('missing')


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    filtered_frame[train_columns],
    filtered_frame[feature_column],
    test_size=0.2,
    random_state=42,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

In [8]:
model = CatBoostRegressor(
    iterations=10000,
    early_stopping_rounds=100,
    cat_features=category_cols,
    text_features=["chiefcomplaint"],
    verbose=100,
    task_type="GPU",
    devices="0:1",
)

In [9]:
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

Learning rate set to 0.060467
0:	learn: 397.0406924	test: 396.2829045	best: 396.2829045 (0)	total: 71.8ms	remaining: 11m 57s
100:	learn: 372.2676325	test: 372.8326863	best: 372.8326863 (100)	total: 1.88s	remaining: 3m 3s
200:	learn: 368.5372409	test: 371.4693936	best: 371.4693936 (200)	total: 3.44s	remaining: 2m 47s
300:	learn: 365.8630884	test: 371.4810174	best: 371.2755450 (214)	total: 4.87s	remaining: 2m 36s
bestTest = 371.275545
bestIteration = 214
Shrink model to first 215 iterations.


<catboost.core.CatBoostRegressor at 0x7f1b9686b640>

In [10]:
y_pred = model.predict(X_test)

In [11]:
rmse = metrics.root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")
mae = metrics.mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
r2 = metrics.r2_score(y_test, y_pred)
print(f"R2: {r2}")

RMSE: 357.015508891987
MAE: 217.03941007917098
R2: 0.13550987422809113


In [None]:
gu.plot_feature_importances(model, "Catboost NLP Techniqes")

In [None]:
gu.plot_regression_results(y_test, y_pred, "Catboost NLP Techniques")

In [None]:
gu.plot_residuals(y_test, y_pred, "Catboost NLP Techniques")