In [38]:
import pandas as pd
import numpy as np
import graph_utils as gu

from sklearn.model_selection import train_test_split
from sklearn import metrics
from catboost import CatBoostRegressor

In [39]:
lda_cluster_frame = pd.read_csv("../processed_data/lda_frame.csv")
sentence_t_frame = pd.read_csv(
    "../processed_data/sentence_transformer_clustered_cc.csv"
)
word2vec = pd.read_csv("../processed_data/word2vec_frame_with_cluster.csv")
edstays = pd.read_csv("../raw_data/edstay_encounters.csv")

In [40]:
lda_edstays = edstays.copy()
sentence_t_edstays = edstays.copy()
w2v_edstays = edstays.copy()

lda_edstays = pd.merge(
    lda_edstays,
    lda_cluster_frame[["chiefcomplaint", "cluster_100", "cluster_1000"]],
    on="chiefcomplaint",
    how="left",
)
sentence_t_edstays = pd.merge(
    sentence_t_edstays,
    sentence_t_frame[["chiefcomplaint", "cluster"]],
    on="chiefcomplaint",
    how="left",
)
w2v_edstays = pd.merge(
    w2v_edstays,
    word2vec[["chiefcomplaint", "cluster"]],
    on="chiefcomplaint",
    how="left",
)

In [41]:
train_columns = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
    "cluster",
]
train_columns_lda = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
    "cluster_100",
    "cluster_1000",
]
feature_column = "los"

In [42]:
filtered_lda_edstays = lda_edstays[train_columns_lda + [feature_column]]
filtered_sentence_t_edstays = sentence_t_edstays[train_columns + [feature_column]]
filtered_w2v_edstays = w2v_edstays[train_columns + [feature_column]]

In [43]:
category_cols = ["pain", "acuity", "gender", "arrival_transport", "cluster"]
category_cols_lda = [
    "pain",
    "acuity",
    "gender",
    "arrival_transport",
    "cluster_100",
    "cluster_1000",
]
category_cols_lda_100 = ["pain", "acuity", "gender", "arrival_transport", "cluster_100"]
category_cols_lda_1000 = [
    "pain",
    "acuity",
    "gender",
    "arrival_transport",
    "cluster_1000",
]


# cast category columns to string
filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[category_cols_lda].apply(
    lambda col: col.astype(str)
)
filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[category_cols_lda].apply(
    lambda col: col.astype(str)
)
# replace nan in category columns with 'missing'
filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[
    category_cols_lda
].fillna("missing")
filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[
    category_cols_lda
].fillna("missing")

filtered_sentence_t_edstays[category_cols] = filtered_sentence_t_edstays[
    category_cols
].apply(lambda col: col.astype(str))

# replace nan in category columns with 'missing'
filtered_sentence_t_edstays[category_cols] = filtered_sentence_t_edstays[
    category_cols
].fillna("missing")

filtered_w2v_edstays[category_cols] = filtered_w2v_edstays[category_cols].apply(
    lambda col: col.astype(str)
)

# replace nan in category columns with 'missing'
filtered_w2v_edstays[category_cols] = filtered_w2v_edstays[category_cols].fillna(
    "missing"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[category_cols_lda].apply(lambda col: col.astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_lda_edstays[category_cols_lda] = filtered_lda_edstays[category_cols_lda].apply(lambda col: col.astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [44]:
X_lda_train, X_lda_test, y_lda_train, y_lda_test = train_test_split(
    filtered_lda_edstays[train_columns_lda].drop(columns=["cluster_1000"]),
    filtered_lda_edstays[feature_column],
    test_size=0.2,
    random_state=42,
)
X_lda_1000_train, X_lda_1000_test, y_lda_1000_train, y_lda_1000_test = train_test_split(
    filtered_lda_edstays[train_columns_lda].drop(columns=["cluster_100"]),
    filtered_lda_edstays[feature_column],
    test_size=0.2,
    random_state=42,
)
X_sentence_t_train, X_sentence_t_test, y_sentence_t_train, y_sentence_t_test = (
    train_test_split(
        filtered_sentence_t_edstays[train_columns],
        filtered_sentence_t_edstays[feature_column],
        test_size=0.2,
        random_state=42,
    )
)
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(
    filtered_w2v_edstays[train_columns],
    filtered_w2v_edstays[feature_column],
    test_size=0.2,
    random_state=42,
)

X_lda_test, X_lda_val, y_lda_test, y_lda_val = train_test_split(
    X_lda_test, y_lda_test, test_size=0.5, random_state=42
)
X_lda_1000_test, X_lda_1000_val, y_lda_1000_test, y_lda_1000_val = train_test_split(
    X_lda_1000_test, y_lda_1000_test, test_size=0.5, random_state=42
)
X_sentence_t_test, X_sentence_t_val, y_sentence_t_test, y_sentence_t_val = (
    train_test_split(
        X_sentence_t_test, y_sentence_t_test, test_size=0.5, random_state=42
    )
)
X_w2v_test, X_w2v_val, y_w2v_test, y_w2v_val = train_test_split(
    X_w2v_test, y_w2v_test, test_size=0.5, random_state=42
)

In [45]:
lda_model = CatBoostRegressor(
    iterations=10000,
    loss_function="RMSE",
    cat_features=category_cols_lda_100,
    task_type="GPU",
    early_stopping_rounds=100,
)

In [46]:
lda_model.fit(
    X_lda_train,
    y_lda_train,
    eval_set=(X_lda_val, y_lda_val),
    verbose=100,
    use_best_model=True,
)

Learning rate set to 0.060467
0:	learn: 397.2575354	test: 396.4883735	best: 396.4883735 (0)	total: 22.4ms	remaining: 3m 43s
100:	learn: 376.1610770	test: 375.2174037	best: 375.2174037 (100)	total: 2.12s	remaining: 3m 27s
200:	learn: 374.7686470	test: 374.1836411	best: 374.1836411 (200)	total: 4.21s	remaining: 3m 25s
300:	learn: 373.5993005	test: 373.5808233	best: 373.5808233 (300)	total: 6.33s	remaining: 3m 23s
400:	learn: 372.6971686	test: 373.3073866	best: 373.3066283 (399)	total: 8.52s	remaining: 3m 23s
500:	learn: 372.0334887	test: 373.0305206	best: 373.0289707 (498)	total: 10.7s	remaining: 3m 22s
600:	learn: 371.3754984	test: 372.9509699	best: 372.9508085 (585)	total: 12.8s	remaining: 3m 20s
700:	learn: 370.6551563	test: 372.7106004	best: 372.7106004 (700)	total: 15s	remaining: 3m 19s
800:	learn: 369.8327421	test: 372.6250340	best: 372.6250340 (800)	total: 17.2s	remaining: 3m 17s
900:	learn: 369.2632802	test: 372.5360209	best: 372.5349701 (874)	total: 19.4s	remaining: 3m 16s
1000:

<catboost.core.CatBoostRegressor at 0x7efd3b59df30>

In [47]:
y_lda_pred = lda_model.predict(X_lda_test)

In [48]:
rmse = metrics.root_mean_squared_error(y_lda_test, y_lda_pred)
print(f"RMSE: {rmse}")
mae = metrics.mean_absolute_error(y_lda_test, y_lda_pred)
print(f"MAE: {mae}")
r2 = metrics.r2_score(y_lda_test, y_lda_pred)
print(f"R2: {r2}")

RMSE: 359.697138903659
MAE: 217.31330122453272
R2: 0.12247431106985907


In [49]:
lda_1000_model = CatBoostRegressor(
    iterations=10000,
    loss_function="RMSE",
    cat_features=category_cols_lda_1000,
    task_type="GPU",
    early_stopping_rounds=100,
)

In [50]:
lda_1000_model.fit(
    X_lda_1000_train,
    y_lda_1000_train,
    eval_set=(X_lda_1000_val, y_lda_1000_val),
    verbose=100,
    use_best_model=True,
)

Learning rate set to 0.060467
0:	learn: 397.6286687	test: 396.8653822	best: 396.8653822 (0)	total: 18.5ms	remaining: 3m 4s
100:	learn: 381.0748619	test: 380.7832047	best: 380.7832047 (100)	total: 1.99s	remaining: 3m 15s
200:	learn: 379.9314878	test: 380.3815179	best: 380.3815179 (200)	total: 4.23s	remaining: 3m 26s
300:	learn: 379.1267503	test: 380.1987077	best: 380.1962842 (295)	total: 6.33s	remaining: 3m 23s
400:	learn: 378.0041541	test: 380.1587419	best: 380.1254101 (383)	total: 8.37s	remaining: 3m 20s
bestTest = 380.1254101
bestIteration = 383
Shrink model to first 384 iterations.


<catboost.core.CatBoostRegressor at 0x7efd27d6eb30>

In [51]:
y_lda_1000_pred = lda_1000_model.predict(X_lda_1000_test)

In [52]:
rmse = metrics.root_mean_squared_error(y_lda_1000_test, y_lda_1000_pred)
print(f"RMSE: {rmse}")
mae = metrics.mean_absolute_error(y_lda_1000_test, y_lda_1000_pred)
print(f"MAE: {mae}")
r2 = metrics.r2_score(y_lda_1000_test, y_lda_1000_pred)
print(f"R2: {r2}")

RMSE: 366.850306123096
MAE: 223.98972043792276
R2: 0.08722519508507298


In [53]:
sentence_t_model = CatBoostRegressor(
    iterations=10000,
    loss_function="RMSE",
    cat_features=category_cols,
    task_type="GPU",
    early_stopping_rounds=100,
)

In [54]:
sentence_t_model.fit(
    X_sentence_t_train,
    y_sentence_t_train,
    eval_set=(X_sentence_t_val, y_sentence_t_val),
    verbose=100,
    use_best_model=True,
)

Learning rate set to 0.060467
0:	learn: 396.9747214	test: 396.1413661	best: 396.1413661 (0)	total: 18.8ms	remaining: 3m 8s
100:	learn: 373.3880405	test: 371.6725714	best: 371.6725714 (100)	total: 2.08s	remaining: 3m 23s
200:	learn: 371.8375030	test: 370.8461362	best: 370.8461362 (200)	total: 4.28s	remaining: 3m 28s
300:	learn: 370.7336571	test: 370.4351697	best: 370.4351697 (300)	total: 6.41s	remaining: 3m 26s
400:	learn: 369.4116785	test: 370.3041945	best: 370.2883540 (382)	total: 8.61s	remaining: 3m 26s
500:	learn: 368.5788430	test: 370.0731229	best: 370.0719837 (498)	total: 10.9s	remaining: 3m 26s
600:	learn: 367.8936891	test: 369.9159723	best: 369.9120487 (597)	total: 13.1s	remaining: 3m 25s
700:	learn: 367.1936606	test: 369.8415323	best: 369.8412718 (689)	total: 15.4s	remaining: 3m 23s
800:	learn: 366.5782916	test: 369.8235388	best: 369.8167157 (757)	total: 17.6s	remaining: 3m 21s
bestTest = 369.8167157
bestIteration = 757
Shrink model to first 758 iterations.


<catboost.core.CatBoostRegressor at 0x7efd2774b4f0>

In [55]:
y_sentence_t_pred = sentence_t_model.predict(X_sentence_t_test)

In [56]:
rmse = metrics.root_mean_squared_error(y_sentence_t_test, y_sentence_t_pred)
print(f"RMSE: {rmse}")
mae = metrics.mean_absolute_error(y_sentence_t_test, y_sentence_t_pred)
print(f"MAE: {mae}")
r2 = metrics.r2_score(y_sentence_t_test, y_sentence_t_pred)
print(f"R2: {r2}")

RMSE: 357.5547861954179
MAE: 215.54587416911997
R2: 0.1328962510479379


In [57]:
w2v_model = CatBoostRegressor(
    iterations=10000,
    loss_function="RMSE",
    cat_features=category_cols,
    task_type="GPU",
    early_stopping_rounds=100,
)

In [58]:
w2v_model.fit(
    X_w2v_train,
    y_w2v_train,
    eval_set=(X_w2v_val, y_w2v_val),
    verbose=100,
    use_best_model=True,
)

Learning rate set to 0.060467
0:	learn: 397.1169649	test: 396.3389010	best: 396.3389010 (0)	total: 17.6ms	remaining: 2m 55s
100:	learn: 374.0290784	test: 373.4730271	best: 373.4730271 (100)	total: 2.14s	remaining: 3m 29s
200:	learn: 372.5750372	test: 372.4730831	best: 372.4730831 (200)	total: 4.3s	remaining: 3m 29s
300:	learn: 371.5086421	test: 372.1461946	best: 372.1461946 (300)	total: 6.5s	remaining: 3m 29s
400:	learn: 370.4187901	test: 371.8171399	best: 371.8143864 (394)	total: 8.71s	remaining: 3m 28s
500:	learn: 369.7133947	test: 371.5847077	best: 371.5834111 (499)	total: 11.1s	remaining: 3m 30s
600:	learn: 369.0973807	test: 371.4715335	best: 371.4715335 (600)	total: 13.4s	remaining: 3m 30s
700:	learn: 368.2658296	test: 371.3800873	best: 371.3580006 (698)	total: 15.7s	remaining: 3m 28s
800:	learn: 367.4903479	test: 371.2965823	best: 371.2965823 (800)	total: 18.4s	remaining: 3m 31s
900:	learn: 367.0294837	test: 371.2400692	best: 371.2373114 (889)	total: 20.8s	remaining: 3m 29s
1000:

<catboost.core.CatBoostRegressor at 0x7efd1c466020>

In [59]:
y_w2v_pred = w2v_model.predict(X_w2v_test)

In [60]:
rmse = metrics.root_mean_squared_error(y_w2v_test, y_w2v_pred)
print(f"RMSE: {rmse}")
mae = metrics.mean_absolute_error(y_w2v_test, y_w2v_pred)
print(f"MAE: {mae}")
r2 = metrics.r2_score(y_w2v_test, y_w2v_pred)
print(f"R2: {r2}")

RMSE: 357.42932818623143
MAE: 214.4905104153771
R2: 0.13350463907217947


In [None]:
gu.plot_feature_importances(lda_model, "LDA")

In [None]:
gu.plot_feature_importances(sentence_t_model, "Sentence Transformer")

In [None]:
gu.plot_feature_importances(w2v_model, "Word2Vec")

In [None]:
gu.plot_regression_results(y_lda_test, y_lda_pred, "LDA")

In [None]:
gu.plot_regression_results(y_sentence_t_test, y_sentence_t_pred, "Sentence Transformer")

In [None]:
gu.plot_regression_results(y_w2v_test, y_w2v_pred, "Word2Vec")

In [None]:
gu.plot_residuals(y_lda_test, y_lda_pred, "LDA")

In [None]:
gu.plot_residuals(y_sentence_t_test, y_sentence_t_pred, "Sentence Transformer")

In [None]:
gu.plot_residuals(y_w2v_test, y_w2v_pred, "Word2Vec")