In [110]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import graph_utils

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [111]:
mistral_full = pd.read_csv("../processed_data/mistral_full_data.csv")

edstays = pd.read_csv("../raw_data/edstay_encounters.csv")


Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.



In [112]:
mistral_full["los"] = edstays["los"]

In [113]:
mistral_full.shape

(425087, 14)

In [114]:
# drop rows where llm_info is null
mistral_full = mistral_full.dropna(subset=["llm_info"])

In [115]:
# keep 50000 rows for now
mistral_full = mistral_full[:50000]

# shuffle the data
# mistral_full = mistral_full.sample(frac=1).reset_index(drop=True)

In [116]:
train_columns = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
    "llm_info",
]
feature_column = "los"

In [117]:
mistral_train = mistral_full[train_columns + [feature_column]]

In [118]:
category_cols = ["pain", "acuity", "gender", "arrival_transport"]

# cast category columns to string
mistral_train[category_cols] = mistral_train[category_cols].apply(
    lambda col: col.astype(str)
)

# replace nan in category columns with 'missing'
mistral_train[category_cols] = mistral_train[category_cols].fillna("missing")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [119]:
X_mistral = mistral_train.drop(columns=[feature_column])
y_mistral = mistral_train[feature_column]

In [120]:
X_mistral_train, X_mistral_test, y_mistral_train, y_mistral_test = train_test_split(
    X_mistral, y_mistral, test_size=0.2, random_state=42
)

X_mistral_test, X_mistral_val, y_mistral_test, y_mistral_val = train_test_split(
    X_mistral_test, y_mistral_test, test_size=0.5, random_state=42
)

In [121]:
# X_mistral_test.to_csv('../predicted_data/Xtest.csv', index=False)

In [122]:
res_dict = {"rmse": [], "mae": [], "r2": []}

In [123]:
# train and get predictions 3 times
for i in range(3):
    model = CatBoostRegressor(
        iterations=10000,
        loss_function="RMSE",
        verbose=100,
        early_stopping_rounds=100,
        cat_features=category_cols,
        text_features=["llm_info"],
        task_type="GPU",
    )

    model.fit(X_mistral_train, y_mistral_train, eval_set=(X_mistral_val, y_mistral_val))

    y_pred = model.predict(X_mistral_test)

    mistral_test = X_mistral_test.copy()
    mistral_test["los"] = y_mistral_test
    mistral_test["pred"] = y_pred
    mistral_test.to_csv(f"../predicted_data/mistralpreds{i}.csv", index=False)

    rmse = np.sqrt(metrics.mean_squared_error(y_mistral_test, y_pred))
    mae = metrics.mean_absolute_error(y_mistral_test, y_pred)
    r2 = metrics.r2_score(y_mistral_test, y_pred)

    res_dict["rmse"].append(rmse)
    res_dict["mae"].append(mae)
    res_dict["r2"].append(r2)


# print average results
print("Average RMSE: ", np.mean(res_dict["rmse"]))
print("Average MAE: ", np.mean(res_dict["mae"]))
print("Average R2: ", np.mean(res_dict["r2"]))

Learning rate set to 0.047988
0:	learn: 397.3498448	test: 381.6673819	best: 381.6673819 (0)	total: 41.3ms	remaining: 6m 53s
100:	learn: 371.9412943	test: 362.5027669	best: 362.5027669 (100)	total: 3.34s	remaining: 5m 27s
200:	learn: 364.8887611	test: 361.5632725	best: 361.5632725 (200)	total: 6.32s	remaining: 5m 7s
300:	learn: 357.7786578	test: 360.9602128	best: 360.8873686 (277)	total: 9.34s	remaining: 5m
400:	learn: 352.1447339	test: 360.8907558	best: 360.8301007 (379)	total: 12.2s	remaining: 4m 51s
500:	learn: 346.8183340	test: 360.6008852	best: 360.5895262 (499)	total: 15s	remaining: 4m 44s
600:	learn: 342.5603363	test: 360.3045467	best: 360.2515388 (565)	total: 17.8s	remaining: 4m 38s
bestTest = 360.2515388
bestIteration = 565
Shrink model to first 566 iterations.
Learning rate set to 0.047988
0:	learn: 397.3498448	test: 381.6673819	best: 381.6673819 (0)	total: 40.5ms	remaining: 6m 44s
100:	learn: 371.9412943	test: 362.5027669	best: 362.5027669 (100)	total: 3.3s	remaining: 5m 23s


In [None]:
graph_utils.plot_feature_importances(model, "Mistral Generate")

In [None]:
graph_utils.plot_regression_results(y_mistral_test, y_pred, "Mistral")

In [None]:
graph_utils.plot_residuals(y_mistral_test, y_pred, "Mistral")