In [75]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import graph_utils

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [76]:
llama_full = pd.read_csv("../processed_data/llama31_full.csv")

edstays = pd.read_csv("../raw_data/edstay_encounters.csv")


Columns (23) have mixed types. Specify dtype option on import or set low_memory=False.



In [77]:
llama_full["los"] = edstays["los"]

In [78]:
llama_full = llama_full.dropna(subset=["llm_info"])
llama_full.shape

(49791, 24)

In [79]:
llama_full.columns

Index(['subject_id', 'stay_id', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'chiefcomplaint', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'gender', 'age_on_adm', 'stay_id-2',
       'arrival_transport', 'disposition', 'gender-2', 'intime', 'outtime',
       'los', 'llm_info'],
      dtype='object')

In [80]:
train_columns = [
    "temperature",
    "heartrate",
    "o2sat",
    "sbp",
    "dbp",
    "resprate",
    "pain",
    "acuity",
    "age_on_adm",
    "gender",
    "arrival_transport",
    "llm_info",
]
feature_column = "los"

In [81]:
llama_train = llama_full[train_columns + [feature_column]]

In [82]:
category_cols = ["pain", "acuity", "gender", "arrival_transport"]

# cast category columns to string
llama_train[category_cols] = llama_train[category_cols].apply(
    lambda col: col.astype(str)
)

# replace nan in category columns with 'missing'
llama_train[category_cols] = llama_train[category_cols].fillna("missing")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [83]:
X_llama = llama_train.drop(columns=[feature_column])
y_llama = llama_train[feature_column]

In [84]:
X_llama_train, X_llama_test, y_llama_train, y_llama_test = train_test_split(
    X_llama, y_llama, test_size=0.2, random_state=42
)
X_llama_test, X_llama_val, y_llama_test, y_llama_val = train_test_split(
    X_llama_test, y_llama_test, test_size=0.5, random_state=42
)

In [85]:
res_dict = {"rmse": [], "mae": [], "r2": []}

In [86]:
for i in range(3):
    model = CatBoostRegressor(
        iterations=10000,
        loss_function="RMSE",
        verbose=100,
        early_stopping_rounds=100,
        cat_features=category_cols,
        text_features=["llm_info"],
        task_type="GPU",
    )

    model.fit(X_llama_train, y_llama_train, eval_set=(X_llama_val, y_llama_val))

    y_pred = model.predict(X_llama_test)

    llama_test = X_llama_test.copy()
    llama_test["los"] = y_llama_test
    llama_test["pred_los"] = y_pred
    llama_test.to_csv(f"../predicted_data/llama31_preds_{i}.csv", index=False)

    rmse = np.sqrt(metrics.mean_squared_error(y_llama_test, y_pred))
    mae = metrics.mean_absolute_error(y_llama_test, y_pred)
    r2 = metrics.r2_score(y_llama_test, y_pred)

    res_dict["rmse"].append(rmse)
    res_dict["mae"].append(mae)
    res_dict["r2"].append(r2)

print("Average RMSE:", np.mean(res_dict["rmse"]))
print("Average MAE:", np.mean(res_dict["mae"]))
print("Average R2:", np.mean(res_dict["r2"]))

Learning rate set to 0.047966
0:	learn: 392.6785754	test: 422.7721556	best: 422.7721556 (0)	total: 39.3ms	remaining: 6m 32s
100:	learn: 371.9822951	test: 408.1025464	best: 408.1025464 (100)	total: 3.54s	remaining: 5m 47s
200:	learn: 367.2086289	test: 407.2719546	best: 407.2719546 (200)	total: 6.68s	remaining: 5m 25s
300:	learn: 362.2388972	test: 406.8708369	best: 406.8628456 (299)	total: 9.6s	remaining: 5m 9s
400:	learn: 357.8784286	test: 406.8173899	best: 406.8173899 (400)	total: 12.5s	remaining: 4m 59s
500:	learn: 353.5405497	test: 406.6263513	best: 406.6060446 (488)	total: 15.4s	remaining: 4m 52s
600:	learn: 349.5741971	test: 406.6846108	best: 406.5962307 (533)	total: 18.4s	remaining: 4m 47s
bestTest = 406.5962307
bestIteration = 533
Shrink model to first 534 iterations.
Learning rate set to 0.047966
0:	learn: 392.6785754	test: 422.7721556	best: 422.7721556 (0)	total: 41.3ms	remaining: 6m 53s
100:	learn: 371.9822951	test: 408.1025464	best: 408.1025464 (100)	total: 3.54s	remaining: 5

In [None]:
graph_utils.plot_feature_importances(model, "Llama-3.1 Generate Text")