In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

os.makedirs("../models", exist_ok=True)

In [5]:
# import data
df = pd.read_csv("../data/cleaned_imputed_labeled.csv")
df.drop(columns=["SepsisLabel", "ICULOS"], inplace=True)


# count NaNs
nan_counts = df.isna().sum().sum()
print("Number of NaNs in the DataFrame:", nan_counts)

df

Number of NaNs in the DataFrame: 0


Unnamed: 0,PatientID,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,PTT,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,NewLabel
0,p000001,97.0,95.0,36.11,98.0,75.33,63.830556,19.0,32.957657,24.000000,...,41.231193,5.7,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,0.0
1,p000001,97.0,95.0,36.11,98.0,75.33,63.830556,19.0,32.957657,24.000000,...,41.231193,5.7,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,0.0
2,p000001,89.0,99.0,36.11,122.0,86.00,63.830556,22.0,32.957657,24.000000,...,41.231193,5.7,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,0.0
3,p000001,90.0,95.0,36.11,122.0,86.00,63.830556,30.0,32.957657,24.000000,...,41.231193,5.7,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,0.0
4,p000001,103.0,88.5,36.11,122.0,91.33,63.830556,24.5,32.957657,24.000000,...,41.231193,5.7,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,p120000,80.0,96.0,36.40,115.0,87.00,65.000000,15.0,32.957657,-0.689919,...,29.100000,5.4,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,0.0
1552206,p120000,74.0,97.0,36.40,114.0,83.00,67.000000,15.0,32.957657,-0.689919,...,29.100000,5.4,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,0.0
1552207,p120000,78.0,98.0,36.40,110.0,83.00,69.000000,15.0,32.957657,-0.689919,...,29.100000,5.4,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,0.0
1552208,p120000,82.0,99.0,36.60,124.0,91.00,71.000000,16.0,32.957657,-0.689919,...,29.100000,5.4,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,0.0


In [33]:
# group patients by "PatientID" and take 2000 random patients
patient_ids = df["PatientID"].unique()
random_patient_ids = np.random.choice(patient_ids, size=4000, replace=False)

# filter the original dataframe
df_small = df[df["PatientID"].isin(random_patient_ids)]

# do gssplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
y = df_small["NewLabel"]
X = df_small.drop(columns=["NewLabel", "Unit1", "Unit2", "HospAdmTime"])
train_idx, test_idx = next(gss.split(X, y, groups=df_small["PatientID"]))
x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# balance out NewLabel in all datasets
# reduce number of 0s to be equal to number of 1s
num_ones = y_train[y_train == 1].shape[0]

x_train = x_train.drop(columns=["PatientID"])
x_test = x_test.drop(columns=["PatientID"])

# make x_test, y_test have equal number of 0s and 1s
x_test_0 = x_test[y_test == 0].sample(n=num_ones, random_state=42)
y_test_0 = y_test[y_test == 0].sample(n=num_ones, random_state=42)
x_test_1 = x_test[y_test == 1]
y_test_1 = y_test[y_test == 1]
x_test = pd.concat([x_test_0, x_test_1])
y_test = pd.concat([y_test_0, y_test_1])

# print column names in X
print(X.columns)

# random forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# out of all NewLabel 1 in y_test, what percent are in the top 20% of y_pred
threshold = np.percentile(y_pred, 80)
y_test_1 = y_test[y_test == 1]
y_pred_1 = y_pred[y_test == 1]
y_pred_1_top20 = y_pred_1[y_pred_1 >= threshold]
print("Percent of NewLabel 1 in top 20% of y_pred:", len(y_pred_1_top20) / len(y_test_1))


Index(['PatientID', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp',
       'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST',
       'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
       'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
       'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender'],
      dtype='object')
MAE: 0.25517895826787856
MSE: 0.21026500876301765
R2: -0.14801887008792414
Percent of NewLabel 1 in top 20% of y_pred: 0.3105022831050228


In [34]:
# get type 1 error rate
y_test_binary = (y_test >= 0.5).astype(int)
percentile_50 = np.percentile(y_pred, 50)
y_pred_binary = (y_pred >= percentile_50).astype(int)
false_positives = np.sum((y_test_binary == 0) & (y_pred_binary == 1))
true_negatives = np.sum(y_test_binary == 0)
type_1_error_rate = false_positives / (false_positives + true_negatives)
print("Type 1 Error Rate:", type_1_error_rate) 

# get type 2 error rate
false_negatives = np.sum((y_test_binary == 1) & (y_pred_binary == 0))
true_positives = np.sum(y_test_binary == 1)
type_2_error_rate = false_negatives / (false_negatives + true_positives)
print("Type 2 Error Rate:", type_2_error_rate)

Type 1 Error Rate: 0.2955631399317406
Type 2 Error Rate: 0.19583843329253367


In [None]:
# get one decision tree and visualize it
estimator = rf.estimators_[0]
from sklearn.tree import export_graphviz
import graphviz
dot_data = export_graphviz(estimator, out_file=None, 
                           feature_names=X.columns,  
                           filled=True, rounded=True,  
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("../models/random_forest_tree")


'../models/random_forest_tree.pdf'

In [37]:
# try again with balanced data
df_balanced = pd.read_csv('../data/balanced_data.csv')
df_balanced.drop(columns=["SepsisLabel", "ICULOS"], inplace=True)


# count NaNs
nan_counts = df_balanced.isna().sum().sum()
print("Number of NaNs in the DataFrame:", nan_counts)

# group patients by "PatientID" and gssplit
patient_ids = df_balanced["PatientID"].unique()
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
y = df_balanced["NewLabel"]
X = df_balanced.drop(columns=["NewLabel", "Unit1", "Unit2", "HospAdmTime"])
train_idx, test_idx = next(gss.split(X, y, groups=df_balanced["PatientID"]))
x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

x_train = x_train.drop(columns=["PatientID"])
x_test = x_test.drop(columns=["PatientID"])

# print column names in X
print(X.columns)

# random forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

Number of NaNs in the DataFrame: 0
Index(['PatientID', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp',
       'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST',
       'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
       'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
       'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender'],
      dtype='object')


KeyboardInterrupt: 

In [None]:
# get type 1 error rate
y_test_binary = (y_test >= 0.5).astype(int)
percentile_50 = np.percentile(y_pred, 50)
y_pred_binary = (y_pred >= percentile_50).astype(int)
false_positives = np.sum((y_test_binary == 0) & (y_pred_binary == 1))
true_negatives = np.sum(y_test_binary == 0)
type_1_error_rate = false_positives / (false_positives + true_negatives)
print("Type 1 Error Rate:", type_1_error_rate)

# get type 2 error rate
false_negatives = np.sum((y_test_binary == 1) & (y_pred_binary == 0))
true_positives = np.sum(y_test_binary == 1)
type_2_error_rate = false_negatives / (false_negatives + true_positives)
print("Type 2 Error Rate:", type_2_error_rate)

Type 1 Error Rate: 0.298728323699422
Type 2 Error Rate: 0.27409599794106293


In [None]:
TARGET = "NewLabel"
y = df[TARGET].astype(float)
X = df.drop(columns=[TARGET])

# Keep numeric features only
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
X = X[numeric_cols]

# Groups (per-patient) to avoid leakage
groups = df["PatientID"].to_numpy()
df.drop(columns=["PatientID"], inplace=True)

# --- Train/test split by patient ---
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train = groups[train_idx]

# --- Pipeline ---
pipe = Pipeline([
    ("rf", RandomForestRegressor(random_state=42, n_jobs=-1))
])

# --- Group K-fold CV for grid search ---
n_unique = np.unique(groups_train).size
n_splits = min(5, max(2, n_unique))  # at least 2, up to 5
cv = GroupKFold(n_splits=n_splits)

# --- Parameter grid (sane sizes; expand if you have more compute) ---
param_grid = {
    "rf__n_estimators": [200, 400, 800],
    "rf__max_depth": [None, 12, 24, 36],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["sqrt", 0.5, None],  # sqrt, half, or all features
    "rf__bootstrap": [True],
}

# Multiple metrics; refit by RMSE (lower is better → maximize its negative)
scoring = {
    "rmse": "neg_root_mean_squared_error",
    "mae": "neg_mean_absolute_error",
    "r2": "r2",
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="rmse",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

# --- Run grid search (group-aware) ---
grid.fit(X_train, y_train, groups=groups_train)

print(f"\nBest params ({n_splits}-fold CV):\n{grid.best_params_}")
print(f"Best CV RMSE: {-grid.best_score_:.4f}")

# --- Evaluate best model on the held-out test set ---
best_pipe = grid.best_estimator_
y_pred = best_pipe.predict(X_test)
test_mae  = mean_absolute_error(y_test, y_pred)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_r2   = r2_score(y_test, y_pred)

print("\nHeld-out Test Metrics:")
print(f"MAE:  {test_mae:.4f}")
print(f"RMSE: {test_rmse:.4f}")
print(f"R2:   {test_r2:.4f}")

# --- Feature importances (map back to column names) ---
rf = best_pipe.named_steps["rf"]
importances = pd.Series(rf.feature_importances_, index=numeric_cols).sort_values(ascending=False)
print("\nTop 20 features:\n", importances.head(20))

# --- Save artifacts ---
results_df = pd.DataFrame(grid.cv_results_)
results_df.to_csv("../models/rf_gridsearch_results.csv", index=False)
importances.to_csv("../models/rf_best_feature_importances.csv", header=["importance"])
joblib.dump(best_pipe, "../models/rf_best_grid.pkl")

# --- (Optional) Save predictions for analysis ---
df_out = df.copy()
df_out["RF_Grid_Pred"] = best_pipe.predict(X)
df_out.to_csv("../data/cleaned_imputed_labeled_with_grid_preds.csv")

print("\nArtifacts written:")
print("  Best model:           ../models/rf_best_grid.pkl")
print("  CV results:           ../models/rf_gridsearch_results.csv")
print("  Feature importances:  ../models/rf_best_feature_importances.csv")
print("  Predictions:          ../data/cleaned_imputed_labeled_with_grid_preds.csv")

KeyboardInterrupt: 