In [64]:
# %pip install scikit-learn  # uncomment and run once if scikit-learn is not installed

In [65]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [66]:
# Read the training features, test features and training labels, in that order.
train_X, test_X, train_y = map(
    pd.read_csv,
    ('./train_X.csv', './test_X.csv', './train_y.csv'),
)

### ------------------------------------- Preprocess the data -------------------------------------

In [67]:
# Split the columns by dtype: float64/int64 are treated as numeric,
# object columns as categorical.
numeric_cols = train_X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_X.select_dtypes(include=['object']).columns

# Numeric gaps are filled with the column median, categorical gaps with the
# most frequent value.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer is fitted on the training frame only and then re-used on the
# test frame, so the fill values are learned from the training data alone.
for imputer, cols in ((num_imputer, numeric_cols), (cat_imputer, categorical_cols)):
    train_X[cols] = imputer.fit_transform(train_X[cols])
    test_X[cols] = imputer.transform(test_X[cols])

In [68]:
# Inspect the training frame after imputation; pandas prints a truncated
# view (first and last rows, columns wrapped across several blocks).
print(train_X)

       encounter_id  patient_id  hospital_id   age        bmi  \
0          126956.0    125763.0         26.0  75.0  23.147277   
1           18184.0     25399.0         54.0  42.0  35.071807   
2           51597.0      7974.0         81.0  39.0  27.680158   
3           40078.0     79625.0        161.0  62.0  42.070672   
4          130673.0     88261.0         29.0  82.0  27.680158   
...             ...         ...          ...   ...        ...   
44934       88819.0     81966.0         64.0  67.0  20.622289   
44935       45825.0      3470.0        161.0  63.0  35.054523   
44936       92992.0    109497.0        133.0  73.0  24.920113   
44937       29502.0     72376.0         54.0  69.0  27.202393   
44938       66936.0    130251.0        161.0  87.0  32.158934   

       elective_surgery         ethnicity gender  height  \
0                   0.0             Asian      M   163.0   
1                   1.0         Caucasian      F   157.5   
2                   0.0  African Americ

In [69]:
# One-hot encode the categorical columns.
#
# The category levels are learned from the training frame and applied to both
# frames. Encoding each frame independently with drop_first=True drops each
# frame's *own* first level: whenever the test set lacks the training set's
# first level, a different dummy is dropped and the aligned test columns
# silently mean something else. Sharing one categorical dtype guarantees that
# both frames produce the same dummy columns and drop the same baseline level.
category_dtypes = {
    col: train_X[col].astype('category').dtype
    for col in categorical_cols
}
train_X_encoded = pd.get_dummies(
    train_X.astype(category_dtypes), columns=categorical_cols, drop_first=True
)
# Levels that occur only in the test set are not part of the training dtype;
# they become NaN in the cast and therefore encode as all-zero dummies.
test_X_encoded = pd.get_dummies(
    test_X.astype(category_dtypes), columns=categorical_cols, drop_first=True
)

# Safety net: force the test frame onto the training columns (any column the
# test frame still lacks is filled with 0, any extra test column is dropped).
train_X_encoded, test_X_encoded = train_X_encoded.align(test_X_encoded, join='left', axis=1, fill_value=0)

In [70]:
# Inspect the one-hot encoded training frame (pandas prints a truncated view).
# For a shorter preview, print train_X_encoded.head() / test_X_encoded.head() instead.
print(train_X_encoded)

       encounter_id  patient_id  hospital_id   age        bmi  \
0          126956.0    125763.0         26.0  75.0  23.147277   
1           18184.0     25399.0         54.0  42.0  35.071807   
2           51597.0      7974.0         81.0  39.0  27.680158   
3           40078.0     79625.0        161.0  62.0  42.070672   
4          130673.0     88261.0         29.0  82.0  27.680158   
...             ...         ...          ...   ...        ...   
44934       88819.0     81966.0         64.0  67.0  20.622289   
44935       45825.0      3470.0        161.0  63.0  35.054523   
44936       92992.0    109497.0        133.0  73.0  24.920113   
44937       29502.0     72376.0         54.0  69.0  27.202393   
44938       66936.0    130251.0        161.0  87.0  32.158934   

       elective_surgery  height  icu_id  pre_icu_los_days  weight  ...  \
0                   0.0   163.0   550.0          0.128472    61.5  ...   
1                   1.0   157.5   479.0          3.150694    87.0  ... 

### ------------------------------------- Train the model -------------------------------------

In [71]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold

In [72]:
# Reduce the label frame to the 1-D 'has_died' target.
# The isinstance guard keeps this cell safe to re-run: after the first run
# train_y is already a Series, which has no `.columns` attribute, so the
# unguarded membership test would raise AttributeError.
if isinstance(train_y, pd.DataFrame) and 'has_died' in train_y.columns:
    train_y = train_y['has_died']

In [None]:
# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Random Forest classifier evaluated in cross-validation and re-used for the final fit.
rf_model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)

# Scorers are given as scikit-learn's built-in scorer names.
# make_scorer(roc_auc_score, needs_proba=True) relies on an argument that was
# deprecated in scikit-learn 1.4 and is no longer accepted by newer releases;
# 'roc_auc' scores on predicted probabilities and 'f1_macro' is the
# macro-averaged F1, so both metrics are the same as before.
roc_auc_scorer = 'roc_auc'
f1_scorer = 'f1_macro'

# Evaluate both metrics in a single pass. Two separate cross_val_score calls
# would fit every fold's forest twice; the folds and seeds are unchanged here,
# so the per-fold scores are the same at half the training cost.
cv_results = cross_validate(
    rf_model,
    train_X_encoded,
    train_y,
    cv=kf,
    scoring={'roc_auc': roc_auc_scorer, 'f1_macro': f1_scorer},
)
roc_auc_scores = cv_results['test_roc_auc']
f1_scores = cv_results['test_f1_macro']

# Mean score over the 5 folds.
roc_auc_avg = np.mean(roc_auc_scores)
f1_avg = np.mean(f1_scores)





In [74]:
# Report the fold-averaged metrics, one line per metric.
for label, value in (
    ("Average ROC AUC Score (5-fold CV):", roc_auc_avg),
    ("Average F1 Score (5-fold CV):", f1_avg),
):
    print(label, value)

Average ROC AUC Score (5-fold CV): 0.8762270483756132
Average F1 Score (5-fold CV): 0.6633549221878716


In [75]:
# Final fit on the complete training set, then hard class predictions for the
# test set (fit() returns the fitted estimator, so the calls can be chained).
fitted_model = rf_model.fit(train_X_encoded, train_y)
test_predictions = fitted_model.predict(test_X_encoded)

In [76]:
# Assemble the submission table (identifier + predicted label) and save it as CSV.
# NOTE(review): the identifier written out is 'patient_id', while the earlier
# note on this cell referred to 'encounter_id' - confirm which key the
# submission format expects.
submission = (
    test_X[['patient_id']]
    .astype(int)  # cast the identifier back to int for the output file
    .assign(has_died=test_predictions)
)
submission.to_csv('./testing_result.csv', index=False)

print("Predictions saved to testing_result.csv")

Predictions saved to testing_result.csv
