In [2]:
import pandas as pd
import numpy as np

# Load the raw assignment data set.
data = pd.read_csv('sph6004_assignment1_data.csv')

# Binarise the target: every non-zero `aki` value becomes 1, zero stays 0.
# A missing value also compares "not equal to 0", so it is mapped to 1 as well.
# NOTE(review): confirm that treating a missing label as positive is intended.
data['aki'] = data['aki'].ne(0).astype('int64')
data.head()

Unnamed: 0,id,aki,gender,admission_age,race,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,...,ggt_max,ld_ldh_min,ld_ldh_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,height,weight_admit
0,36570066,1,F,79.953141,BLACK/AFRICAN AMERICAN,96.0,104.0,100.083333,103.0,126.0,...,,236.0,318.0,15.0,6.0,5.0,4.0,0.0,157.0,110.0
1,39307659,0,F,78.194169,WHITE - RUSSIAN,72.0,134.0,97.263158,97.0,127.0,...,,,,15.0,6.0,5.0,4.0,0.0,,82.0
2,38743306,1,F,65.602396,WHITE,60.0,97.0,84.166667,95.0,143.0,...,,,,15.0,6.0,5.0,4.0,0.0,,62.1
3,32339865,1,F,64.906629,UNKNOWN,59.0,87.0,71.461538,113.0,150.0,...,,,,15.0,1.0,0.0,1.0,1.0,170.0,113.1
4,35526987,1,M,57.438861,WHITE,57.0,100.0,82.387097,81.0,127.0,...,,,,15.0,,0.0,1.0,1.0,178.0,97.4


In [3]:
# Remove columns that are mostly empty, then remove the 'id' and 'race' columns.
MISSING_THRESHOLD = 0.3  # drop a column when more than 30% of its values are missing

missing_values = data.isnull().mean()  # per-column fraction of missing entries
features_to_drop = missing_values[missing_values > MISSING_THRESHOLD].index

# Rebind `data` instead of mutating it in place, and tolerate columns that are
# already gone, so re-running this cell does not raise KeyError on 'id'/'race'.
data = data.drop(columns=features_to_drop)
data = data.drop(columns=['id', 'race'], errors='ignore')

# Report the remaining number of rows and columns.
num_samples, num_features = data.shape
print(num_samples)
print(num_features)

50920
63


In [4]:
from sklearn.impute import SimpleImputer

# Partition the columns by dtype so each group gets a suitable fill strategy.
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = data.select_dtypes(include=['object']).columns

# Median for numeric columns, most frequent value for string columns.
# NOTE(review): both imputers are fit on the full data set, i.e. before the
# train/test split done later in the notebook - consider fitting on the
# training rows only to avoid leaking test-set statistics.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

for imputer, columns in (
    (imputer_num, numerical_features),
    (imputer_cat, categorical_features),
):
    data[columns] = imputer.fit_transform(data[columns])

# Sanity check: the largest per-column missing fraction should now be 0.
data.isna().mean().max()

0.0

In [5]:
# One-hot encode every string column; each category gets its own indicator
# column (for example gender -> gender_F / gender_M).
data_encoded = pd.get_dummies(data=data, columns=categorical_features)
data_encoded.head()

Unnamed: 0,aki,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,dbp_max,...,ptt_min,ptt_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,weight_admit,gender_F,gender_M
0,1.0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,58.0,...,28.6,29.5,15.0,6.0,5.0,4.0,0.0,110.0,1,0
1,0.0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,89.0,...,28.4,150.0,15.0,6.0,5.0,4.0,0.0,82.0,1,0
2,1.0,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,99.0,...,26.0,26.0,15.0,6.0,5.0,4.0,0.0,62.1,1,0
3,1.0,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,94.0,...,26.1,26.1,15.0,1.0,0.0,1.0,1.0,113.1,1,0
4,1.0,57.438861,57.0,100.0,82.387097,81.0,127.0,97.672131,47.0,95.0,...,27.7,39.4,15.0,6.0,0.0,1.0,1.0,97.4,0,1


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except the target `aki`; this includes the one-hot
# indicator columns.
# NOTE(review): the scaler is fit on all rows, before the train/test split done
# later in the notebook - consider fitting on the training rows only.
scaler = StandardScaler()
features_to_scale = data_encoded.columns.drop('aki')

scaled_values = scaler.fit_transform(data_encoded[features_to_scale])
data_encoded[features_to_scale] = scaled_values  # overwrite with scaled values

# Feature-only frame (target excluded) used by the following cells.
scaled_data = data_encoded[features_to_scale]
scaled_data.head()

Unnamed: 0,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,dbp_max,dbp_mean,...,ptt_min,ptt_max,gcs_min,gcs_motor,gcs_verbal,gcs_eyes,gcs_unable,weight_admit,gender_F,gender_M
0,0.868796,1.72495,0.07373,1.006689,0.55618,-0.978644,-0.18108,-0.625309,-1.544616,-1.41921,...,-0.195533,-0.413638,0.488466,0.495929,0.766355,0.786547,-0.507323,0.795813,1.124778,-1.124778
1,0.766992,0.138692,1.539611,0.827221,0.213991,-0.934621,-0.566361,0.738435,0.032401,0.556796,...,-0.21329,3.924044,0.488466,0.495929,0.766355,0.786547,-0.507323,0.014652,1.124778,-1.124778
2,0.038221,-0.654437,-0.268309,-0.006201,0.099928,-0.230257,-0.424516,0.738435,0.541117,0.835085,...,-0.426376,-0.539629,0.488466,0.495929,0.766355,0.786547,-0.507323,-0.540531,1.124778,-1.124778
3,-0.002048,-0.720531,-0.756936,-0.814717,1.126496,0.077902,1.165145,1.079371,0.286759,1.445731,...,-0.417498,-0.536029,0.488466,-2.521553,-1.630505,-1.928713,1.971129,0.882299,1.124778,-1.124778
4,-0.434258,-0.852719,-0.121721,-0.119447,-0.698514,-0.934621,-1.309731,-0.028671,0.33763,-0.182005,...,-0.27544,-0.057264,0.488466,0.495929,-1.630505,-1.928713,1.971129,0.44429,-0.889065,0.889065


In [7]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Rank all features with a random forest and keep the highest-ranked ones.
N_TOP_FEATURES = 8  # how many of the most important features are kept

target = data_encoded['aki']

# Seed the forest: without a fixed random_state the importances - and possibly
# the selected top features - change on every run.
# NOTE(review): the forest is fit on all rows, including those later held out
# as the test set, so the feature selection has seen the evaluation data.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(scaled_data, target)
feature_importances = clf.feature_importances_
features = scaled_data.columns

# Pair each feature name with its importance and sort from most to least important.
importance_scores = list(zip(features, feature_importances))
importance_scores_sorted = sorted(importance_scores, key=lambda x: x[1], reverse=True)

print("Feature importance score：")
for feature, importance in importance_scores_sorted:
    print(f"{feature}: {importance}")

top_8_features = importance_scores_sorted[:N_TOP_FEATURES]
top_8_features_names = [name for name, _ in top_8_features]

selected_features_df = scaled_data[top_8_features_names]

# "\n" replaces the invalid escape "\S", which printed a literal backslash.
print("\nSelected Features：")
print(selected_features_df.columns)

Feature importance score：
weight_admit: 0.038453090255683046
admission_age: 0.0324514308201197
bun_min: 0.02859633124151237
bun_max: 0.027733723975923694
sbp_min: 0.02373885782585221
ptt_max: 0.02269032934993728
glucose_max: 0.02202973000977756
pt_max: 0.0220098996758804
sbp_max: 0.019322773215034856
glucose_mean: 0.019223492108668354
resp_rate_mean: 0.018963762151394104
dbp_mean: 0.018853548794150184
spo2_mean: 0.018825013144183958
glucose_max.2: 0.018533432450016582
dbp_min: 0.018509492858864232
mbp_min: 0.018254156566167405
wbc_max: 0.01804890750247864
potassium_max.1: 0.017823165973454166
sbp_mean: 0.01780951330207264
temperature_mean: 0.017701207979225492
heart_rate_mean: 0.017621626015998933
temperature_max: 0.017584430014828383
mbp_mean: 0.017538323941712435
mbp_max: 0.01742867808238563
ptt_min: 0.01710322979826766
platelets_min: 0.01694943056393147
heart_rate_max: 0.01680280011679423
glucose_min: 0.01671495022147774
hemoglobin_min.1: 0.016701765557717962
hematocrit_min.1: 0.016

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features_df,
    target,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted trees trained on the selected features only.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1 on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7142
              precision    recall  f1-score   support

         0.0       0.62      0.36      0.46      3400
         1.0       0.74      0.89      0.81      6784

    accuracy                           0.71     10184
   macro avg       0.68      0.63      0.63     10184
weighted avg       0.70      0.71      0.69     10184

