In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

In [15]:
df1 = pd.read_csv("/Users/talalkhan/Documents/Data Sets/train.csv")
df2 = pd.read_csv("/Users/talalkhan/Documents/Data Sets/test.csv")


In [16]:
#do onehot encoding for categorical columns
df1 = pd.get_dummies(df1)
df2 = pd.get_dummies(df2)



In [4]:
# List of categorical columns to label encode
categorical_columns = ['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem']

# Initialize a LabelEncoder for each categorical column
label_encoders = {}
#label_encoders2 = {}

for column in categorical_columns:
    le = LabelEncoder()
    df1[column] = le.fit_transform(df1[column])
    label_encoders[column] = le

for column in categorical_columns:
    le2 = LabelEncoder()
    df2[column] = le2.fit_transform(df2[column])
    label_encoders[column] = le2

In [None]:
#drop colums with low correlation RecordID,encounter_id,patient_id,hospital_id,icu_id
df1 = df1.drop(['ventilated_apache','apache_4a_hospital_death_prob','icu_stay_type_readmit', 'apache_3j_bodysystem_Gynecological', 'apache_2_bodysystem_Undefined Diagnoses'], axis=1)
df2 = df2.drop(['ventilated_apache', 'apache_4a_hospital_death_prob','icu_stay_type_readmit', 'apache_3j_bodysystem_Gynecological', 'apache_2_bodysystem_Undefined Diagnoses'], axis=1)


In [17]:
#using simpleimputer to handle nan values
#df1
imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(df1.values)
imputed_data1 = imr.transform(df1.values)
#df2
imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(df2.values)
imputed_data2 = imr.transform(df2.values)

#using KNNImputer to handle nan values
#df1
'''
imr = KNNImputer(n_neighbors=2500, weights='uniform')
imr = imr.fit(df1.values)
imputed_data1 = imr.transform(df1.values)
#df2
imr = KNNImputer(n_neighbors=2500, weights='uniform')
imr = imr.fit(df2.values)
imputed_data2 = imr.transform(df2.values)
'''
# convert the imputed NumPy array back into a Pandas DataFrame
df1 = pd.DataFrame(imputed_data1, columns=df1.columns)
df2 = pd.DataFrame(imputed_data2, columns=df2.columns)


In [5]:
#MinMaxscaling 
scaler = MinMaxScaler()
cols_to_scale = df1.columns[df1.columns != 'RecordID']
temp = df1.loc[:, ['RecordID']]
df1 = pd.DataFrame(scaler.fit_transform(df1[cols_to_scale]), columns=scaler.get_feature_names_out())
df1 = pd.concat([temp, df1], axis=1, join='inner')

cols_to_scale = df2.columns[df2.columns != 'RecordID']
temp = df2.loc[:, ['RecordID']]
df2 = pd.DataFrame(scaler.fit_transform(df2[cols_to_scale]), columns=scaler.get_feature_names_out())
df2 = pd.concat([temp, df2], axis=1, join='inner')

In [18]:
#Rohbust scaling 
scaler = RobustScaler()
cols_to_scale = df1.columns[df1.columns != 'RecordID']
temp = df1.loc[:, ['RecordID']]
df1 = pd.DataFrame(scaler.fit_transform(df1[cols_to_scale]), columns=scaler.get_feature_names_out())
df1 = pd.concat([temp, df1], axis=1, join='inner')

cols_to_scale = df2.columns[df2.columns != 'RecordID']
temp = df2.loc[:, ['RecordID']]
df2 = pd.DataFrame(scaler.fit_transform(df2[cols_to_scale]), columns=scaler.get_feature_names_out())
df2 = pd.concat([temp, df2], axis=1, join='inner')

In [19]:
X = df1.loc[:, df1.columns != 'hospital_death']
y = df1['hospital_death']

#split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #random_state=42)

#create decision tree classifier
clf = RandomForestClassifier(
    n_estimators=1000,  # Increase the number of trees
    max_depth=30,       # Limit tree depth to control overfitting
    min_samples_split=10,  # Minimum samples required to split
    min_samples_leaf=2,    # Minimum samples required at a leaf node
    max_features='sqrt',   # Randomly select a subset of features
    random_state=42,
    n_jobs=-1
)
#clf.fit(X_train, y_train)

In [None]:
'''param_grid = {
    'n_estimators': [1000, 1500,2000],
    'max_depth': [20, 30, 50],
    'min_samples_split': [5, 7, 9],
    'min_samples_leaf': [250,500,600],
    'max_features': ['auto', 'sqrt', 'log2'],
    'n_jobs':[-1]
}

# Create a GridSearchCV object with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)'''

In [20]:
#adjust hyperparameters
#clf.set_params(max_depth=9, min_samples_split=7, min_samples_leaf=600)

#train model
clf.fit(X_train, y_train)

In [21]:
#make predictions
y_pred = clf.predict(X_test)
md_pred = clf.predict_proba(df2)

In [22]:
pred = md_pred[:,1]
#print mdpred up to 6 decimal places
print(md_pred[:,0].round(6))
#calculate accuracy
score = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % score)

print(classification_report(y_test, y_pred))

[0.989388 0.423981 0.896323 ... 0.981297 0.982738 0.982834]
Accuracy: 0.923
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      9099
         1.0       0.73      0.23      0.35       901

    accuracy                           0.92     10000
   macro avg       0.83      0.61      0.66     10000
weighted avg       0.91      0.92      0.90     10000



In [13]:
# Create a DataFrame for the results with RecordID and predicted probability of death
results_df = pd.DataFrame({'RecordID': df2['RecordID'] , 'ProbaDeath': pred})

# Save the results to a CSV file
results_df.to_csv('submission81_25253.csv', index=False)
