In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SequentialFeatureSelector





In [14]:
# Read the raw training table (df1) and the test table (df2) from local CSV files.
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs
# where these exact files exist.
train_csv = '/Users/talalkhan/Documents/Data Sets/train.csv'
test_csv = '/Users/talalkhan/Documents/Data Sets/test.csv'
df1, df2 = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [15]:
# One-hot encode the categorical columns in both frames.
categorical_cols = [
    'ethnicity',
    'gender',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_2_bodysystem',
]
df1 = pd.get_dummies(df1, columns=categorical_cols)
df2 = pd.get_dummies(df2, columns=categorical_cols)

# get_dummies only creates indicator columns for categories that occur in the frame
# it is given, so encoding the two files separately can leave them with different
# columns (or a different column order). Re-align the test frame to the training
# columns: indicators missing from the test file become all-zero columns, columns
# the training frame does not have are dropped. The label is left out because it is
# not a model input.
# NOTE(review): fill_value=0 is only meaningful for indicator columns; confirm the
# two files share every non-categorical column.
target_col = 'hospital_death'
df2 = df2.reindex(
    columns=[col for col in df1.columns if col != target_col],
    fill_value=0,
)


In [16]:
def impute_missing(frame, passthrough=('RecordID', 'hospital_death'), n_neighbors=2500):
    """Return ``frame`` with missing feature values filled in by k-NN imputation.

    Columns named in ``passthrough`` are neither imputed nor used to measure the
    distance between rows: the record identifier carries no information about a
    patient, and letting the label take part in the neighbour search would leak
    the target into the features. They are copied to the result unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Fully numeric frame (categoricals already one-hot encoded).
    passthrough : iterable of str
        Column names to keep as-is; names absent from ``frame`` are ignored.
    n_neighbors : int
        Number of neighbours averaged (uniform weights) for each missing value.

    Returns
    -------
    pandas.DataFrame
        Same columns, column order and index as ``frame``.
    """
    kept_cols = [col for col in frame.columns if col in passthrough]
    feature_cols = [col for col in frame.columns if col not in passthrough]

    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    filled = pd.DataFrame(
        imputer.fit_transform(frame[feature_cols].to_numpy(dtype=float)),
        columns=feature_cols,
        index=frame.index,
    )
    # Put the untouched columns back and restore the original column order.
    return pd.concat([frame[kept_cols], filled], axis=1)[list(frame.columns)]


# Fill NaN values in both frames; each frame is imputed from its own rows.
# NOTE(review): assumes the passthrough columns themselves contain no missing values.
# (A mean-based SimpleImputer was tried as an alternative and left disabled.)
df1 = impute_missing(df1)
df2 = impute_missing(df2)

In [17]:
# Min-max scale the model features.
#
# The scaler is fitted on the training frame only and the same fitted transform is
# then applied to the test frame. Re-fitting it on the test frame would map the two
# frames with different min/max values, so identical raw measurements would end up
# at different scaled positions. Test values outside the training range can
# therefore fall outside [0, 1]; that is expected.
# The record identifier and the label are carried over unscaled.
id_col = 'RecordID'
target_col = 'hospital_death'

feature_cols = [col for col in df1.columns if col not in (id_col, target_col)]
kept_cols = [col for col in (id_col, target_col) if col in df1.columns]

scaler = MinMaxScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(df1[feature_cols]),
    columns=feature_cols,
    index=df1.index,
)
df1 = pd.concat([df1[kept_cols], train_scaled], axis=1)

# Present the test features under the training column names and order. reindex
# adds an all-zero column for any training column the test frame lacks (e.g. a
# one-hot indicator for a category that never occurs in the test file) and drops
# columns the scaler was not fitted on.
test_features = df2.reindex(columns=feature_cols, fill_value=0)
test_scaled = pd.DataFrame(
    scaler.transform(test_features),
    columns=feature_cols,
    index=df2.index,
)
df2 = pd.concat([df2[[id_col]], test_scaled], axis=1)


In [None]:
# Sanity check: number of missing values left in each training column.
df1.isna().sum()

In [18]:
# Separate the label from the model inputs.
# NOTE(review): 'RecordID' stays in X and is therefore used as a feature by the
# models below - confirm that this is intended.
y = df1['hospital_death']
X = df1.drop(columns='hospital_death')

# Hold out 30% of the labelled rows for evaluation (fixed seed for repeatability).
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)


In [None]:
def fit_model(model, model_name):
    """Fit ``model`` on the training split, print its ROC AUC and draw its ROC curve.

    Reads the notebook-level ``trainX``/``trainy`` (fitting) and ``testX``/``testy``
    (evaluation). The score used is the second column of ``model.predict_proba``.
    The curve is added to the current pyplot figure, labelled with ``model_name``;
    the caller is responsible for the legend and for showing the figure.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    model_name : str
        Label used in the printed line and in the plot legend.

    Returns
    -------
    None
    """
    model.fit(trainX, trainy)
    positive_scores = model.predict_proba(testX)[:, 1]

    auc_value = roc_auc_score(testy, positive_scores)
    print(model_name, " : ", auc_value)

    false_pos_rate, true_pos_rate, _thresholds = roc_curve(testy, positive_scores)
    pyplot.plot(false_pos_rate, true_pos_rate, marker='.', label=model_name)

In [None]:
# Two k-NN variants evaluated on the hold-out split:
#   kn      - 1000 uniformly weighted neighbours, Minkowski distance with p=1
#   pipe_kn - MinMaxScaler followed by a 1500-neighbour classifier (default settings)
kn = KNeighborsClassifier(p=1, weights='uniform', n_neighbors=1000)
pipe_kn = Pipeline(
    [
        ("scaler", MinMaxScaler()),
        ("knr", KNeighborsClassifier(n_neighbors=1500)),
    ]
)

# fit_model trains each estimator, prints its AUC and adds its ROC curve to the figure.
for estimator, label in ((kn, "k-NN"), (pipe_kn, "Scaled k-NN")):
    fit_model(estimator, label)

# Label the axes, add the legend and render the combined ROC plot.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# Optional hyper-parameter search for the k-NN classifier.
# The search stays disabled by default (it was parked inside a string literal
# before); set RUN_GRID_SEARCH to True to execute it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Candidate settings to try.
    param_grid = {
        'n_neighbors': [3, 5, 7, 10, 30, 50],   # number of neighbours to consider
        'weights': ['uniform', 'distance'],     # weighting scheme
        'p': [1, 2],                            # Minkowski power: 1 = Manhattan, 2 = Euclidean
    }

    # 5-fold cross-validated search on the training split, ranked by accuracy.
    grid_search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
    )
    grid_search.fit(trainX, trainy)

    best_params = grid_search.best_params_
    best_knc = grid_search.best_estimator_

    # Evaluate on the held-out split. The predictions must come from testX so that
    # they line up row-for-row with testy; predicting on the unlabelled test frame
    # (df2) cannot be scored against testy.
    y_pred = best_knc.predict(testX)
    accuracy = accuracy_score(testy, y_pred)

    print("Best Parameters:", best_params)
    print("Accuracy:", accuracy)

In [21]:
# Forward sequential feature selection: greedily pick the 5 features that give a
# 10-neighbour k-NN the best cross-validated ROC AUC.
#
# - metric='euclidean' fixes the distance; the Minkowski power `p` only applies to
#   the Minkowski metric, so the former p=1 argument had no effect and is dropped.
# - The selector is fitted on the training split only, so the hold-out rows used
#   for evaluation do not influence which features are chosen.
# - 'RecordID' is removed from the candidates: it is the row identifier written to
#   the submission file, not a measurement.
# - n_jobs=-1 runs the cross-validation on all available cores; the search refits
#   the model for every remaining candidate at each of the 5 steps.
knn_test = KNeighborsClassifier(n_neighbors=10, metric='euclidean')

sfs = SequentialFeatureSelector(
    knn_test,
    direction='forward',
    n_features_to_select=5,
    scoring='roc_auc',
    n_jobs=-1,
)
sfs.fit(trainX.drop(columns='RecordID', errors='ignore'), trainy)
print(sfs.get_feature_names_out())

KeyboardInterrupt: 

In [None]:
# Predicted probability of the positive class for every row of the test frame,
# using the scaled k-NN pipeline fitted above.
# The training columns are selected explicitly so the pipeline receives exactly the
# feature names, in the same order, that it was fitted on; a column missing from
# the test frame raises a KeyError naming it.
pred = pipe_kn.predict_proba(df2[trainX.columns])[:, 1]

In [None]:
# Build the submission table: one row per test record with its predicted
# probability of death.
record_ids = df2['RecordID']

# The imputation step rebuilds the frames from a float array, which turns the
# identifier into floats that would be written as e.g. "123.0". Restore integer
# ids when every value is a whole number, so nothing is lost by the cast.
# NOTE(review): assumes the expected submission format uses the ids as they appear
# in the raw test file - confirm.
if (
    pd.api.types.is_float_dtype(record_ids)
    and record_ids.notna().all()
    and (record_ids % 1 == 0).all()
):
    record_ids = record_ids.astype('int64')

results_df = pd.DataFrame({'RecordID': record_ids, 'ProbaDeath': pred})

# Save the results to a CSV file
results_df.to_csv('submission30_25253.csv', index=False)
