In [None]:
import random

import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw assignment data
data = pd.read_csv('sph6004_assignment1_data.csv')

# One-hot encode the two categorical columns; each dummy column is named
# '<column>_<category>'
gender_encoded, race_encoded = (
    pd.get_dummies(data[column], prefix=column) for column in ('gender', 'race')
)

# Append the dummy columns and discard the original categorical columns
data_encoded = pd.concat([data, gender_encoded, race_encoded], axis=1).drop(
    columns=['gender', 'race']
)

# Percentage of missing values per column
missing_percentage = (data_encoded.isnull().sum() / len(data_encoded)) * 100

# Columns with more than 70% of their values missing are removed entirely
variables_to_drop = missing_percentage[missing_percentage > 70].index
df = data_encoded.drop(columns=variables_to_drop)
df.head()

In [None]:
# KNN imputation: every missing entry is estimated from the 5 nearest rows
# (n_neighbors can be tuned as needed)
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a plain array, so re-attach the column names
imputed_values = imputer.fit_transform(df)
df_filled = pd.DataFrame(imputed_values, columns=df.columns)

In [None]:
# Standardise the predictors and binarise the target.
#
# The target must be binarised from the *unscaled* values and must not be
# passed through the scaler: once 'aki' is standardised it no longer contains
# the literal stages 0/1/2/3, so a stage -> label mapping silently matches
# nothing.
aki_position = df_filled.columns.get_loc('aki')
feature_columns = df_filled.columns.drop('aki')

# Binary target: stage 3 -> 1, stages 0-2 -> 0.
# (Any non-integer stage produced by imputation is treated as "not stage 3".)
aki_binary = (df_filled['aki'] == 3).astype(int)

# Initializing StandardScaler and standardizing the feature columns only
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_filled[feature_columns])

# Converting the standardized data back to a DataFrame and re-inserting the
# binary target at its original column position
df_final = pd.DataFrame(scaled_values, columns=feature_columns, index=df_filled.index)
df_final.insert(aki_position, 'aki', aki_binary)

# Modelling frame: everything except the first column
# (presumably a row identifier -- TODO confirm against the CSV schema).
# Built from df_final so the scaling and binary target are actually used.
df_feature = df_final.iloc[:, 1:]

df_feature.head()

In [None]:
# Recursive feature elimination (RFE) with a logistic-regression estimator.

# Feature matrix and target are defined here so the cell runs on a fresh
# kernel; previously X and y were only created further down in the notebook.
X = df_feature.drop(columns=['aki'])
y = df_feature['aki']

# Initializing the logistic regression model
logreg = LogisticRegression()

# Initializing the RFE object: keep the 30 features that survive elimination
rfe = RFE(estimator=logreg, n_features_to_select=30)

# Selecting features
rfe.fit(X, y)

# Boolean mask over X's columns marking the retained features
selected_features_indices = rfe.support_

# Names of the selected features
selected_features = X.columns[selected_features_indices]
selected_features

In [None]:
# L1 regularization: features whose coefficient is shrunk to exactly zero are
# discarded.

# Feature matrix and target are defined here so the cell runs on a fresh
# kernel; previously X and y were only created further down in the notebook.
X = df_feature.drop(columns=['aki'])
y = df_feature['aki']

# liblinear supports the L1 penalty; a small C means strong regularization
model = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
model.fit(X, y)

# Getting the selected features based on non-zero coefficients
# (coef_[0] is the coefficient row of the first / only fitted classifier)
selected_features_1 = X.columns[model.coef_[0] != 0]
selected_features_1

In [None]:
# Genetic Algorithm
# Work on a fixed random subset of 3000 rows
df_sampled = df_feature.sample(n=3000, random_state=42)

# Target vector first, then every remaining column as the feature matrix
y = df_sampled['aki'].copy()
X = df_sampled.drop(columns=['aki'])

# Defining the fitness function
def evaluate(individual, X, y):
    """Score one GA individual by hold-out accuracy of a logistic regression.

    Args:
        individual: Sequence of 0/1 genes, one per column of ``X``; a truthy
            gene keeps the corresponding column.
        X: Feature DataFrame.
        y: Target labels aligned with ``X``.

    Returns:
        A 1-tuple ``(accuracy,)`` measured on a fixed 20% hold-out split,
        in the tuple form DEAP expects for fitness values.
        An individual that selects no feature scores ``(0.0,)``.
    """
    column_mask = [bool(gene) for gene in individual]

    # An all-zero chromosome would yield an empty feature matrix and make the
    # model fit raise, aborting the whole GA run; give it the worst fitness.
    if not any(column_mask):
        return 0.0,

    X_selected = X.iloc[:, column_mask]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred),

# Seed both random number generators so the search is reproducible: DEAP's
# variation and selection operators draw from the stdlib `random` module,
# while the initial genes below are drawn with numpy.
random.seed(42)
np.random.seed(42)

# Defining genetic algorithm parameters
# creator.create registers classes globally on the `creator` module; guard the
# calls so re-running this cell does not re-create them (which triggers a warning).
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # maximise accuracy
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
# One gene per feature column: 1 keeps the column, 0 drops it
toolbox.register("attr_bool", np.random.choice, [0, 1])
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate, X=X, y=y)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Genetic algorithm hyper-parameters
population_size = 100
num_generations = 30
cxpb = 0.3  # crossover probability
mutpb = 0.4  # mutation probability

# Creating the population
population = toolbox.population(n=population_size)

# Running the genetic algorithm: vary -> evaluate -> select, once per generation
for generation in range(num_generations):
    offspring = algorithms.varAnd(population, toolbox, cxpb=cxpb, mutpb=mutpb)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))

# Getting the best individual of the final population
best_individual = tools.selBest(population, k=1)[0]
selected_features = [bool(i) for i in best_individual]
selected_features_names = X.columns[selected_features]
print("Selected features_2:", selected_features_names)


In [None]:
# Sequential (forward) feature selection on a 1000-row random subset.
#
# The subset is drawn from the preprocessed frame `df_feature`, not from the
# raw `data`: the raw frame still holds the unencoded 'gender'/'race' columns
# and missing values, which the logistic regression cannot be fitted on.
sampled_df = df_feature.sample(n=1000, random_state=42)

# Feature matrix
X = sampled_df.drop(columns=['aki'])

# Target variable
y = sampled_df['aki']

# Initializing the logistic regression model
model = LogisticRegression()

# Initializing the Sequential Feature Selector for forward feature selection,
# scored by 5-fold cross-validated accuracy
selector = SequentialFeatureSelector(model, scoring='accuracy', cv=5)

# Selecting features
selector.fit(X, y)

# Getting the integer positions of the selected features
selected_features_indices = selector.get_support(indices=True)

# Getting the selected features
selected_features = X.columns[selected_features_indices]

print("Selected features:", selected_features)


In [None]:
# Hard-coded feature subsets, one list per feature-selection method.
# NOTE(review): these look like outputs pasted from the selection cells above
# (the '_heri' list presumably comes from the sequential feature selector) --
# confirm and regenerate them if the preprocessing changes.
# Only selected_features_0_RFE is referenced by the modelling code visible in
# this notebook.

# Feature subset labelled "2_heri"
selected_features_2_heri = ['admission_age', 'sbp_max', 'dbp_min', 'mbp_min', 'mbp_max', 'mbp_mean',
       'resp_rate_min', 'resp_rate_mean', 'spo2_mean', 'glucose_min',
       'lactate_min', 'lactate_max', 'ph_min', 'ph_max', 'so2_min', 'so2_max',
       'pco2_max', 'aado2_calc_min', 'pao2fio2ratio_min', 'baseexcess_max',
       'totalco2_min', 'calcium_min', 'glucose_max.1', 'hematocrit_min.1',
       'hemoglobin_max.1', 'platelets_max', 'albumin_min', 'aniongap_min',
       'bicarbonate_min.1', 'bun_min', 'calcium_min.1', 'glucose_min.2',
       'sodium_max.1', 'potassium_min.1', 'potassium_max.1',
       'abs_basophils_min', 'abs_eosinophils_min', 'abs_lymphocytes_min',
       'abs_lymphocytes_max', 'abs_monocytes_min', 'abs_neutrophils_max',
       'inr_min', 'inr_max', 'pt_min', 'ptt_max', 'alp_max',
       'bilirubin_total_max', 'ck_cpk_min', 'ck_cpk_max', 'gcs_eyes',
       'weight_admit', 'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE',
       'race_ASIAN - ASIAN INDIAN', 'race_ASIAN - CHINESE',
       'race_ASIAN - KOREAN', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_HISPANIC OR LATINO',
       'race_HISPANIC/LATINO - CENTRAL AMERICAN',
       'race_HISPANIC/LATINO - CUBAN', 'race_HISPANIC/LATINO - HONDURAN',
       'race_HISPANIC/LATINO - MEXICAN', 'race_HISPANIC/LATINO - SALVADORAN',
       'race_MULTIPLE RACE/ETHNICITY',
       'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
       'race_PATIENT DECLINED TO ANSWER', 'race_SOUTH AMERICAN',
       'race_UNABLE TO OBTAIN', 'race_UNKNOWN', 'race_WHITE',
       'race_WHITE - EASTERN EUROPEAN', 'race_WHITE - RUSSIAN']
# Feature subset labelled "3_genetic" (genetic-algorithm selection, per its name)
selected_features_3_genetic = ['admission_age', 'sbp_min', 'sbp_mean', 'dbp_min', 'dbp_max', 'mbp_min',
       'mbp_max', 'mbp_mean', 'temperature_min', 'temperature_max',
       'temperature_mean', 'spo2_max', 'glucose_min', 'ph_min', 'ph_max',
       'so2_max', 'aado2_calc_min', 'calcium_max', 'glucose_min.1',
       'glucose_max.1', 'potassium_min', 'platelets_max', 'albumin_max',
       'glucose_min.2', 'glucose_max.2', 'sodium_min.1', 'potassium_min.1',
       'abs_basophils_max', 'abs_eosinophils_min', 'abs_eosinophils_max',
       'abs_neutrophils_max', 'inr_min', 'inr_max', 'pt_max', 'ast_min',
       'gcs_min', 'gcs_verbal', 'gcs_eyes', 'weight_admit', 'gender_F',
       'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE', 'race_ASIAN',
       'race_ASIAN - ASIAN INDIAN', 'race_ASIAN - CHINESE',
       'race_ASIAN - KOREAN', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_BLACK/AFRICAN AMERICAN',
       'race_BLACK/CAPE VERDEAN', 'race_BLACK/CARIBBEAN ISLAND',
       'race_HISPANIC OR LATINO', 'race_HISPANIC/LATINO - CENTRAL AMERICAN',
       'race_HISPANIC/LATINO - COLUMBIAN', 'race_HISPANIC/LATINO - CUBAN',
       'race_HISPANIC/LATINO - DOMINICAN', 'race_HISPANIC/LATINO - GUATEMALAN',
       'race_HISPANIC/LATINO - HONDURAN', 'race_HISPANIC/LATINO - MEXICAN',
       'race_HISPANIC/LATINO - PUERTO RICAN',
       'race_HISPANIC/LATINO - SALVADORAN', 'race_MULTIPLE RACE/ETHNICITY',
       'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'race_OTHER',
       'race_PATIENT DECLINED TO ANSWER', 'race_PORTUGUESE',
       'race_SOUTH AMERICAN', 'race_UNABLE TO OBTAIN', 'race_UNKNOWN',
       'race_WHITE - BRAZILIAN', 'race_WHITE - EASTERN EUROPEAN']

# Feature subset labelled "1_L1" (L1-regularised selection, per its name)
selected_features_1_L1 = ['admission_age', 'heart_rate_min', 'heart_rate_max', 'heart_rate_mean',
       'sbp_min', 'sbp_max', 'sbp_mean', 'dbp_min', 'dbp_max', 'dbp_mean',
       'mbp_min', 'mbp_max', 'mbp_mean', 'resp_rate_min', 'resp_rate_max',
       'resp_rate_mean', 'temperature_min', 'spo2_min', 'spo2_max',
       'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean',
       'lactate_min', 'so2_min', 'so2_max', 'po2_min', 'po2_max', 'pco2_min',
       'pco2_max', 'aado2_calc_min', 'aado2_calc_max', 'pao2fio2ratio_min',
       'pao2fio2ratio_max', 'baseexcess_min', 'baseexcess_max', 'totalco2_min',
       'totalco2_max', 'glucose_min.1', 'glucose_max.1', 'potassium_max',
       'hematocrit_min.1', 'hematocrit_max.1', 'hemoglobin_min.1',
       'hemoglobin_max.1', 'platelets_min', 'platelets_max', 'wbc_min',
       'wbc_max', 'albumin_min', 'albumin_max', 'aniongap_min', 'aniongap_max',
       'bicarbonate_min.1', 'bicarbonate_max.1', 'bun_min', 'bun_max',
       'calcium_min.1', 'calcium_max.1', 'chloride_max.1', 'glucose_min.2',
       'glucose_max.2', 'sodium_min.1', 'sodium_max.1', 'potassium_max.1',
       'abs_lymphocytes_min', 'abs_neutrophils_min', 'abs_neutrophils_max',
       'pt_min', 'pt_max', 'ptt_min', 'ptt_max', 'alt_min', 'alt_max',
       'alp_min', 'alp_max', 'ast_min', 'ast_max', 'bilirubin_total_max',
       'ck_cpk_min', 'ck_cpk_max', 'gcs_min', 'gcs_motor', 'gcs_verbal',
       'gcs_eyes', 'height', 'weight_admit', 'gender_M']

# Feature subset labelled "0_RFE" (30 entries, matching n_features_to_select=30
# in the RFE cell); this is the subset the model-evaluation cell uses
selected_features_0_RFE = ['sbp_mean', 'dbp_min', 'dbp_mean', 'mbp_mean', 'resp_rate_mean',
       'temperature_min', 'spo2_mean', 'lactate_min', 'hemoglobin_min.1',
       'hemoglobin_max.1', 'albumin_min', 'albumin_max', 'aniongap_max',
       'bicarbonate_min.1', 'bun_max', 'calcium_min.1', 'chloride_max.1',
       'sodium_max.1', 'potassium_max.1', 'abs_neutrophils_min',
       'abs_neutrophils_max', 'inr_max', 'bilirubin_total_min',
       'bilirubin_total_max', 'gcs_verbal', 'gcs_eyes', 'gcs_unable', 'height',
       'weight_admit', 'gender_M']

In [None]:
# Logistic regression on the RFE-selected features.
# The train/test split created here is reused by the model cells that follow.

# Creating feature matrix X based on selected features.
# Taken from the preprocessed frame `df_feature`: the raw `data` frame has no
# one-hot columns such as 'gender_M' and still contains missing values.
X1 = df_feature[selected_features_0_RFE]

# Creating target variable y as a binary label (stage 3 -> 1, otherwise 0).
# The ROC-AUC call below passes a single score column, which requires a
# binary target; binarise only if 'aki' still carries more than two classes.
aki = df_feature['aki']
y1 = aki if aki.nunique() <= 2 else (aki == 3).astype(int)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Initializing the logistic regression model
logreg = LogisticRegression()

# Training the model
logreg.fit(X_train, y_train)

# Predicting on the testing set
y_pred = logreg.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Printing the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predicting probabilities on the testing set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculating ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Printing ROC-AUC
print("ROC-AUC:", roc_auc)


In [None]:
# Support vector machine: create the model and train it on the training set
svm_model = SVC().fit(X_train, y_train)

# Hard class predictions for the testing set
y_pred = svm_model.predict(X_test)

# Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Classification report (per-class precision / recall / F1)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Signed decision-function values for the testing set. These are ranking
# scores rather than probabilities, which is all ROC-AUC needs.
y_pred_proba = svm_model.decision_function(X_test)

# ROC-AUC computed from those scores
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC:", roc_auc)


In [None]:
# Decision tree evaluated on the existing train/test split.

# Initializing the decision tree model; random_state is fixed (matching the
# seed used for the split) so the reported metrics are reproducible
tree_model = DecisionTreeClassifier(random_state=42)

# Training the model on the training set
tree_model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = tree_model.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Printing the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predicting probabilities on the testing set
y_pred_proba = tree_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculating ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Printing ROC-AUC
print("ROC-AUC:", roc_auc)


In [None]:
# Random forest evaluated on the existing train/test split.

# Initializing the random forest model; random_state is fixed (matching the
# seed used for the split) so bootstrap and feature sampling are reproducible
rf_model = RandomForestClassifier(random_state=42)

# Training the model on the training set
rf_model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = rf_model.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Printing the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predicting probabilities on the testing set
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculating ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Printing ROC-AUC
print("ROC-AUC:", roc_auc)

In [None]:
# AdaBoost evaluated on the existing train/test split.

# Initializing the AdaBoost model; random_state is fixed (matching the seed
# used for the split) so the reported metrics are reproducible
adaboost_model = AdaBoostClassifier(random_state=42)

# Training the model on the training set
adaboost_model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = adaboost_model.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Printing the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predicting probabilities on the testing set
y_pred_proba = adaboost_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculating ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Printing ROC-AUC
print("ROC-AUC:", roc_auc)


In [None]:
# Gradient boosting evaluated on the existing train/test split.

# Initializing the Gradient Boosting Tree model; random_state is fixed
# (matching the seed used for the split) so the reported metrics are reproducible
gbt_model = GradientBoostingClassifier(random_state=42)

# Training the model on the training set
gbt_model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = gbt_model.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Printing the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predicting probabilities on the testing set
y_pred_proba = gbt_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculating ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Printing ROC-AUC
print("ROC-AUC:", roc_auc)


In [None]:
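# Repeat the above model-evaluation steps for each of the remaining feature lists (selected_features_1_L1, selected_features_2_heri, selected_features_3_genetic)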

In [None]:
# Embedded feature selection with a decision tree, followed by a decision-tree
# classifier trained on the selected features only.

# Feature matrix
X = df_feature.drop(columns=['aki'])

# Target variable
y = df_feature['aki']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the decision tree model; random_state is fixed so that both the
# selected feature set and the reported metrics are reproducible
clf = DecisionTreeClassifier(random_state=42)

# Use decision tree-based feature selection method
# (fitted on the training split only, so the test split stays unseen)
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# Boolean mask over the columns marking the selected features
selected_features_indices = selector.get_support()

# Filter features based on the selected feature mask
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# Train the model
clf.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_selected)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on the test set:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Embedded feature selection with a random forest, followed by a random-forest
# classifier trained on the selected features only.

# Feature matrix
X = df_feature.drop(columns=['aki'])

# Target variable
y = df_feature['aki']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the random forest model; random_state is fixed so that both the
# selected feature set and the reported accuracy are reproducible
clf = RandomForestClassifier(random_state=42)

# Use random forest-based feature selection method
# (fitted on the training split only, so the test split stays unseen)
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# Boolean mask over the columns marking the selected features
selected_features_indices = selector.get_support()

# Filter features based on the selected feature mask
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# Train the model
clf.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_selected)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on the test set:", accuracy)


In [None]:
# Embedded feature selection with AdaBoost, followed by an AdaBoost classifier
# trained on the selected features only.

# Feature matrix
X = df_feature.drop(columns=['aki'])

# Target variable
y = df_feature['aki']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the AdaBoost model; random_state is fixed so that both the
# selected feature set and the reported accuracy are reproducible
clf = AdaBoostClassifier(random_state=42)

# Use AdaBoost-based feature selection method
# (fitted on the training split only, so the test split stays unseen)
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# Boolean mask over the columns marking the selected features
selected_features_indices = selector.get_support()

# Filter features based on the selected feature mask
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# Train the model
clf.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_selected)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on the test set:", accuracy)


In [None]:
# Embedded feature selection with gradient boosting, followed by a
# gradient-boosting classifier trained on the selected features only.

# Feature matrix
X = df_feature.drop(columns=['aki'])

# Target variable
y = df_feature['aki']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Tree model; random_state is fixed so that
# both the selected feature set and the reported accuracy are reproducible
clf = GradientBoostingClassifier(random_state=42)

# Use Gradient Boosting Tree-based feature selection method
# (fitted on the training split only, so the test split stays unseen)
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# Boolean mask over the columns marking the selected features
selected_features_indices = selector.get_support()

# Filter features based on the selected feature mask
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# Train the model
clf.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_selected)

# Calculate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy on the test set:", accuracy)
