In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import roc_auc_score

In [2]:
# a) Retrieve path and load the data
path = os.getcwd()
df = pd.read_csv(path+"/sample_diabetes_mellitus_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,0,,0,0,0,0,0,0,0,0
3,3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,1,8.0,0,0,0,0,0,0,0,0
4,4,201746,33,19.0,,0,Caucasian,M,188.0,,...,0,,0,0,0,0,0,0,0,0


In [3]:
# Remove column, which doesn't add any value to the DataFrame
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# Process data (exercises c-f)
def process_data(df, columns_drop, columns_fill, columns_hot, columns_binary):
    df_clean = df.dropna(subset=columns_drop)
    df_clean[columns_fill] = df_clean[columns_fill].apply(lambda col: col.fillna(col.mean()))
    df_clean['Binary'] = df_clean[columns_binary].apply(lambda x: 1 if x == 'M' else 0)
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(df_clean[columns_hot]).toarray()
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_hot))
    df_clean.reset_index(drop=True, inplace=True)  # Reset the index and drop the old index column
    encoded_df = pd.concat([df_clean, encoded_df], axis=1)
    
    return encoded_df

In [5]:
# Check the results of exercises c-f
columns_drop = ['age', 'gender', 'ethnicity']
columns_fill = ['height', 'weight']
columns_hot = ['ethnicity']
columns_binary = 'gender'
df_processed = process_data(df, columns_drop, columns_fill, columns_hot, columns_binary)
df_processed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[columns_fill] = df_clean[columns_fill].apply(lambda col: col.fillna(col.mean()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Binary'] = df_clean[columns_binary].apply(lambda x: 1 if x == 'M' else 0)


Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,Binary,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown
0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,...,0,0,1,1,0.0,0.0,1.0,0.0,0.0,0.0
1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,...,0,0,1,0,0.0,0.0,1.0,0.0,0.0,0.0
2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
4,201746,33,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,...,0,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0


In [6]:
# b) Split the data between train and test.
# Extract the feature data (X) and the target variable (y) from your DataFrame
X = df_processed.drop('diabetes_mellitus', axis=1)  
y = df_processed['diabetes_mellitus']  
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# g) Train the model

# Step 1: Define the features and target
features = ['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure',
            'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']
target = ['diabetes_mellitus']  
X_train = X_train[features] 
X_test = X_test[features]

# Step 2: Train the model
# For Logistic Regression:
model = LogisticRegression()
# For Random Forest Classifier:
# model = RandomForestClassifier()

model.fit(X_train, y_train)

# Step 3: Make predictions using predict_proba
train_predictions = model.predict_proba(X_train)[:, 1]
test_predictions = model.predict_proba(X_test)[:, 1]

# Step 4: Add the predictions as new columns in the train and test DataFrames
X_train['predictions'] = train_predictions
X_test['predictions'] = test_predictions


In [8]:
# i) Compute the train and test roc_auc metric

# Compute ROC AUC for the training set
train_roc_auc = roc_auc_score(y_train, train_predictions)

# Compute ROC AUC for the test set
test_roc_auc = roc_auc_score(y_test, test_predictions)

print("Train ROC AUC:", train_roc_auc)
print("Test ROC AUC:", test_roc_auc)


Train ROC AUC: 0.6703545314832863
Test ROC AUC: 0.6689398850455878
