In [23]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import roc_auc_score

In [24]:
# a) Retrieve path and load the data
path = os.getcwd()
df = pd.read_csv(path+"/sample_diabetes_mellitus_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,0,,0,0,0,0,0,0,0,0
3,3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,1,8.0,0,0,0,0,0,0,0,0
4,4,201746,33,19.0,,0,Caucasian,M,188.0,,...,0,,0,0,0,0,0,0,0,0


In [25]:
# Remove column, which doesn't add any value to the DataFrame
df = df.drop(columns=['Unnamed: 0'])

In [26]:
# Process data (exercises c-f)
def process_data(df, columns_drop, columns_fill, columns_hot, columns_binary):
    df_clean = df.dropna(subset=columns_drop)
    df_clean[columns_fill] = df_clean[columns_fill].apply(lambda col: col.fillna(col.mean()))
    df_clean['Binary'] = df_clean[columns_binary].apply(lambda x: 1 if x == 'M' else 0)
    dummy = pd.get_dummies(df_clean[columns_hot])
    encoded_df = pd.concat([df_clean, dummy], axis=1)
    
    return encoded_df

In [27]:
# Check the results of exercises c-f
columns_drop = ['age', 'gender', 'ethnicity']
columns_fill = ['height', 'weight']
columns_hot = ['ethnicity']
columns_binary = 'gender'
df_processed = process_data(df, columns_drop, columns_fill, columns_hot, columns_binary)
df_processed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[columns_fill] = df_clean[columns_fill].apply(lambda col: col.fillna(col.mean()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Binary'] = df_clean[columns_binary].apply(lambda x: 1 if x == 'M' else 0)


Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,Binary,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown
0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,...,0,0,1,1,False,False,True,False,False,False
1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,...,0,0,1,0,False,False,True,False,False,False
2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,...,0,0,0,0,False,False,True,False,False,False
3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,...,0,0,0,0,False,False,True,False,False,False
4,201746,33,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,...,0,0,0,1,False,False,True,False,False,False


In [28]:
df_processed.describe()

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,weight,...,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,Binary
count,9368.0,9368.0,9368.0,7746.0,9368.0,9368.0,9368.0,9368.0,9368.0,9368.0,...,7314.0,9368.0,9368.0,9368.0,9368.0,9368.0,9368.0,9368.0,9368.0,9368.0
mean,212345.733668,103.41898,62.394321,29.979575,0.232494,170.137378,105.155956,0.633913,0.0,86.880285,...,11.619673,0.000427,0.018894,0.014304,0.047929,0.007899,0.005551,0.026793,0.233988,0.549424
std,38038.252549,30.65302,16.576126,8.423311,0.422445,10.600618,16.989158,2.16038,0.0,23.413072,...,6.763588,0.02066,0.136158,0.118747,0.213628,0.088531,0.074301,0.161488,0.423387,0.497578
min,147009.0,4.0,16.0,14.844926,0.0,137.2,82.0,-0.224306,0.0,38.6,...,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179317.75,83.0,53.0,24.197271,0.0,162.6,92.0,0.002083,0.0,72.1,...,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,212132.5,118.0,64.0,28.375322,0.0,170.2,99.0,0.010417,0.0,86.880285,...,9.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,245163.25,118.0,75.0,33.939251,0.0,177.8,114.0,0.143056,0.0,96.6,...,14.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,279000.0,198.0,89.0,67.81499,1.0,195.59,171.0,49.523611,0.0,186.0,...,45.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# b) Split the data between train and test.
# Extract the feature data (X) and the target variable (y) from your DataFrame
X = df_processed.drop('diabetes_mellitus', axis=1)  
y = df_processed['diabetes_mellitus']  
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [47]:
# Function to split data (exercise b)
def splitdata(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [48]:
# Call the splitdata function
X_train, X_test, y_train, y_test = splitdata(df_processed, 'diabetes_mellitus')

# Now you can print the values
print(f'X_train: {X_train}')
print(f'X_test: {X_test}')
print(f'y_train: {y_train}')
print(f'y_test: {y_test}')

X_train:       encounter_id  hospital_id   age        bmi  elective_surgery  \
952         160030          149  50.0        NaN                 0   
4283        235736           81  63.0  31.835347                 1   
5521        242191          118  63.0  25.062103                 0   
9655        191087           89  79.0  25.015328                 0   
3549        271596          118  31.0  19.228087                 0   
...            ...          ...   ...        ...               ...   
6062        201939          118  59.0        NaN                 1   
5494        227860           31  67.0  18.301839                 0   
5703        190499          118  85.0  18.296026                 1   
913         165385          118  43.0  23.661272                 1   
7681        242945          118  23.0  22.496401                 0   

             ethnicity gender  height hospital_admit_source  \
952          Caucasian      F  170.20  Emergency Department   
4283          Hispanic  

In [31]:
# g) Train the model

# Step 1: Define the features and target
features = ['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure',
            'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']
target = ['diabetes_mellitus']  
X_train = X_train[features] 
X_test = X_test[features]

# Step 2: Train the model
# For Logistic Regression:
model = LogisticRegression()
# For Random Forest Classifier:
# model = RandomForestClassifier()

model.fit(X_train, y_train)

# Step 3: Make predictions using predict_proba
train_predictions = model.predict_proba(X_train)[:, 1]
test_predictions = model.predict_proba(X_test)[:, 1]

# Step 4: Add the predictions as new columns in the train and test DataFrames
X_train['predictions'] = train_predictions
X_test['predictions'] = test_predictions


In [43]:
# Function to train the model and calculate metrics (exercise d and i)
def model(X_train, X_test, y_train, y_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    train_predictions = model.predict_proba(X_train)[:, 1]
    test_predictions = model.predict_proba(X_test)[:, 1]
    X_train['predictions'] = train_predictions
    X_test['predictions'] = test_predictions
    train_roc_auc = roc_auc_score(y_train, train_predictions)
    test_roc_auc = roc_auc_score(y_test, test_predictions)
    
    print("Train ROC AUC:", train_roc_auc)
    print("Test ROC AUC:", test_roc_auc)
    
    return train_roc_auc, test_roc_auc


In [44]:
# Execute training and metrics calculations
features = ['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure',
            'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']
target = ['diabetes_mellitus']  
X_train = X_train[features] 
X_test = X_test[features]
y_train = y_train
y_test = y_test
result = model(X_train, X_test, y_train, y_test)
print(result)

Train ROC AUC: 0.6703545314832863
Test ROC AUC: 0.6689398850455878
(0.6703545314832863, 0.6689398850455878)


In [34]:
# i) Compute the train and test roc_auc metric

# Compute ROC AUC for the training set
train_roc_auc = roc_auc_score(y_train, train_predictions)

# Compute ROC AUC for the test set
test_roc_auc = roc_auc_score(y_test, test_predictions)

print("Train ROC AUC:", train_roc_auc)
print("Test ROC AUC:", test_roc_auc)


Train ROC AUC: 0.6703545314832863
Test ROC AUC: 0.6689398850455878
