In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,  
    StandardScaler,
    OrdinalEncoder
)
from scipy.stats import loguniform, randint, uniform
from sklearn.metrics import f1_score, recall_score, accuracy_score

In [2]:
# Load the cleaned data set that already contains only the relevant features.
df = pd.read_csv('../data/clean/LLCP2015_cleaned.csv')

# Binarise the target: code 2.0 becomes 1 (diabetes); codes 0.0 and 1.0 become 0.
# Order matters: the original 1.0 codes must be folded into 0 *before* 2.0 is
# rewritten to 1, otherwise the freshly written 1s would be zeroed out as well.
# (Code 0.0 already has the desired value, so it needs no replacement.)
df['Diabetes_012'] = (
    df['Diabetes_012']
    .replace([1.0], int(0))
    .replace([2.0], int(1))
)

# Work on a reproducible random sample of 1000 rows to keep model fitting fast.
df_s = df.sample(1000, random_state=123)

# Hold out 20% of the sample for testing; the remaining 80% is used for training.
train_df, test_df = train_test_split(df_s, test_size=0.2, random_state=123)
train_df


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
199271,0.0,0.0,1.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,4.0,0.0,0.0,10.0,6.0,8.0
54090,0.0,1.0,0.0,1.0,33.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,15.0,0.0,0.0,1.0,11.0,5.0,6.0
27907,0.0,1.0,0.0,1.0,51.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,10.0,30.0,0.0,0.0,3.0,6.0,8.0
70159,0.0,0.0,1.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,1.0,1.0,9.0,6.0,8.0
95207,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,4.0,30.0,30.0,1.0,0.0,11.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129014,0.0,0.0,0.0,1.0,32.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,3.0,5.0,3.0
101806,0.0,1.0,1.0,1.0,32.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,7.0,0.0,1.0,0.0,9.0,4.0,5.0
103795,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,10.0,6.0,8.0
238611,0.0,0.0,0.0,1.0,21.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,8.0


In [3]:
# Columns that get standardised; every other non-target column is passed through as-is.
numeric_features = [
    'BMI',
    'PhysHlth',
    'GenHlth',
    'Age',
    'Education',
    'Income',
    'MentHlth',
]
pass_features = train_df.columns.drop(numeric_features + ['Diabetes_012']).to_list()

# Sanity check: scaled + passthrough + target must account for every column exactly once.
assert (
    len(numeric_features) + len(pass_features) + 1 == train_df.shape[1]
), 'feature list is wrong'

# Column transformer: scale the numeric block, leave the remaining features untouched.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    ('passthrough', pass_features),
)

# Display the (unfitted) preprocessor.
preprocessor

In [4]:
# Separate the feature matrix from the target column for both partitions.
X_train, X_test = train_df.drop(columns=['Diabetes_012']), test_df.drop(columns=['Diabetes_012'])
y_train, y_test = train_df['Diabetes_012'], test_df['Diabetes_012']

# Request un-prefixed output names (e.g. 'BMI' rather than 'standardscaler__BMI').
preprocessor.set_params(verbose_feature_names_out=False)

# Fit and transform the training data in a single pass; a separate fit() call
# beforehand would only repeat the same work.
X_train_enc = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    index=X_train.index,
    # Take the labels from the fitted transformer so they always follow its
    # actual output order instead of a hand-maintained list.
    columns=preprocessor.get_feature_names_out(),
)

# Kept as a plain list of the transformed column names for later use.
new_columns = X_train_enc.columns.to_list()

# Sanity check of transformed data
X_train_enc


Unnamed: 0,BMI,PhysHlth,GenHlth,Age,Education,Income,MentHlth,HighBP,HighChol,CholCheck,...,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex
199271,-0.862232,-0.022731,-0.486577,0.648420,1.007342,0.945634,-0.430813,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
54090,0.780897,-0.471733,-1.422301,0.976942,-0.014044,0.000591,1.564462,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
27907,3.738531,2.895782,1.384872,-1.651232,1.007342,0.945634,0.899370,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
70159,0.287958,-0.471733,-0.486577,0.319898,1.007342,0.945634,-0.430813,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
95207,-0.533606,2.895782,1.384872,0.976942,-0.014044,-2.362017,3.559736,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129014,0.616584,-0.471733,-0.486577,-1.651232,-0.014044,-1.416974,-0.430813,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
101806,0.616584,-0.471733,-0.486577,0.319898,-1.035431,-0.471931,0.500315,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
103795,-0.697919,-0.471733,-0.486577,0.648420,1.007342,0.945634,-0.430813,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
238611,-1.190858,-0.471733,-0.486577,0.319898,1.007342,0.945634,-0.430813,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [5]:
# Candidate classifiers, keyed by the label used in the results table.
models = {
    # Baseline that ignores the features.
    "Dummy": DummyClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=123),
    "KNN": KNeighborsClassifier(),
    # The two models below re-weight classes via class_weight='balanced'.
    "RBF SVM": SVC(
        class_weight='balanced',
        random_state=123,
    ),
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=123,
    ),
}

# Metrics reported by cross-validation for every model.
score_metrics = [
    'accuracy',
    'recall',
    'f1',
]


In [6]:
# Cross-validation to narrow down the better models.
# Maps model label -> Series of mean cross-validation scores.
cross_val_res = dict()

def pipe(model):
    """Return a pipeline that applies the shared `preprocessor` and then `model`.

    Parameters
    ----------
    model : estimator
        The final estimator to place after the preprocessing step.

    Returns
    -------
    Pipeline
        The pipeline built by `make_pipeline(preprocessor, model)`.
    """
    pipeline = make_pipeline(preprocessor, model)
    return pipeline

# 5-fold cross-validation of every baseline model; keep the mean of each
# reported column (fit/score time plus train/test metrics), rounded to 3 d.p.
for model, estimator in models.items():
    fold_scores = cross_validate(
        pipe(estimator),
        X_train,
        y_train,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
        scoring=score_metrics,
    )
    cross_val_res[model] = pd.DataFrame(fold_scores).agg('mean').round(3)

# Show the train and validation scores (one column per model).
cross_val_res_df = pd.DataFrame(cross_val_res)
cross_val_res_df

In [7]:
# Randomized-search hyperparameter tuning for the SVM and Logistic Regression.
# Recall is the metric prioritised for this model.

# Estimators to tune, keyed by the same labels used for the baselines.
models_tune = {
    "RBF SVM": SVC(
        class_weight='balanced',
        random_state=123,
    ),
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=123,
    ),
}

# Search spaces. Keys follow the `<pipeline step>__<parameter>` convention of
# the pipelines produced by make_pipeline.
# SVM: discrete grid of powers of ten, 10^-20 ... 10^9, for both C and gamma.
svc_param = {
    "svc__C": np.power(10.0, np.arange(-20, 10)),
    "svc__gamma": np.power(10.0, np.arange(-20, 10)),
}
# Logistic regression: C drawn log-uniformly from [1e-3, 1e3].
log_param = {
    "logisticregression__C": loguniform(1e-3, 1e3),
}

# Search space lookup by model label.
hyper_param = {
    "RBF SVM": svc_param,
    "Logistic Regression": log_param,
}

# The preprocessor + model pipeline factory `pipe(model)` is defined once, in the
# cross-validation cell above, and is reused here. It is deliberately not
# redefined: a second identical `def pipe` would silently shadow the first and
# invite the two copies to drift apart.

# Run a randomized search per model and record the best hyperparameters found.
# Maps model label -> best_params_ dict of the corresponding search.
rs_results = dict()
for model, estimator in models_tune.items():
    random_search = RandomizedSearchCV(
        pipe(estimator),
        param_distributions=hyper_param[model],
        n_iter=100,
        scoring='recall',
        cv=5,
        refit=True,
        n_jobs=-1,
        random_state=123,
    )
    random_search.fit(X_train, y_train)
    rs_results[model] = random_search.best_params_

rs_results


{'RBF SVM': {'svc__gamma': 1e-06, 'svc__C': 10000000.0},
 'Logistic Regression': {'logisticregression__C': 0.011290431413903904}}

In [8]:
# Rebuild the tuned models from the best hyperparameters of the randomized
# search so they can be added to the results dataframe.
svm_best = rs_results['RBF SVM']
logreg_best_C = rs_results['Logistic Regression']['logisticregression__C']

best_models = {
    "RBF SVM Tune": SVC(
        class_weight='balanced',
        random_state=123,
        C=svm_best['svc__C'],
        gamma=svm_best['svc__gamma'],
    ),
    "Logistic Regression Tune": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=123,
        C=logreg_best_C,
    ),
    # Same C as the tuned logistic regression, but with an L1 penalty
    # (fitted with the liblinear solver).
    "Logistic Regression + L1": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=123,
        C=logreg_best_C,
        solver="liblinear",
        penalty="l1",
    ),
}

# Cross-validate the tuned models with the same explicit settings as the
# baseline models (5 folds, all cores) so that every column of the comparison
# table is produced the same way and does not depend on library defaults.
for model, estimator in best_models.items():
    fold_scores = cross_validate(
        pipe(estimator),
        X_train,
        y_train,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
        scoring=score_metrics,
    )
    cross_val_res[model] = pd.DataFrame(fold_scores).agg('mean').round(3)

# Rebuild the results table (baseline + tuned models) and render it with two
# decimals and a colour gradient computed over the whole table (axis=None).
cross_val_res_df = pd.DataFrame(cross_val_res)
cross_val_res_df.style.format(
    precision=2
).background_gradient(
    axis=None
)

Unnamed: 0,Dummy,Decision Tree,KNN,RBF SVM,Logistic Regression,RBF SVM Tune,Logistic Regression Tune,Logistic Regression + L1
fit_time,0.02,0.02,0.02,0.05,0.04,0.09,0.02,0.01
score_time,0.01,0.02,0.26,0.04,0.01,0.02,0.01,0.01
test_accuracy,0.85,0.78,0.84,0.74,0.73,0.68,0.74,0.63
train_accuracy,0.85,1.0,0.88,0.82,0.76,0.71,0.75,0.62
test_recall,0.0,0.4,0.19,0.67,0.71,0.75,0.74,0.84
train_recall,0.0,1.0,0.35,0.93,0.8,0.83,0.78,0.83
test_f1,0.0,0.36,0.26,0.44,0.44,0.41,0.46,0.41
train_f1,0.0,1.0,0.47,0.62,0.5,0.47,0.49,0.4
