# A preliminary EDA using the UCI diabetes data

> Dataset description: diabetes_binary_5050split_health_indicators_BRFSS2015.csv is a clean dataset of 70,692 survey responses to the CDC's BRFSS2015. It has an equal 50-50 split of respondents with no diabetes and with either prediabetes or diabetes. The target variable Diabetes_binary has 2 classes. 0 is for no diabetes, and 1 is for prediabetes or diabetes. This dataset has 21 feature variables and is balanced.

In [4]:
# loading libraries (likely way too many)
import numpy as np
import pandas as pd
import requests
import zipfile
import altair as alt
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Load the balanced (50-50 split) BRFSS2015 diabetes health-indicators survey
# responses from the local data directory.
diabetes_data = pd.read_csv(
    filepath_or_buffer='data/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
)

In [4]:
# Preview the first five rows to sanity-check the load.
diabetes_data.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    diabetes_data,
    test_size=0.2,
    random_state=123,
)

# Features are every column except the target; both are passed on as NumPy arrays.
X_train = train_df.drop(columns=["Diabetes_binary"]).values
y_train = train_df["Diabetes_binary"].values
X_test = test_df.drop(columns=["Diabetes_binary"]).values
y_test = test_df["Diabetes_binary"].values

# Feature names, kept so that model coefficients can be labelled later.
cols = train_df.columns.drop("Diabetes_binary")
train_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
53382,1.0,1.0,1.0,1.0,27.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,5.0,15.0,30.0,1.0,0.0,8.0,5.0,3.0
28325,0.0,1.0,0.0,1.0,24.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,13.0,5.0,7.0
23675,0.0,0.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,12.0,5.0,6.0
118,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,10.0,6.0,8.0
18083,0.0,0.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,2.0,0.0,2.0,0.0,0.0,5.0,4.0,2.0


In [8]:
# Baseline: cross-validate a DummyClassifier to obtain reference train/test scores.
dummy = DummyClassifier()
scores = cross_validate(
    estimator=dummy,
    X=X_train,
    y=y_train,
    return_train_score=True,
)
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.018567,0.009596,0.500486,0.500508
1,0.009772,0.002551,0.500486,0.500508
2,0.006158,0.004255,0.500486,0.500508
3,0.006912,0.002668,0.500531,0.500497
4,0.005958,0.002797,0.500531,0.500497


In [8]:
# Logistic regression pipeline: standardise the features, then fit the classifier.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

In [11]:
# Cross-validate the scaling + logistic-regression pipeline, keeping train scores
# alongside the validation scores.
scores = cross_validate(
    estimator=pipe,
    X=X_train,
    y=y_train,
    return_train_score=True,
)
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.174175,0.009554,0.75042,0.747624
1,0.097448,0.005492,0.745027,0.748817
2,0.129259,0.006095,0.743347,0.748884
3,0.109605,0.006459,0.747303,0.748226
4,0.124127,0.006846,0.753935,0.746679


In [10]:
# Manual scaling: fit the scaler on the training features only, then apply the
# same fitted transformation to both the training and the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Fit logistic regression on the scaled training data, then tabulate the first
# row of fitted coefficients against the feature names.
lr = LogisticRegression().fit(X_train_scaled, y_train)
data = dict(features=cols, coefficients=lr.coef_[0])
pd.DataFrame(data)

Unnamed: 0,features,coefficients
0,HighBP,0.367749
1,HighChol,0.291999
2,CholCheck,0.219338
3,BMI,0.535329
4,Smoker,0.005788
5,Stroke,0.031232
6,HeartDiseaseorAttack,0.095081
7,PhysActivity,-0.012959
8,Fruits,-0.016229
9,Veggies,-0.028698
