# Diabetes prediction from UCI diabetes data

In [None]:
# Importing all used libraries
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the health-indicator survey extract into a DataFrame and preview
# the first rows.
data_path = "data/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
diabetes_df = pd.read_csv(data_path)
diabetes_df.head()

In [None]:
# Print the frame summary (column names, dtypes and non-null counts).
diabetes_df.info()

In [None]:
# Report (rows, columns), then show per-column summary statistics with one
# row per feature.
print(diabetes_df.shape)
diabetes_df.describe().transpose()


In [None]:
# Flag rows that repeat an earlier row exactly (the first occurrence is not
# flagged), then count how many rows are duplicated vs. unique.
duplicate_rows = diabetes_df.duplicated(keep="first")
print(duplicate_rows.value_counts())

In [None]:
# Remove exact duplicate rows first, then check class balance by counting
# the rows per target class.
diabetes_df = diabetes_df.drop_duplicates()
diabetes_df["Diabetes_binary"].value_counts()

In [None]:
# Count missing values per column.
diabetes_df.isna().sum()

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility),
# then separate the features from the target in each partition.
target_col = "Diabetes_binary"
train_df, test_df = train_test_split(diabetes_df, test_size=0.2, random_state=123)

X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

In [None]:
# Plot one histogram per float64 column of the training data, coloured by
# diabetes class, laid out four charts per row.
alt.data_transformers.enable("vegafusion")

# NOTE(review): if Diabetes_binary is stored as float64 it is included here
# and gets its own panel -- confirm whether that is intended.
numeric_cols = list(train_df.select_dtypes(include=['float64']).columns)

# Encodings: the repeated column binned on x, un-stacked counts on y so the
# two classes overlay, and the target class as colour.
x_binned = alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=20))
y_count = alt.Y('count()').stack(False)
class_color = alt.Color('Diabetes_binary:N')

single_hist = (
    alt.Chart(train_df)
    .mark_bar(opacity=0.7)
    .encode(x=x_binned, y=y_count, color=class_color)
    .properties(width=150, height=150)
)
hist_plot = single_hist.repeat(numeric_cols, columns=4)

hist_plot

In [None]:
# Baseline: cross-validate a DummyClassifier on the training data and
# tabulate the per-fold timings and train/validation scores.
dummy = DummyClassifier()
scores = cross_validate(
    estimator=dummy,
    X=X_train,
    y=y_train,
    return_train_score=True,
)
pd.DataFrame(scores)

# Model comparison

In [None]:
# Feature groups used by the preprocessing step: columns passed through
# untouched (binary flags) and columns that get standardised.
binary_cols = [
    'HighBP',
    'HighChol',
    'CholCheck',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'DiffWalk',
    'Sex',
]
continuous_cols = [
    'BMI',
    'Age',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'Education',
    'Income',
]


In [None]:
# Pre-processor: standardise the continuous columns and pass the binary
# columns through unchanged.
scale_continuous = ('continuous', StandardScaler(), continuous_cols)
keep_binary = ('binary', 'passthrough', binary_cols)

preprocessor = ColumnTransformer(transformers=[scale_continuous, keep_binary])

In [None]:
# Candidate classifiers; each is wrapped in a pipeline that applies the
# shared preprocessor before the estimator.
estimators = {
    "Dummy": DummyClassifier(),
    "Decision tree": DecisionTreeClassifier(random_state=123),
    "Logistic regression": LogisticRegression(),
    "Knn": KNeighborsClassifier(),
}
models = {
    label: make_pipeline(preprocessor, estimator)
    for label, estimator in estimators.items()
}

In [None]:
# Score every candidate pipeline: mean 5-fold cross-validation accuracy on
# the training data, plus accuracy on the held-out test set after refitting
# on the full training data. Both scores are rounded to two decimals.
results_dict = {}

for model_name, model in models.items():
    cv_mean = round(cross_val_score(model, X_train, y_train, cv=5).mean(), 2)

    model.fit(X_train, y_train)
    test_acc = round(accuracy_score(y_test, model.predict(X_test)), 2)

    results_dict[model_name] = (cv_mean, test_acc)

# One row per model, one column per metric.
results_df = pd.DataFrame.from_dict(
    results_dict,
    orient='index',
    columns=['Mean CV Score', 'Test Accuracy'],
)
results_df

# Feature Importance

In [None]:
# Standardise every feature by hand: the scaler is fitted on the training
# features only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a logistic regression on the scaled training features and list the
# learned coefficient next to each feature name.
lr = LogisticRegression().fit(X_train_scaled, y_train)
feature_names = train_df.drop(columns=["Diabetes_binary"]).columns
pd.DataFrame({"features": feature_names, "coefficients": lr.coef_[0]})

## Exploring Hyperparameters

The logistic regression model had the highest accuracy score of the models we explored. However, the knn model was the second-best model, with a cross-validation accuracy only 0.03 lower than that of the regression model. As such, we will now explore the hyperparameters of the knn model to see if we can improve this score. 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Tune the number of neighbours of the knn pipeline with a randomized
# search, scored by cross-validation on the training data.
knn_pipe = make_pipeline(preprocessor, KNeighborsClassifier())

param_grid = {
    "kneighborsclassifier__n_neighbors": [50, 100, 200, 300, 500],
}

# The grid holds only a handful of discrete candidates, so asking for more
# iterations than that makes scikit-learn warn and fall back to the grid
# size. Request exactly one iteration per candidate instead.
n_candidates = len(param_grid["kneighborsclassifier__n_neighbors"])

first_search = RandomizedSearchCV(
    knn_pipe,
    param_distributions=param_grid,
    n_iter=n_candidates,
    n_jobs=-1,
    return_train_score=True,
    random_state=123,  # same seed as the rest of the notebook, for reproducibility
)
first_search.fit(X_train, y_train)


In [None]:
# Report the winning hyperparameters and their mean cross-validation score.
search_summary = (
    ("the best parameter:", first_search.best_params_),
    ("the best score:", first_search.best_score_),
)
for label, value in search_summary:
    print(label, value)