# Diabetes prediction from UCI diabetes data

In [None]:
# Importing all used libraries
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the raw CSV into a DataFrame and preview the first rows
data_path = "data/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
diabetes_df = pd.read_csv(data_path)
diabetes_df.head()

In [None]:
# Print a concise summary of the frame (column names, dtypes, non-null counts)
diabetes_df.info()

In [None]:
# Report (rows, columns), then show summary statistics with one row per column
df_shape = diabetes_df.shape
print(df_shape)
summary_stats = diabetes_df.describe()
summary_stats.transpose()


In [None]:
# Flag rows that repeat an earlier row, then count flagged vs. unflagged rows
duplicate_rows = diabetes_df.duplicated()
duplicate_counts = duplicate_rows.value_counts()
print(duplicate_counts)

In [None]:
# Remove duplicated rows, then check the class balance of the target column
diabetes_df = diabetes_df.drop_duplicates()
diabetes_df["Diabetes_binary"].value_counts()

In [None]:
# Count missing values in each column
missing_mask = diabetes_df.isna()
missing_mask.sum()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
target_col = "Diabetes_binary"
train_df, test_df = train_test_split(diabetes_df, test_size=0.2, random_state=123)

# Separate the predictors from the target in each split
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [None]:
# Grid of overlaid histograms: one panel per float64 column of the training
# split, bars coloured by the Diabetes_binary class.
# NOTE(review): the vegafusion transformer is presumably enabled to cope with
# the size of the training frame -- confirm.
alt.data_transformers.enable("vegafusion")
numeric_cols = train_df.select_dtypes(include=['float64']).columns.to_list()

# Semi-transparent bars so the overlaid (unstacked) classes stay visible
hist_bars = alt.Chart(train_df).mark_bar(opacity=0.7)
hist_encoded = hist_bars.encode(
    x=alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=20)),
    y=alt.Y('count()').stack(False),
    color=alt.Color('Diabetes_binary:N'),
)
hist_panel = hist_encoded.properties(width=150, height=150)

# Repeat the panel for every selected column, four panels per row
hist_plot = hist_panel.repeat(numeric_cols, columns=4)

hist_plot

In [None]:
# Baseline: cross-validate a DummyClassifier on the training data and
# tabulate the per-fold timings and train/test scores
dummy = DummyClassifier()
scores = cross_validate(
    dummy,
    X_train,
    y_train,
    return_train_score=True,
)
pd.DataFrame(data=scores)

# Model comparison

In [None]:
# Columns passed through unchanged by the preprocessor
binary_cols = [
    'HighBP',
    'HighChol',
    'CholCheck',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'DiffWalk',
    'Sex',
]

# Columns that the preprocessor standardises
continuous_cols = [
    'BMI',
    'Age',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'Education',
    'Income',
]


In [None]:
# Pre-processor: standardise the continuous columns, pass the binary ones through
column_steps = [
    ('continuous', StandardScaler(), continuous_cols),
    ('binary', 'passthrough', binary_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Candidate classifiers; each is wrapped in a pipeline behind the shared preprocessor
classifiers = {
    "Dummy": DummyClassifier(),
    "Decision tree": DecisionTreeClassifier(random_state=123),
    "Logistic regression": LogisticRegression(),
    "Knn": KNeighborsClassifier(),
}
models = {
    label: make_pipeline(preprocessor, classifier)
    for label, classifier in classifiers.items()
}

In [None]:
# Cross-validation helper adapted from the DSCI 571 lecture notes.
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Cross-validate a model and summarise each result column as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        extra keyword arguments forwarded unchanged to ``cross_validate``
        (for example ``cv`` or ``return_train_score``)

    Returns
    ----------
        pandas Series of strings, indexed by the keys that ``cross_validate``
        returned; each entry is the mean over folds followed by the standard
        deviation over folds, both formatted to three decimals
    """
    # Build the per-fold table once and derive both statistics from it
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Both Series come from the same columns, so they line up position by position
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)
# Adapted from the DSCI 571 lecture notes and lab solutions.

# Run 10-fold cross-validation on the training data for every candidate
# pipeline, keeping the train scores alongside the validation scores
results = {
    name: mean_std_cross_val_scores(
        pipeline, X_train, y_train, cv=10, return_train_score=True
    )
    for name, pipeline in models.items()
}

# One row per model, one column per cross-validation metric
results_df = pd.DataFrame(results).transpose()
results_df


# Feature Importance

In [None]:
# Standardise every feature column by hand: the scaler learns its statistics
# from the training split only and is then applied to both splits.
# NOTE(review): this scales all columns of X_train, whereas `preprocessor`
# scales only `continuous_cols` -- confirm that the difference is intended.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)

In [None]:
# Fit a logistic regression on the scaled training data and list the
# coefficient learned for each feature
lr = LogisticRegression().fit(X_train_scaled, y_train)

# Feature names in the same column order as the scaled training matrix
cols = X_train.columns
data = {"features": cols, "coefficients": lr.coef_[0]}
pd.DataFrame(data)