# Diabetes Classification

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [None]:
# Read the dataset from a CSV file located relative to the working directory,
# then preview the first rows.
df = pd.read_csv("diabetes.csv")
df.head()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

### Visualization

In [None]:
# Class balance: number of rows for each value of the Outcome column.
df['Outcome'].value_counts()

In [None]:
# Bar chart of how many rows fall into each Outcome class.
# The variable is passed by keyword: recent seaborn releases treat the first
# positional argument as `data` and accept x/y only as keywords.
sns.countplot(x='Outcome', data=df, label='Count')

In [None]:
# One histogram per numeric column, drawn on a single 20x20-inch figure.
df.hist(figsize=(20,20))

In [None]:
# Grid of pairwise plots between the columns, coloured by the Outcome value.
sns.pairplot(df, hue='Outcome')

In [None]:
# Pairwise correlation of all columns, shown as an annotated heatmap
# (two decimals per cell, reversed "coolwarm" palette).
corr_matrix = df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt= '.2f', cmap='coolwarm_r')

### Preprocess

In [None]:
# 0 = NaN: in columns 1..5 a zero stands for a missing measurement, so it is
# converted to NaN and then filled with the column mean.
#
# The columns are addressed by label and assigned as whole columns so their
# dtype can change from integer to float; writing NaN/floats into integer
# columns through `iloc` relies on silent upcasting, which pandas deprecates.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
measurement_cols = df.columns[1:6]
df[measurement_cols] = df[measurement_cols].replace(0, np.nan)

# Keep only rows that still have at least two non-missing values.
df.dropna(thresh=2, axis=0, inplace=True)

# NOTE(review): the imputer is fit on the full dataset, i.e. before the
# train/test split performed later, so the column means include rows that
# end up in the test set. Consider imputing inside a pipeline instead.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(df[measurement_cols])
df[measurement_cols] = imputer.transform(df[measurement_cols])

In [None]:
# Separate the Outcome label from the predictor columns, then hold out 20% of
# the rows for testing (fixed seed so the split is repeatable).
y = df['Outcome']
X = df.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Model Training + Evaluation

In [None]:
# Candidate classifiers, each instantiated with its default hyper-parameters.
model_classes = (
    KNeighborsClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
)
models = [model_cls() for model_cls in model_classes]

In [None]:
# Fit every candidate on the scaled training split and record its accuracy on
# the held-out test split.
tr_names = [type(clf).__name__ for clf in models]
tr_scores = [
    accuracy_score(y_test, clf.fit(X_train, y_train).predict(X_test))
    for clf in models
]

tr_df = pd.DataFrame({'Name': tr_names, 'Score': tr_scores})
print(tr_df)

In [None]:
# K-fold Cross Validation
#
# Each model is wrapped in a pipeline with its own StandardScaler and scored
# with 5-fold cross-validation on the unscaled X; the mean fold score is kept.

cv_names = [type(estimator).__name__ for estimator in models]
cv_scores = [
    cross_val_score(make_pipeline(StandardScaler(), estimator), X, y, cv=5).mean()
    for estimator in models
]

cv_df = pd.DataFrame({'Name': cv_names, 'Mean_Score': cv_scores})
print(cv_df)

In [None]:
# Grid search over the logistic-regression parameter C.
# NOTE(review): the search is run on the unscaled X with a bare
# LogisticRegression, whereas the final model below is evaluated inside a
# StandardScaler pipeline - confirm that tuning without scaling is intended.
c_candidates = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': c_candidates}
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid)
grid.fit(X, y)

In [None]:
# Report the grid-search result, then evaluate a scaled logistic-regression
# pipeline with the selected C using 5-fold cross-validation.
best_c = grid.best_params_['C']
print(f"Best score: {100 * grid.best_score_:0.2f}%")
print(f"Best estimator for parameter C: {best_c:f}")

# Use the value chosen by the search instead of a hard-coded constant, so the
# final model always matches the parameter reported above.
classifier_pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=best_c))
scores_final = cross_val_score(classifier_pipeline, X, y, cv=5).mean()
print(f"Final Logistic Regression model Accuracy: {100 * scores_final:0.2f}%")

In [None]:
# Credits: external notebook listed by the author as inspiration.
'''
Inspiration
1. https://github.com/surabhim/Diabetes/blob/master/Diabetes.ipynb
'''