In [130]:
import pandas as pd
import numpy as np
import pprint

from sklearn.base import is_classifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, check_cv

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble  import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [29]:
data = pd.read_csv("datasets/diabetes.csv")
data.head(7)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1


In [30]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [31]:
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [32]:
#data = data.iloc[:, :8].replace(0, np.nan)
data.iloc[:, :8] = data.iloc[:, :8].replace(0, np.NaN)
data.isna().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [33]:
# Fill missing values only in Glucose column
print("Glucose mean: {}".format(data.Glucose.mean()))
data[data.Glucose.isna()]

Glucose mean: 121.6867627785059


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
75,1.0,,48.0,20.0,,24.7,0.14,22,0
182,1.0,,74.0,20.0,23.0,27.7,0.299,21,0
342,1.0,,68.0,35.0,,32.0,0.389,22,0
349,5.0,,80.0,32.0,,41.0,0.346,37,1
502,6.0,,68.0,41.0,,39.0,0.727,41,1


In [34]:
data[['Glucose']] = data[['Glucose']].fillna(data.Glucose.mean(), inplace=False)
# Count the number of NaN values in each column
data.isnull().sum()

Pregnancies                 111
Glucose                       0
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [35]:
data.iloc[[75, 182, 342, 349, 502]]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
75,1.0,121.686763,48.0,20.0,,24.7,0.14,22,0
182,1.0,121.686763,74.0,20.0,23.0,27.7,0.299,21,0
342,1.0,121.686763,68.0,35.0,,32.0,0.389,22,0
349,5.0,121.686763,80.0,32.0,,41.0,0.346,37,1
502,6.0,121.686763,68.0,41.0,,39.0,0.727,41,1


In [36]:
# Fill missing values with mean in each column
data.fillna(data.mean(), inplace=True)
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [37]:
# Split dataset into inputs and outputs
values = data.values
X = values[:,0:8]
y = values[:,8]

In [108]:
# Initiate the LR model with random hyperparameters
lr = LogisticRegression(penalty='l2', dual=False, max_iter=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Pass data to the LR model
lr.fit(X_train, y_train)
print("Score on train data: {:.4}".format(lr.score(X_train, y_train)))
print("Score on test data: {:.4}".format(lr.score(X_test, y_test)))

Score on train data: 0.7938
Score on test data: 0.7323


In [140]:
# You will need the following dependencies for applying Cross-validation and evaluating the cross-validated score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Build the k-fold cross-validator
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

result = cross_val_score(lr, X_train, y_train, cv=kfold, scoring='accuracy')
print("Cross validation score (n_spilts={}): {:.2%}".format(kfold.n_splits, result.mean()))

Cross validation score (n_spilts=5): 78.99%


In [141]:
cv = check_cv(cv=5, y=y_train, classifier=is_classifier(lr))
cv_iter = list(cv.split(X_train, y_train))

result = cross_val_score(lr, X_train, y_train, cv=cv, scoring='accuracy')
print("Cross validation score (n_spilts={}): {:.2%}".format(cv.n_splits, result.mean()))

Cross validation score (n_spilts=5): 79.18%


In [116]:
dual =     [True, False]
max_iter = [300, 400, 500, 700]
C =        [0.001, 0.01, 0.1, 1, 10]

param_grid = dict(dual=dual, max_iter=max_iter, C=C)

grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1)

grid_result = grid.fit(X_train, y_train)
# Summarize results
print("Best: %.2f%% using %s" % (grid_result.best_score_ * 100, grid_result.best_params_))

Best: 79.38% using {'C': 1, 'dual': False, 'max_iter': 300}


### LogisticRegression

In [117]:
# LogisticRegression
steps = [('normalizer', StandardScaler()), ('classifier', LogisticRegression())]
pipe = Pipeline(steps)

param_grid = {
     #'classifier__solver': ['lbfgs'], 
     'classifier__max_iter': max_iter,
     'classifier__dual': dual,
     'classifier__C': C, 
     #'classifier__alpha': 10.0 ** -np.arange(7, 10), 
     #'classifier__hidden_layer_sizes': np.arange(7, 10)
}

#grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=-1)
grid = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, cv=3, n_jobs=-1, random_state=42)
grid_result = grid.fit(X_train, y_train)
# Summarize results
print("Best: %.2f%% using %s" % (grid_result.best_score_ * 100, grid_result.best_params_))

Best: 79.58% using {'classifier__max_iter': 400, 'classifier__dual': False, 'classifier__C': 10}


### RandomForestClassifier

In [124]:
param_grid = {'classifier__max_depth':        [int(x) for x in np.linspace(10, 110, num = 11)],# №[6, 9, None], 
              'classifier__n_estimators':     [50, 70, 100, 150],
              'classifier__max_features':     ['auto', 'sqrt'],
              'classifier__criterion':        ['gini', 'entropy'],
              'classifier__bootstrap':        [True, False],
              'classifier__min_samples_leaf': [1, 2, 3, 4]} 

steps = [('normalizer', StandardScaler()), ('classifier', RandomForestClassifier())]
pipe = Pipeline(steps)
grid = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, cv=3, n_jobs=-1, random_state=42)

grid_result = grid.fit(X_train, y_train)
# Summarize results
pprint.pprint("Best: %.2f%% using %s" % (grid_result.best_score_ * 100, grid_result.best_params_), width=40)

('Best: 77.44% using '
 "{'classifier__n_estimators': 70, "
 "'classifier__min_samples_leaf': 3, "
 "'classifier__max_features': 'sqrt', "
 "'classifier__max_depth': 40, "
 "'classifier__criterion': 'gini', "
 "'classifier__bootstrap': True}")


In [156]:
score = cross_val_score(grid, X_train, y_train, cv=5)
score.mean()

0.7723396154578336

In [157]:
y_pred = grid.predict(X_test)
grid.score(X_test, y_test)

0.7598425196850394