**Important: this notebook deliberately performs no feature engineering. It focuses only on the concepts of model selection and optimization.**

## **Cross validation**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the Pima Indians diabetes dataset from a CSV file located in the
# current working directory (relative path).
data = pd.read_csv('pima-indians-diabetes.csv')
print(data.shape)  # (rows, columns)
data.head()  # preview the first five rows

(768, 9)


Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Separate the predictors from the target column.
# `columns=` already identifies the axis being dropped from, so no `axis`
# argument is passed alongside it.
X = data.drop(columns='class')
y = data['class']

In [4]:
# Evaluate a logistic-regression classifier with shuffled k-fold
# cross-validation and report the mean and spread of the per-fold scores.
k = 10

# A fixed random_state makes the shuffled fold assignment reproducible.
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=500)

# One score per fold (the classifier's default score, i.e. accuracy).
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(results)

mean_pct = results.mean() * 100
std_pct = results.std() * 100
print('Accuracy: %.3f%% (%.3f%%)' % (mean_pct, std_pct))

[0.7012987  0.80519481 0.72727273 0.84415584 0.83116883 0.67532468
 0.85714286 0.77922078 0.69736842 0.78947368]
Accuracy: 77.076% (6.270%)


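Our model achieves a mean accuracy of about 77% across the 10 folds, with a standard deviation of 6.27 percentage points. In other words, one standard deviation around the mean spans roughly 70.806% to 83.346%; individual folds can fall outside this range.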

## **Leave one out cross validation**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from numpy import mean
from numpy import absolute
from numpy import sqrt
import pandas as pd

In [6]:
# Small toy regression dataset: two predictors (x1, x2) and a response (y).
# Each tuple is one observation, written as (x1, x2, y).
observations = [
    (2, 14, 6),
    (5, 12, 8),
    (4, 12, 12),
    (3, 13, 14),
    (4, 7, 14),
    (6, 8, 15),
    (7, 7, 17),
    (5, 4, 22),
    (8, 6, 24),
    (9, 5, 23),
]
data = pd.DataFrame(observations, columns=['x1', 'x2', 'y'])

data

Unnamed: 0,x1,x2,y
0,2,14,6
1,5,12,8
2,4,12,12
3,3,13,14
4,4,7,14
5,6,8,15
6,7,7,17
7,5,4,22
8,8,6,24
9,9,5,23


In [7]:
# Predictors are the first two columns (x1, x2); the response is the third (y).
# Selecting by label instead of position makes the intent explicit.
X = data.loc[:, ['x1', 'x2']]
y = data.loc[:, 'y']

In [8]:
# Cross-validation strategy: leave-one-out, i.e. every sample is used exactly
# once as a single-observation test set while the model trains on the rest.

cv = LeaveOneOut()

In [9]:
# Linear regression with default settings.
# Note: this rebinds `model`, which previously held the logistic-regression
# classifier used in the k-fold section.
model = LinearRegression()

In [10]:
# Score the model with LOOCV. Every fold holds out a single sample, so each
# entry of `scores` is the negated absolute error for that one sample.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,  # run the folds on all available CPU cores
)

In [11]:
# Mean Absolute Error: undo the sign flip applied by the 'neg_' scorer,
# then average over all held-out samples.

absolute(scores).mean()

3.1461548083469735

## **GridSearchCV for model optimizations**

In [12]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split 

In [13]:
# Reload the diabetes dataset: `data` was rebound to the small toy frame in
# the leave-one-out section, so it has to be read from disk again here.
data = pd.read_csv('pima-indians-diabetes.csv')
print(data.shape)  # (rows, columns)
data.head()  # preview the first five rows

(768, 9)


Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
# Target is the 'class' column; the features are every remaining column.
y = data['class']
X = data.drop(columns='class')

In [15]:
# Stage 1: hold out 20% of all rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: carve the validation set out of the remaining 80%.
# 25% of that remainder is 20% of the full dataset, which yields an overall
# 60/20/20 train/validation/test partition.
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

In [16]:
# Report how many rows ended up in each partition.
# Rows are counted directly on the feature frames. Adding X and y first
# (DataFrame + Series) would align the Series index against the frame's
# columns and build a large all-NaN frame only to take its length.
print('Total dataset:', len(data))
print('Total test data:', len(X_test))
print('Total training data:', len(X_train2))
print('Total validation data:', len(X_val))

# Sanity check: the three partitions must account for every row exactly once.
assert len(X_test) + len(X_train2) + len(X_val) == len(data)

Total dataset: 768
Total test data: 154
Total training data: 460
Total validation data: 154


In [17]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyperparameters; this is the
# estimator whose hyperparameters the grid search below tunes.
knn_clf = KNeighborsClassifier()

In [18]:
# List every constructor hyperparameter and its default value, queried on a
# fresh default instance, to see what can be put into the search grid.
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

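Reference: [scikit-learn `KNeighborsClassifier` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), which describes each of the parameters listed above.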

In [19]:
# Search space for the grid search: every combination of the values below
# is evaluated (8 * 4 * 5 * 2 = 320 candidates).
# NOTE(review): 'algorithm' and 'leaf_size' select how neighbours are looked
# up (speed / memory), not which neighbours are found, so they presumably do
# not change the scores - consider dropping them to shrink the grid 20-fold.
param_grid = dict(
    n_neighbors=list(range(1, 9)),
    algorithm=('auto', 'ball_tree', 'kd_tree', 'brute'),
    leaf_size=[10, 20, 30, 40, 50],
    weights=('uniform', 'distance'),
)

In [20]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid`, scoring every candidate with
# 20-fold cross-validation on whatever data is later passed to fit().
GS = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=20)

In [21]:
# Run the grid search on the training partition only; the validation and
# test partitions are not passed in here.
GS.fit(X_train2, y_train2)

GridSearchCV(cv=20, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [10, 20, 30, 40, 50],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8],
                         'weights': ('uniform', 'distance')})

In [22]:
# Parameter combination that obtained the best mean cross-validated score.
GS.best_params_

{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 8, 'weights': 'uniform'}

In [23]:
# Estimator configured with the best parameter combination found by the search.
GS.best_estimator_

KNeighborsClassifier(leaf_size=10, n_neighbors=8)

In [24]:
# Mean cross-validated score of every candidate, one entry per combination.
# NOTE(review): this displays all 320 values; the values shown repeat with a
# period of 16 entries, so a slice would convey the same information.
GS.cv_results_['mean_test_score']

array([0.66521739, 0.66521739, 0.68695652, 0.66521739, 0.69347826,
       0.69130435, 0.69347826, 0.68695652, 0.72173913, 0.70869565,
       0.72173913, 0.70869565, 0.72173913, 0.72826087, 0.7326087 ,
       0.72608696, 0.66521739, 0.66521739, 0.68695652, 0.66521739,
       0.69347826, 0.69130435, 0.69347826, 0.68695652, 0.72173913,
       0.70869565, 0.72173913, 0.70869565, 0.72173913, 0.72826087,
       0.7326087 , 0.72608696, 0.66521739, 0.66521739, 0.68695652,
       0.66521739, 0.69347826, 0.69130435, 0.69347826, 0.68695652,
       0.72173913, 0.70869565, 0.72173913, 0.70869565, 0.72173913,
       0.72826087, 0.7326087 , 0.72608696, 0.66521739, 0.66521739,
       0.68695652, 0.66521739, 0.69347826, 0.69130435, 0.69347826,
       0.68695652, 0.72173913, 0.70869565, 0.72173913, 0.70869565,
       0.72173913, 0.72826087, 0.7326087 , 0.72608696, 0.66521739,
       0.66521739, 0.68695652, 0.66521739, 0.69347826, 0.69130435,
       0.69347826, 0.68695652, 0.72173913, 0.70869565, 0.72173

In [25]:
# Mean cross-validated score achieved by the best parameter combination.
GS.best_score_

0.7326086956521739

In [29]:
# Persist the complete grid-search report, best-ranked candidate first, so
# the combinations can be compared outside the notebook.

df = pd.DataFrame(GS.cv_results_).sort_values('rank_test_score')
df.to_csv('cv_results.csv', encoding='utf-8', sep=',')