# Model Validation Methods

## 1.Evaluate using a train and a test set

In [1]:
# Evaluate using a train and a test set
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
print(f'Rows: {data.shape[0]}\nColumns: {data.shape[1]}')
data.head()

Rows: 768
Columns: 9


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [4]:
print(f'Train: {train_result*100}\nTest: {test_result*100}')

Train: 76.35009310986965
Test: 77.92207792207793


In [5]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [6]:
print(f'Train: {train_result*100}\nTest: {test_result*100}')

Train: 78.58472998137802
Test: 76.19047619047619


In [7]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [8]:
print(f'Train: {train_result*100}\nTest: {test_result*100}')

Train: 79.14338919925513
Test: 75.32467532467533


### With changes in random state, the accuracy of the data also changes.

## 2.Evaluate using Cross Validation

In [18]:
# Evaluate using Cross Validation
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=10, shuffle = True, random_state=42)
model = LogisticRegression(max_iter=400)
results = cross_val_score(model, x, y , cv = kfold)

In [19]:
results

array([0.7012987 , 0.80519481, 0.72727273, 0.84415584, 0.83116883,
       0.67532468, 0.85714286, 0.77922078, 0.69736842, 0.78947368])

In [20]:
results.mean()*100.0

77.0762132604238

In [22]:
results.var()*100

0.39315957318574507

In [21]:
results.std()*100.0

6.270243800569042

## 3.Evaluate using Leave One Out Cross Validation

In [23]:
# Evaluate using Leave One Out Cross Validation
from sklearn.model_selection import LeaveOneOut
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=300)
results = cross_val_score(model, x , y , cv = loocv)

In [24]:
results

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [25]:
results.mean()*100.0

77.60416666666666

In [26]:
results.var()*100

17.380099826388886

In [27]:
results.std()*100.0

41.68944689773287