# Model Validation Methods

#### 1.Evaluate using a train and a test set

In [1]:
# Evaluate using a train and a test set
from pandas import read_csv
from sklearn.model_selection import train_test_split #holdout
from sklearn.linear_model import LogisticRegression 

filename = 'https://raw.githubusercontent.com/slmsshk/pima-indians-diabetes.data.csv/main/pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,random_state=0)
model = LogisticRegression(max_iter=400)
model.fit(X_train, Y_train)
train_accuracy=model.score(X_train, Y_train)
test_accuracy=model.score(X_test, Y_test)



In [2]:
print(train_accuracy, test_accuracy)

0.7704280155642024 0.7795275590551181


In [7]:
dataframe['class'].value_counts()

class
0    500
1    268
Name: count, dtype: int64

In [5]:
from sklearn.metrics import classification_report
train_pred=model.predict(X_train)
test_pred=model.predict(X_test)

print(classification_report(Y_train,train_pred))
print(classification_report(Y_test,test_pred))


              precision    recall  f1-score   support

         0.0       0.79      0.87      0.83       330
         1.0       0.72      0.59      0.65       184

    accuracy                           0.77       514
   macro avg       0.76      0.73      0.74       514
weighted avg       0.77      0.77      0.76       514

              precision    recall  f1-score   support

         0.0       0.79      0.91      0.85       170
         1.0       0.73      0.52      0.61        84

    accuracy                           0.78       254
   macro avg       0.76      0.71      0.73       254
weighted avg       0.77      0.78      0.77       254



#### 2.Evaluate using Cross Validation

In [12]:
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [10]:
filename = 'https://raw.githubusercontent.com/slmsshk/pima-indians-diabetes.data.csv/main/pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array=dataframe.values
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [12]:
help(cross_val_score)

Help on function cross_val_score in module sklearn.model_selection._validation:

cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=nan)
    Evaluate a score by cross-validation.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.
    
    y : array-like of shape (n_samples,) or (n_samples, n_outputs),             default=None
        The target variable to try to predict in the case of
        supervised learning.
    
    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv

In [13]:
X = array[:,0:8]
Y = array[:,8]
# num_folds = 10
# seed = 7
kfold = KFold()
model = LogisticRegression(max_iter=500)
results = cross_val_score(model, X, Y,cv=kfold)

In [14]:
results


array([0.77272727, 0.72077922, 0.76623377, 0.82352941, 0.77124183])

In [15]:
results.mean()*100.0

77.0902300314065

In [16]:
results.std()*100.0

3.2579677643937455

#### 3.Evaluate using Leave One Out Cross Validation

In [17]:
df.shape

(767, 9)

In [17]:
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut,KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename = 'https://raw.githubusercontent.com/slmsshk/pima-indians-diabetes.data.csv/main/pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
#df=dataframe.drop(0)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [18]:
array = df.values
X = array[:,0:8]
Y = array[:,8]
lcv = LeaveOneOut()
model = LogisticRegression(max_iter=300)
results = cross_val_score(model, X, Y, cv=lcv)

In [19]:
results

array([1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0.,
       1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1.,
       1., 0., 1., 1., 1.

In [None]:
X.shape

(767, 8)

In [20]:
results.mean()*100.0

77.83572359843546

In [22]:
results.std()*100.0

41.53519581934703