# Cross Validation 

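Cross-validation estimates how well a model will perform on unseen data by splitting the dataset into folds, training on some folds and scoring on the held-out fold.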

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

In [13]:
# Load the dataset from a CSV given by a relative path (resolved against the
# notebook's working directory).
boston_data = pd.read_csv("Boston1.csv")

In [14]:
# Preview the first ten rows to check that the file parsed as expected.
boston_data.head(n=10)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [19]:
# Upper-case the existing column labels rather than assigning a hard-coded
# positional list. For this file the result is the same labels
# (crim -> CRIM, ..., medv -> MEDV), but a positional list would silently
# mislabel columns if the CSV's column order ever changed.
boston_data.columns = boston_data.columns.str.upper()

In [20]:
# Show the first five rows to confirm the upper-case column labels.
boston_data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [21]:
# Show the last five rows as a sanity check on the end of the file.
boston_data.tail(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0,0.573,6.03,80.8,2.505,1,273,21.0,396.9,7.88,11.9


Suppose we want to fit a linear regression on our dataset and get an estimate, based on the $R^2$ score, of how well the model will perform on data outside our dataset.

Suppose also that our data is split into three folds: Fold 1, Fold 2, and Fold 3.

In [22]:
# The target is the MEDV column; the features are every remaining column.
y = boston_data["MEDV"]
X = boston_data.drop(columns="MEDV")

In [28]:
# Shared 3-fold splitter; shuffling with a fixed seed keeps the folds
# identical every time the splitter is reused in later cells.
kf = KFold(n_splits=3, shuffle=True, random_state=72018)

In [35]:
# Inspect each fold: the first ten row positions and the size of the
# training and test partitions.
for train_index, test_index in kf.split(X):
    print(
        f'train_index: {train_index[:10]} length={len(train_index)}',
        f"Test index: {test_index[:10]} length={len(test_index)}",
        "________________________________________________________\n",
        sep="\n",
    )

train_index: [ 1  3  4  5  7  8 10 11 12 13] length=337
Test index: [ 0  2  6  9 15 17 19 23 25 26] length=169
________________________________________________________

train_index: [ 0  2  6  9 10 11 12 13 15 17] length=337
Test index: [ 1  3  4  5  7  8 14 16 22 27] length=169
________________________________________________________

train_index: [0 1 2 3 4 5 6 7 8 9] length=338
Test index: [10 11 12 13 18 20 21 24 28 31] length=168
________________________________________________________



In [49]:
# Manual k-fold cross-validation of a linear regression on the raw features.
# Collects one R^2 score per fold in `scores`.
scores = []
lr = LinearRegression()
for train_index, test_index in kf.split(X):
    # kf.split yields *positional* row indices, so rows must be selected with
    # .iloc on both X and y. Plain y[...] is a label lookup and only happens
    # to agree while the index is the default 0..n-1 range.
    X_train, X_test, y_train, y_test = (X.iloc[train_index, :],
                                        X.iloc[test_index, :],
                                        y.iloc[train_index],
                                        y.iloc[test_index])

    # Fit on the training folds only, then score on the held-out fold.
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = r2_score(y_test.values, y_pred)
    scores.append(score)

print(scores)
print(f"\nMax value is {max(scores)} and the split is {scores.index(max(scores))+1}")

[0.671548890572392, 0.7485800157971634, 0.6976436939974648]

Max value is 0.7485800157971634 and the split is 2


#### 2) Now suppose we want to do the same, but appropriately scaling our data as we go through the folds.

In [56]:
# Manual k-fold cross-validation of a linear regression on standardised
# features. Collects one R^2 score per fold in `scores`.
scores = []
lr = LinearRegression()
s = StandardScaler()

for train_index, test_index in kf.split(X):
    # kf.split yields *positional* row indices, so rows must be selected with
    # .iloc on both X and y. Plain y[...] is a label lookup and only happens
    # to agree while the index is the default 0..n-1 range.
    X_train, X_test, y_train, y_test = (X.iloc[train_index, :],
                                        X.iloc[test_index, :],
                                        y.iloc[train_index],
                                        y.iloc[test_index])

    # Fit the scaler on the training folds only and reuse those statistics
    # for the held-out fold, so no test information leaks into training.
    X_train_s = s.fit_transform(X_train)
    lr.fit(X_train_s, y_train)
    X_test_s = s.transform(X_test)
    y_pred = lr.predict(X_test_s)
    score = r2_score(y_test.values, y_pred)
    scores.append(score)
print(scores)
print(f"\nMax value is {max(scores)} and the split is {scores.index(max(scores))+1}")

[0.6715488905723915, 0.7485800157971653, 0.6976436939974642]

Max value is 0.7485800157971653 and the split is 2


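We can see that the r2_score is the same for both scaled and unscaled data (up to floating-point rounding). From this we can conclude that, for plain linear regression, scaling the features changes neither the predictions nor the model's performance; it only rescales the coefficients, which makes them easier to compare. Scaling does affect performance for regularised models such as Lasso and Ridge.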

### `Pipeline` and `cross_val_predict`


In [89]:
# Fresh, unfitted scaler and regressor, to be combined into a pipeline.
s = StandardScaler()
lr = LinearRegression()

In [90]:
# Bundle scaling and regression into a single estimator so both steps are
# fitted and applied together.
estimator = Pipeline(
    steps=[
        ("scaler", s),
        ("regression", lr),
    ]
)

In [91]:
# Echo the splitter to confirm which fold configuration is being reused.
kf

KFold(n_splits=3, random_state=72018, shuffle=True)

In [94]:
# Out-of-fold predictions for every row, generated with the same splitter
# as the manual loops; show only the first ten values.
predictions = cross_val_predict(estimator=estimator, X=X, y=y, cv=kf)
display(predictions[:10])

array([28.90432106, 24.33373055, 31.1560187 , 28.43572753, 27.63511527,
       25.01510886, 22.50567647, 18.75776392, 10.77960571, 18.68709575])

In [95]:
# R^2 (as a percentage) computed once over all out-of-fold predictions.
100 * r2_score(y_true=y, y_pred=predictions)

70.62308732452821

In [98]:
# Mean of the per-fold R^2 scores (as a percentage) from the scaled manual loop.
# It is close to, but not identical to, the score in the previous cell, which
# is computed once over all out-of-fold predictions rather than per fold.
np.mean(scores)*100

70.59242001223403

### How to reach me:
✉: raghavendrakn076@gmail.com
📞: +91 9353888374
🔗: [www.raghavendraportfolio.com](https://www.raghavendraportfolio.com)