<a href="https://colab.research.google.com/github/Nidhi89717/ML/blob/main/03-Cross-Val-and-LinReg-Project/01_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Cross Validation

https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
df = pd.read_csv('gdrive/My Drive/csv_files/Advertising.csv')

In [4]:
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


----
----
----
## Train | Test Split Procedure 

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Test Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to Y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [5]:
X = df.drop('sales',axis=1)

In [6]:
y = df['sales']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit(X_train)

StandardScaler()

In [12]:
X_train = scaler.transform(X_train)

In [13]:
X_test = scaler.transform(X_test)

**Create Model**

In [14]:
from sklearn.linear_model import Ridge

In [15]:
model = Ridge(alpha=100)

In [16]:
model.fit(X_train,y_train)

Ridge(alpha=100)

In [17]:
y_pred = model.predict(X_test)

**Evaluation**

In [18]:
from sklearn.metrics import mean_squared_error

In [19]:
mean_squared_error(y_test,y_pred)

7.34177578903413

**Adjust Parameters and Re-evaluate**

In [20]:
model_two = Ridge(alpha=1)

In [21]:
model_two.fit(X_train,y_train)

Ridge(alpha=1)

In [22]:
y_pred_two = model_two.predict(X_test)

**Another Evaluation**

In [23]:
mean_squared_error(y_test,y_pred_two)

2.319021579428752

----
----
----
## Train | Validation | Test Split Procedure 

This is often also called a "hold-out" set, since one should not adjust parameters based on the final test set, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Eval Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [24]:
X = df.drop('sales',axis=1)

In [25]:
y = df['sales']

In [26]:
from sklearn.model_selection import train_test_split

**SPLIT TWICE! Here we create TRAIN | VALIDATION | TEST** 

In [27]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)

In [28]:
X_eval,X_test,y_eval,y_test = train_test_split(X_other,y_other,test_size=0.5,random_state=101)

In [29]:
len(X)

200

In [30]:
len(X_train)

140

In [31]:
len(X_eval)

30

In [32]:
len(X_test)

30

In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
scaler = StandardScaler()

In [35]:
scaler.fit(X_train)

StandardScaler()

In [36]:
X_train = scaler.transform(X_train)

In [37]:
X_test = scaler.transform(X_test)

In [38]:
X_eval = scaler.transform(X_eval)

**Create Model**

In [39]:
from sklearn.linear_model import Ridge

In [40]:
model_one = Ridge(alpha=100)

In [41]:
model_one.fit(X_train,y_train)

Ridge(alpha=100)

In [42]:
y_eval_predict = model_one.predict(X_eval)

**Evaluation**

In [43]:
from sklearn.metrics import mean_squared_error

In [44]:
mean_squared_error(y_eval,y_eval_predict)

7.320101458823871

**Adjust Parameters and Re-evaluate**

In [45]:
model_two = Ridge(alpha=1)

In [46]:
model_two.fit(X_train,y_train)

Ridge(alpha=1)

In [47]:
new_predict_eval = model_two.predict(X_eval)

**Another Evaluation**

In [48]:
mean_squared_error(y_eval,new_predict_eval)

2.383783075056986

In [49]:
y_final_test_pred = model_two.predict(X_test)

**Final Evaluation (Can no longer edit parameters after this!)**

In [50]:
mean_squared_error(y_test,y_final_test_pred)

2.2542600838005176

----
----
----
## Cross Validation with cross_val_score

----



In [51]:
X =df.drop('sales',axis=1)

In [52]:
y = df['sales']

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [55]:
from sklearn.preprocessing import StandardScaler 

In [56]:
scaler = StandardScaler()

In [57]:
scaler.fit(X_train)

StandardScaler()

In [58]:
X_train = scaler.transform(X_train)

In [59]:
X_test = scaler.transform(X_test)

In [60]:
model = Ridge(alpha=100)

In [61]:
from sklearn.model_selection import cross_val_score

In [62]:
scores = cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [63]:
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [64]:
abs(scores.mean())

8.215396464543607

**Adjust model based on metrics**

In [65]:
model2 = Ridge(alpha=1)

In [66]:
scores = cross_val_score(model2,X_train,y_train,scoring='neg_mean_squared_error',cv=5)

In [67]:
abs(scores.mean())

3.344839296530695

**Final Evaluation (Can no longer edit parameters after this!)**

In [68]:
model2.fit(X_train,y_train)

Ridge(alpha=1)

In [69]:
y_final_test_pred = model2.predict(X_test)

In [70]:
mean_squared_error(y_final_test_pred,y_test)

2.319021579428752

----
----
----

# Cross Validation with cross_validate

The cross_validate function differs from cross_val_score in two ways:

It allows specifying multiple metrics for evaluation.

It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score.

For single metric evaluation, where the scoring parameter is a string, callable or None, the keys will be:
        
        - ['test_score', 'fit_time', 'score_time']

And for multiple metric evaluation, the return value is a dict with the following keys:

    ['test_<scorer1_name>', 'test_<scorer2_name>', 'test_<scorer...>', 'fit_time', 'score_time']

return_train_score is set to False by default to save computation time. To evaluate the scores on the training set as well you need to be set to True.

In [71]:
X = df.drop('sales',axis=1)
y = df['sales']

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [74]:
from sklearn.preprocessing  import StandardScaler

In [75]:
scaler = StandardScaler()

In [76]:
scaler.fit(X_train)

StandardScaler()

In [77]:
X_train = scaler.transform(X_train)

In [79]:
X_test = scaler.transform(X_test)

In [80]:
from sklearn.model_selection import cross_validate

In [81]:
model = Ridge(alpha=100)

In [82]:
scores = cross_validate(model,X_train,y_train,scoring=['neg_mean_squared_error','neg_mean_absolute_error'],cv=10)

In [83]:
scores = pd.DataFrame(scores)

In [84]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.013676,0.002814,-6.060671,-1.810212
1,0.001862,0.00118,-10.627031,-2.541958
2,0.001682,0.001136,-3.993426,-1.469594
3,0.001554,0.001139,-5.009494,-1.862769
4,0.001563,0.001114,-9.1418,-2.520697
5,0.001591,0.001143,-13.086256,-2.459995
6,0.001606,0.00139,-3.839405,-1.451971
7,0.00191,0.001172,-9.058786,-2.377395
8,0.001728,0.001235,-9.055457,-2.443344
9,0.002154,0.001268,-5.778882,-1.899797


In [85]:
scores.mean()

fit_time                        0.002932
score_time                      0.001359
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [86]:
model =Ridge(alpha=1)

In [87]:
scores = cross_validate(model,X_train,y_train,scoring=['neg_mean_squared_error','neg_mean_absolute_error'],cv=10)

In [88]:
scores = pd.DataFrame(scores)

In [89]:
scores.mean()

fit_time                        0.002098
score_time                      0.001292
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

**Final Evaluation (Can no longer edit parameters after this!)**

In [90]:
model.fit(X_train,y_train)

Ridge(alpha=1)

In [91]:
y_final_pred = model.predict(X_test)

In [92]:
mean_squared_error(y_final_pred, y_test)

2.319021579428752