In [1]:
import pandas as pd
import numpy as np

# importing ridge regression as our model as it contains basic(1) hyperparameter to tune (for learning)
from sklearn.linear_model import Ridge

# basic libraires data transformation for modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# model metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# cross_val_score to perform cross validation using single scoring metrics
from sklearn.model_selection import cross_val_score

# cross_validate to perform corss validation using multiple scoring metrics 
from sklearn.model_selection import cross_validate

# used to perform grid search for a model to find out best combination of parameters 
from sklearn.model_selection import GridSearchCV

# elastic net model used to practice grid search as it contains 2 parameters 
from sklearn.linear_model import ElasticNet

In [2]:
df = pd.read_csv("../DATA/Advertising.csv")

In [3]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## cross_val_score
- cross_val_score (estimator, x_train, y_train, scoring = 'desired scoring method', cv = number of k fold cross validation to be performed)
- the cross_val_score takes the model(estimator), x_train, y_train, scoring method and cv count and perfroms cross validation of data, fits it into the model and checks the model perfromance based on passed score

In [4]:
## CREATE X and y
x = df.drop('sales',axis=1)
y = df['sales']

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)

# SCALE DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [5]:
model_3 = Ridge(alpha = 100)

In [6]:
score = cross_val_score(model_3,x_train,y_train, scoring='neg_mean_squared_error',cv = 5)

In [7]:
# to view the  the negative mean_squared error value for evey fold
score

array([-3.1574411 , -1.61190525, -5.37588672, -2.23984591, -4.3264032 ])

In [8]:
# as the lower mean square error means better performance 
abs(score.mean())

3.342296435841239

In [9]:
# retune the alpha value and rerun the model
model_4 = Ridge(alpha=0.001)

In [10]:
score_2 = cross_val_score(model_4,x_train,y_train, scoring='neg_mean_squared_error',cv = 5)
score_2

array([-3.1393286 , -1.62246299, -5.37383749, -2.24224624, -4.34167069])

In [11]:
abs(score_2.mean())

3.343909203511349

- MODEL SLIGHTY PERFORMS BETTER

- THIS IS HOW WE PERFORM CROSS VALIDATION DIRECTLY USING A MODEL

## cross_validate
- The cross_validate function differs from cross_val_score in two ways:
- It allows specifying multiple scoring metrics for evaluation.
- It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score

##### cross_validate (estimator,x_train,y_train, scoring = ['scoring1 ','scoring 2',....], cv = 5)
- IT TAKES ESTIMATOR, X & Y TRAIN AND A LIST OF MULTIPLE SCORING METRICS AND NUMBER OF FOLDS IN VALIDATION PROCESS
- it trains the model on specified(cv) numner of validation set and measures the performace based on passed scoring metrics

In [12]:
model_5 = Ridge(alpha = 100)

In [13]:
scores = cross_validate(model_4,x_train,y_train,
                         scoring=['neg_mean_absolute_error','neg_mean_squared_error','max_error'],cv=5)

In [14]:
scores = pd.DataFrame(scores)

In [15]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_max_error
0,0.004007,0.002999,-1.545316,-3.139329,-3.034514
1,0.002514,0.002007,-1.030716,-1.622463,-2.910504
2,0.002,0.002,-1.387854,-5.373837,-9.34373
3,0.003534,0.005022,-1.169165,-2.242246,-4.011451
4,0.000995,0.002511,-1.467605,-4.341671,-6.453912


In [16]:
scores.mean()

fit_time                        0.002610
score_time                      0.002908
test_neg_mean_absolute_error   -1.320131
test_neg_mean_squared_error    -3.343909
test_max_error                 -5.150822
dtype: float64

In [17]:
model_5 = Ridge(alpha = 0.1)
scores_2 = cross_validate(model_5,x_train,y_train,
scoring=['neg_mean_absolute_error','neg_mean_squared_error','max_error'],cv=5)
scores_2 = pd.DataFrame(scores_2)

In [18]:
scores_2.mean()

fit_time                        0.002420
score_time                      0.002505
test_neg_mean_absolute_error   -1.320132
test_neg_mean_squared_error    -3.343907
test_max_error                 -5.150809
dtype: float64