In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Purposes of GridSearchCv and KFoldCV

### GridSearchCV
- Used to tune hyperparameters by searching through a specified grid of values.
- Uses cross-validation internally to evaluate each combination.
- Helps improve model performance by selecting the best parameter set.
    
### KFold Cross-Validation
- Used to evaluate model performance on different subsets of data.
- Helps reduce overfitting by ensuring the model is tested on multiple data splits.
- Does not tune hyperparameters but rather provides a robust estimate of model accuracy.


In [4]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the dataset
california = fetch_california_housing()

# Convert to a Pandas DataFrame
df = pd.DataFrame(california.data, columns=california.feature_names)

# Add the target column and rename it to 'Price'
df['Price'] = california.target

# Drop the specified columns
df.drop(columns=['Longitude', 'HouseAge', 'AveBedrms'], inplace=True)

# Display the first 5 rows
print(df.head())


   MedInc  AveRooms  Population  AveOccup  Latitude  Price
0  8.3252  6.984127       322.0  2.555556     37.88  4.526
1  8.3014  6.238137      2401.0  2.109842     37.86  3.585
2  7.2574  8.288136       496.0  2.802260     37.85  3.521
3  5.6431  5.817352       558.0  2.547945     37.85  3.413
4  3.8462  6.281853       565.0  2.181467     37.85  3.422


In [5]:
# Select all features except 'Price' as X
X = df.drop(columns=['Price'])

# The target variable y remains 'Price'
y = df['Price']


# For simple linear regression model we need MedInc as feature and Price as Target 

# Training the Model on Simple Linear regression without using KFold cv with GridSearch Cv

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.linear_model import Ridge

# Initialize the model
model = Ridge()

# Train the model on the training data
model.fit(X_train, y_train)


In [10]:
# Predict the target values for X_test
y_pred = model.predict(X_test)

# Display the first 5 predicted values
print(y_pred[:5])


[1.14799876 1.52314375 1.86371711 2.89583825 1.97523754]


In [37]:
from sklearn.metrics import r2_score

# Calculate R² score
r2 = r2_score(y_test, y_pred)

# Print the result
print("R² Score:", r2)


# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Print the results
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

R² Score: 0.4728616138868439
Mean Squared Error (MSE): 0.6907668739380978
Mean Absolute Error (MAE): 0.6201296900875063


# Training the Model on Simple Linear regression by using KFold cv with GridSearch Cv

In [13]:
from sklearn.model_selection import KFold

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [14]:
param_grid = {"alpha": [0.1, 1, 10, 100, 1000]}

In [15]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {"alpha": [0.1, 1, 10, 100, 1000]}

# Initialize Ridge Regression model
ridge = Ridge()

# Use GridSearchCV
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring="r2")

# Train model on dataset (X_train, y_train)
grid_search.fit(X_train, y_train)

# Get best alpha
print("Best alpha:", grid_search.best_params_["alpha"])


Best alpha: 1000


In [16]:
# Now train the Ridge model with the best alpha value (alpha = 10 for this example)
ridge_best = Ridge(alpha=10)

In [17]:
# Fit the model on the training data
ridge_best.fit(X_train, y_train)

# Predict the new values (y_new_pred) using the trained model
y_new_pred = ridge_best.predict(X_test)

# Print or use the predictions as needed
print("Predictions (y_new_pred):", y_new_pred)

Predictions (y_new_pred): [1.14813333 1.52323725 1.86370612 ... 4.23116835 1.56345176 2.0347519 ]


In [41]:
from sklearn.metrics import r2_score

# Calculate the R² score for the predictions
r2_new = r2_score(y_test, y_new_pred)

# Print the R² score
print("R² score for the predictions:", r2_new)


R² score for the predictions: 0.4728659369462851


In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate Mean Squared Error (MSE)
mse_new = mean_squared_error(y_test, y_new_pred)

# Calculate Mean Absolute Error (MAE)
mae_new = mean_absolute_error(y_test, y_new_pred)

# Print the results
print("Mean Squared Error (MSE):", mse_new)
print("Mean Absolute Error (MAE):", mse_new)


Mean Squared Error (MSE): 0.6907612089621926
Mean Absolute Error (MAE): 0.6907612089621926


In [43]:
print("Evaluation Metrices before cv and hyperparameter tunning:")
print(f"r2_old: {r2} , mse_old : {mse} , mae_old : {mae}")

print("Evaluation Metrices after cv and hyperparameter tunning:")
print(f"r2_new: {r2_new} , mse_new : {mse_new} , mae_new : {mae_new}")


Evaluation Metrices before cv and hyperparameter tunning:
r2_old: 0.4728616138868439 , mse_old : 0.6907668739380978 , mae_old : 0.6201296900875063
Evaluation Metrices after cv and hyperparameter tunning:
r2_new: 0.4728659369462851 , mse_new : 0.6907612089621926 , mae_new : 0.6201383668852641


### Insights :
- Though the improvement in the performance of the model before and after Hyperameter tuning is not large enough we can still confirm that
  hyperparameter tuning with cross-validation enhances the performance of the model.
- Maybe after feature engineering and training the model on some better algo than Ridge can show us the significant improvement in the
  performance of the model when compared the results before and after hyperparameter tunning.
- #### The main objective of this practice session was to determine how the potentials of cross-validation and GridsearchCv differ and how they compliment to each other at the same time 

# Thankyou