In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# Load the California housing dataset as a feature DataFrame and a target array
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Standalone interaction-feature expansion of X.
# Kept so that `poly_transformer` / `X_poly` remain available for inspection,
# but it is deliberately NOT fed to the model: the pipeline below has its own
# PolynomialFeatures step.
poly_transformer = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly_transformer.fit_transform(X)

# Split the RAW features into training and testing sets.
# Splitting X_poly here would make the pipeline expand already-expanded
# columns (products of products), which multiplies the column count and the
# cost of every cross-validation fit without being intended by the pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline with feature scaling, polynomial feature generation, and a gradient boosting regressor.
# Keeping all preprocessing inside the pipeline means it is re-fit on each
# cross-validation training fold and re-applied automatically at predict time.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('gbr', GradientBoostingRegressor(random_state=42))
])

# Hyperparameter tuning using GridSearchCV.
# Keys use the '<step name>__<parameter>' convention to reach into the pipeline.
param_grid = {
    'gbr__n_estimators': [100, 125, 150, 200],
    'gbr__max_depth': [3, 4],
    'gbr__learning_rate': [0.1, 0.05]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)

# Use the best model (the pipeline refit with the winning hyperparameters)
best_model = grid_search.best_estimator_

# Make predictions on the testing set; the pipeline applies scaling and
# interaction expansion to X_test before the regressor sees it
y_pred = best_model.predict(X_test)

# Evaluate the model on the held-out data
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

# Print the metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r_squared)


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Fetch the California housing data, then expose the target vector and a
# named-column feature frame
data = fetch_california_housing()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [None]:
# Feature engineering: original columns plus their pairwise interaction
# products (degree 2, no bias column).
# NOTE(review): the modelling pipeline has its own PolynomialFeatures step, so
# X_poly should not also be passed through that pipeline.
poly_transformer = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_poly = poly_transformer.fit(X).transform(X)

In [None]:
# Split the RAW features into training and testing sets.
# The pipeline defined below already contains a PolynomialFeatures step, so the
# split uses X rather than X_poly; feeding X_poly in would expand the
# interaction terms a second time (products of products) and inflate the cost
# of every cross-validation fit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Modelling pipeline: standardise -> pairwise interaction terms -> gradient boosting.
# The step names ('scaler', 'poly', 'gbr') are the prefixes used when
# addressing hyperparameters of individual steps.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ('gbr', GradientBoostingRegressor(random_state=42)),
])

In [None]:
# Exhaustive 5-fold search over the gradient-boosting hyperparameters,
# ranked by negative mean squared error and run on all available cores
param_grid = {
    'gbr__n_estimators': [100, 120],
    'gbr__max_depth': [3, 4],
    'gbr__learning_rate': [0.1, 0.05],
}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
)
grid_search.fit(X_train, y_train)

`GridSearchCV` runs a grid search that evaluates every combination of the parameters specified in `param_grid`. The total number of model fits performed during the search is the number of parameter combinations multiplied by the number of cross-validation folds (`cv`).

In the `param_grid`, there are:

- 2 options for `gbr__n_estimators`
- 2 options for `gbr__max_depth`
- 2 options for `gbr__learning_rate`

This results in a total of 2 × 2 × 2 = 8 combinations of parameters.

Since the cross-validation (cv) is set to 5, each combination of parameters will be evaluated 5 times (once for each fold).

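Therefore, the total number of model fits run during cross-validation is 8 combinations × 5 folds = 40 fits. Because `refit=True` by default, `GridSearchCV` then fits the best combination once more on the full training set, for 41 fits overall.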

In [None]:
# Report the hyperparameter combination with the best cross-validated score
best_params = grid_search.best_params_
print("Best parameters:", best_params)

In [None]:
# Use the best model: the pipeline that GridSearchCV refit with the winning
# hyperparameters (the search leaves `refit` at its default of True).
best_model = grid_search.best_estimator_


In [None]:
# Make predictions on the testing set.
# best_model is the whole fitted pipeline, so predict() runs its preprocessing
# steps on X_test before the regressor.
y_pred = best_model.predict(X_test)

In [None]:
# Score the held-out predictions: mean squared error and coefficient of determination
mse, r_squared = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [None]:
# Report both test-set metrics, one per line
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("R-squared:", r_squared),
):
    print(label, value)