In [33]:
# Installing catboost
!pip install catboost



In [34]:
# Importing important libraries
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn import metrics
import warnings
# Ignore all warnings from this point on (no category or module filter is given),
# presumably to keep the cell outputs tidy.
# NOTE(review): this also hides any warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [35]:
# Fetch the California housing data and list the fields the returned object exposes
data = fetch_california_housing()
dataset_fields = data.keys()
print(dataset_fields)


dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])


In [36]:
# Show the description text bundled with the dataset (its 'DESCR' entry)
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [37]:
# Separating the predictive features (X) from the target values (y)
X, y = data.data, data.target

In [38]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Empty mapping that the model cells below fill with one score per model name
results = dict()

In [40]:
# Decision Tree: 2-fold grid search over depth and split size, then score on the test split
dt_param_grid = {
    'max_depth': [10, None],
    'min_samples_split': [2, 5],
}
dt_grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
dt_grid.fit(X_train, y_train)

# Keep the estimator selected by the search and predict the held-out rows with it
dt_best = dt_grid.best_estimator_
y_pred_dt = dt_best.predict(X_test)

# Record the test-set MSE under this model's name and show the running tally
results['Decision Tree'] = metrics.mean_squared_error(y_test, y_pred_dt)
print(results)



{'Decision Tree': 0.4108816277037728}


In [41]:
# Random Forest: 2-fold grid search over tree depth with 50 trees, then score on the test split
rf_param_grid = {
    'n_estimators': [50],
    'max_depth': [10, None],
}
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
rf_grid.fit(X_train, y_train)

# Keep the estimator selected by the search and predict the held-out rows with it
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test)

# Record the test-set MSE under this model's name and show the running tally
results['Random Forest'] = metrics.mean_squared_error(y_test, y_pred_rf)
print(results)

{'Decision Tree': 0.4108816277037728, 'Random Forest': 0.2572979293772426}


In [42]:
# AdaBoost: single-candidate grid (50 estimators, learning rate 0.1) evaluated with 2-fold CV
ab_param_grid = {
    'n_estimators': [50],
    'learning_rate': [0.1],
}
ab_grid = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=42),
    param_grid=ab_param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
ab_grid.fit(X_train, y_train)

# Keep the estimator selected by the search and predict the held-out rows with it
ab_best = ab_grid.best_estimator_
y_pred_ab = ab_best.predict(X_test)

# Record the test-set MSE under this model's name and show the running tally
results['AdaBoost'] = metrics.mean_squared_error(y_test, y_pred_ab)
print(results)

{'Decision Tree': 0.4108816277037728, 'Random Forest': 0.2572979293772426, 'AdaBoost': 0.56632113891916}


In [43]:
# XGBoost: 2-fold grid search over tree depth (50 estimators, learning rate 0.1)
xgb_param_grid = {
    'n_estimators': [50],
    'learning_rate': [0.1],
    'max_depth': [3, 5],
}
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
xgb_grid.fit(X_train, y_train)

# Keep the estimator selected by the search and predict the held-out rows with it
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

# Record the test-set MSE under this model's name and show the running tally
results['XGBoost'] = metrics.mean_squared_error(y_test, y_pred_xgb)
print(results)

{'Decision Tree': 0.4108816277037728, 'Random Forest': 0.2572979293772426, 'AdaBoost': 0.56632113891916, 'XGBoost': 0.27461538581034906}


In [44]:
# CatBoost: single-candidate grid (100 estimators, learning rate 0.1, depth 6) with 2-fold CV
cat_param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [6],
}
cat_grid = GridSearchCV(
    estimator=CatBoostRegressor(verbose=False, random_state=42),
    param_grid=cat_param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
cat_grid.fit(X_train, y_train)

# Keep the estimator selected by the search and predict the held-out rows with it
cat_best = cat_grid.best_estimator_
y_pred_cat = cat_best.predict(X_test)

# Record the test-set MSE under this model's name and show the running tally
results['catBoost'] = metrics.mean_squared_error(y_test, y_pred_cat)
print(results)

{'Decision Tree': 0.4108816277037728, 'Random Forest': 0.2572979293772426, 'AdaBoost': 0.56632113891916, 'XGBoost': 0.27461538581034906, 'catBoost': 0.266126308152799}


In [45]:
# Compare the recorded errors and pick the name of the model with the lowest one.
# NOTE(review): the values in `results` are test-set MSEs, so this selection is made on the test split.
best_model_name = min(results, key=lambda model_name: results[model_name])
print(f"Best Model: {best_model_name}")

Best Model: Random Forest


In [46]:
# Selecting the fitted estimator that corresponds to the best model name.
# A lookup table replaces the if/elif chain so that an unrecognised name
# fails immediately with a KeyError instead of leaving `best_model`
# undefined (or silently pointing at a stale object from an earlier run).
fitted_models = {
    'Decision Tree': dt_best,
    'Random Forest': rf_best,
    'AdaBoost': ab_best,
    'XGBoost': xgb_best,
    'catBoost': cat_best,
}
best_model = fitted_models[best_model_name]

print(best_model)

RandomForestRegressor(n_estimators=50, random_state=42)


In [47]:
# Persist the selected estimator to 'best_model.pkl' in pickle format
with open('best_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)