In [1]:
from pathlib import Path
import tarfile
import urllib.request
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# loading the data

def load_housing_data():
    """Load the housing dataset into a DataFrame, fetching it when needed.

    The CSV at ``datasets/housing/housing.csv`` acts as the cache:

    * if it already exists it is read directly (no network access);
    * otherwise ``datasets/housing.tgz`` is downloaded if it is missing and
      then extracted into ``datasets``.

    Returns:
        pandas.DataFrame: the contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the archive has to be downloaded and the
            download fails.
        tarfile.TarError: if the archive cannot be read or extracted.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    # Key the cache on the CSV rather than on the tarball: an archive that was
    # downloaded but never (or only partially) extracted must still be unpacked.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network, so refuse path traversal,
            # absolute paths and special files when the runtime supports
            # extraction filters (older Pythons lack the `filter` argument).
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# read the dataset into a DataFrame (downloads the archive only if it is not already on disk)
housing = load_housing_data()

In [3]:
# display the raw table; the notebook renders a truncated view (first and last rows)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
# bucket median_income into five ordinal income categories (1 = lowest, 5 = highest);
# intervals are right-inclusive: (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf)
housing["income_cat"] = pd.cut(
    housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
)

In [5]:
# using stratified split (on income_cat) to create the training and test sets
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() returns *positional* row indices, so rows must be selected with .iloc;
# .loc is label-based and only gives the same rows while the frame happens to
# carry a default 0..n-1 RangeIndex.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [6]:
# split the training set into predictors and target;
# income_cat is dropped together with the target so it is not used as a predictor
housing = strat_train_set.drop(columns=['median_house_value', 'income_cat'])
housing_labels = strat_train_set['median_house_value'].copy()

In [7]:
# separating the numeric columns from the categorical one
# (nothing is encoded here; this only partitions the predictor columns)
housing_num = housing.drop(columns="ocean_proximity")
housing_cat = housing[["ocean_proximity"]].copy()

In [8]:
# preparing the numeric and the categorical features
num_features = list(housing_num)
cat_features = ["ocean_proximity"]

# numeric columns: fill missing values with the column median, then standardize
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")), # or "most_frequent"
        ('std_scaler', StandardScaler())
    ])

# categorical column: one-hot encode.
# handle_unknown="ignore" makes transform() emit an all-zero row for a category
# that was absent when the encoder was fitted, instead of raising. This matters
# under cross-validation: a very rare category can be missing from the training
# part of a fold and still show up in its validation part.
preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ])

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# chain the preprocessing and a Random Forest Regressor into a single estimator,
# so the grid search tunes and evaluates the whole pipeline at once
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

# hyperparameter grid: 3 x 3 = 9 candidate combinations;
# the "random_forest__" prefix routes each parameter to that pipeline step
param_grid = [
    {
        'random_forest__n_estimators': [5, 15, 25],
        'random_forest__max_features': [2, 4, 8],
    },
]

# 3-fold cross-validated search scored by (negated) RMSE
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)

# run the search on the training predictors and labels
grid_search.fit(housing, housing_labels)

In [10]:
# keep the winning pipeline and the hyperparameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [11]:
# show the selected hyperparameter combination
print(best_params)

{'random_forest__max_features': 8, 'random_forest__n_estimators': 25}


We observe that, for the best model, both the maximum number of features and the number of estimators are equal to the largest values considered in the grid search (`max_features=8`, `n_estimators=25`). When the best combination lies on the boundary of the grid, the true optimum may lie beyond it, so it may be beneficial to raise the upper limits of both hyperparameters and rerun the search in order to potentially achieve better results. Therefore, we conclude that the current results are not yet satisfactory: the search should be extended before the model is treated as final.

In [12]:
# importances come from the fitted forest, names from the fitted preprocessing step;
# both follow the column order of the transformed feature matrix
feature_importances = best_model.named_steps['random_forest'].feature_importances_

# (importance, feature name) pairs, most important first
sorted(
    zip(
        feature_importances,
        best_model.named_steps['preprocessing'].get_feature_names_out(),
    ),
    reverse=True,
)

[(0.4489278765772465, 'num__median_income'),
 (0.15151036649347321, 'cat__ocean_proximity_INLAND'),
 (0.11154161058887384, 'num__longitude'),
 (0.10426574632208004, 'num__latitude'),
 (0.04898906291051706, 'num__housing_median_age'),
 (0.03704180760688559, 'num__population'),
 (0.029489682794407848, 'num__total_rooms'),
 (0.026596324288719265, 'num__total_bedrooms'),
 (0.023997826946791284, 'num__households'),
 (0.00947643488128343, 'cat__ocean_proximity_<1H OCEAN'),
 (0.006048080507893172, 'cat__ocean_proximity_NEAR OCEAN'),
 (0.001758384616827353, 'cat__ocean_proximity_NEAR BAY'),
 (0.00035679546500151855, 'cat__ocean_proximity_ISLAND')]

Based on the feature importances of the best model found by the grid search, we conclude that the three most important features are median income (≈0.45), the `INLAND` category of `ocean_proximity` (≈0.15), and longitude (≈0.11).