In [1]:
from utils import load_housing_data
import numpy as np
import pandas as pd
import sklearn

KeyboardInterrupt: 

In [None]:
# Load the housing dataset (noted as coming from the ageron GitHub repository)
# through the project's load_housing_data helper, then preview the first rows.
df_housing = load_housing_data()
df_housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Stratified train/test split, step 1: bucket the median income into five
# categories (labelled 1-5). This helper column is what we stratify on.
df_housing["income_category"] = pd.cut(
    x=df_housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0, 1.5, 3.0, 4.5, 6, np.inf],
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratifying on the income category so
# both splits keep an income-category distribution similar to the full dataset.
strat_train_set, strat_test_set = train_test_split(
    df_housing,
    stratify=df_housing["income_category"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# The income category column only existed to drive the stratified split, so
# remove it from both splits.
# Rebinding to new frames (rather than dropping in place) avoids mutating the
# objects returned by train_test_split, and errors="ignore" keeps this cell
# safe to re-run once the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_category", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_category", errors="ignore")

We will work using `strat_train_set` from this point onwards.

In [None]:
# Training features: every column of the training split except the target.
df_housing_strat_train = strat_train_set.drop(columns="median_house_value")

# Training labels: the target column, copied so it is independent of the split.
df_housing_strat_labels = strat_train_set.loc[:, "median_house_value"].copy()

# Pipeline
The pipeline to transform our data will do the following:
1. Missing values in numerical features will be imputed by replacing them with the median for that feature. 
2. Missing values in categorical features will be imputed by replacing them with the most frequent category for that feature.
3. A few ratio features will be computed and added to the training set: `bedrooms_ratio`, `rooms_per_house`, `people_per_house`. These should be better correlated with the `median_house_value` for each district.
4. A few cluster similarity features will also be added. These will likely be more useful to the model than latitude and longitude.
5. Features with a long tail will be replaced by their logarithm, as most ML models prefer features with roughly uniform or Gaussian distributions.
6. All numerical features will be standardized, as most ML algorithms prefer when all features have roughly the same scale.


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.compose import ColumnTransformer, make_column_selector

#### Step 1 Above ####
def default_numeric_pipeline():
    """Build the default pipeline for numeric columns.

    Returns a new scikit-learn pipeline that fills missing values with the
    column median and then standardizes the result.
    """
    median_imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    return make_pipeline(median_imputer, scaler)

#### Step 2 Above ####
def default_categorical_pipeline():
    """Build the default pipeline for categorical columns.

    Returns a new scikit-learn pipeline that fills missing values with the
    most frequent category and then one-hot encodes the column. The encoder is
    configured with handle_unknown="ignore" so that categories not seen during
    fitting do not raise at transform time.
    """
    mode_imputer = SimpleImputer(strategy="most_frequent")
    encoder = OneHotEncoder(handle_unknown="ignore")
    return make_pipeline(mode_imputer, encoder)

#### Step 3 Above ####
def column_ratio(df):
    """Divide the first column of a 2-D array by its second column.

    Parameters: df -> 2-D array supporting ``df[:, [i]]`` indexing (e.g. a
        numpy.ndarray, which is what the preceding imputer step produces)
        with at least 2 columns.
    Returns: array of shape (n_rows, 1) holding column 0 divided element-wise
        by column 1.
    """
    numerator = df[:, [0]]
    denominator = df[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Name the single output feature of the ratio transformer.

    Both arguments are accepted only to match the callback signature used for
    ``feature_names_out``; they are ignored and the name is always "ratio".
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a pipeline that turns two columns into one standardized ratio.

    Steps, in order: median imputation, division of column 0 by column 1 via
    ``column_ratio`` (output feature named by ``ratio_name``), standardization.
    """
    stages = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*stages)

#### Step 4 Above ####
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF similarities to k-means cluster centers.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters requested from ``KMeans``; also the number of
        feature names reported by ``get_feature_names_out``.
    gamma : float, default=1.0
        ``gamma`` value forwarded to ``rbf_kernel``.
    random_state : int or None, default=None
        ``random_state`` forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Hyperparameters are stored unmodified under their own names.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit a KMeans model on X (optionally weighted) and return self.

        ``y`` is accepted but not used.
        """
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between X and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one "cluster_<i>_similarity" name per configured cluster."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"cluster_{i}_similarity")
        return feature_names

def cluster_similarity():
    """Create the ClusterSimilarity transformer used for the geographic columns.

    Uses 10 clusters, gamma of 1.0 and a fixed random_state of 42.
    """
    settings = {"n_clusters": 10, "gamma": 1.0, "random_state": 42}
    return ClusterSimilarity(**settings)

#### Step 5 Above ####
def log_pipeline():
    """Build a pipeline that log-transforms and standardizes its columns.

    Steps, in order: median imputation, natural logarithm (``np.log``) with
    one-to-one output feature names, standardization.
    """
    median_imputer = SimpleImputer(strategy="median")
    log_transform = FunctionTransformer(np.log, feature_names_out="one-to-one")
    scaler = StandardScaler()
    return make_pipeline(median_imputer, log_transform, scaler)

#### Putting it all together: apply each of the pipelines above to specific columns in the dataset ####
# Each entry is (name, transformer, columns). The name becomes the prefix of the
# generated feature names, e.g. "bedrooms__ratio".
# NOTE(review): "cateogorical" is misspelled, but the name is part of the
# generated feature names, so it is deliberately left unchanged here.
preprocessing = ColumnTransformer(
    transformers=[
        # total_bedrooms / total_rooms
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        # total_rooms / households
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        # population / households
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        # logarithm of the listed columns
        (
            "log",
            log_pipeline(),
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        # similarity of each district's location to the k-means cluster centers
        ("geographic", cluster_similarity(), ["latitude", "longitude"]),
        # categorical pipeline on every object-dtype column
        (
            "cateogorical",
            default_categorical_pipeline(),
            make_column_selector(dtype_include=object),
        ),
    ],
    # any column not named above goes through the default numeric pipeline
    remainder=default_numeric_pipeline(),
)



In [None]:
# Fit the preprocessing ColumnTransformer on the training features and
# transform them in a single step.
df_housing_strat_train_processed = preprocessing.fit_transform(
    df_housing_strat_train)
# Sanity-check the result: container type, shape, and generated feature names.
print(f"Housing data processed and returned as type: {type(df_housing_strat_train_processed)}")
print(f"It has the shape: {df_housing_strat_train_processed.shape}")
print(f"The feature names are: {preprocessing.get_feature_names_out()}")


Housing data processed and returned as type: <class 'numpy.ndarray'>
It has the shape: (16512, 24)
The feature names are: ['bedrooms__ratio' 'rooms_per_house__ratio' 'people_per_house__ratio'
 'log__total_bedrooms' 'log__total_rooms' 'log__population'
 'log__households' 'log__median_income' 'geographic__cluster_0_similarity'
 'geographic__cluster_1_similarity' 'geographic__cluster_2_similarity'
 'geographic__cluster_3_similarity' 'geographic__cluster_4_similarity'
 'geographic__cluster_5_similarity' 'geographic__cluster_6_similarity'
 'geographic__cluster_7_similarity' 'geographic__cluster_8_similarity'
 'geographic__cluster_9_similarity'
 'cateogorical__ocean_proximity_<1H OCEAN'
 'cateogorical__ocean_proximity_INLAND'
 'cateogorical__ocean_proximity_ISLAND'
 'cateogorical__ocean_proximity_NEAR BAY'
 'cateogorical__ocean_proximity_NEAR OCEAN'
 'remainder__housing_median_age']


In [None]:
# We decided on using RandomForestRegressor
# Now we want to use RandomizedSearchCV to find the best hyperparameters

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# End-to-end estimator: preprocessing followed by a random forest regressor.
# The step names ("preprocessing", "random_forest") are the prefixes to use when
# addressing hyperparameters of each step.
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("random_forest", RandomForestRegressor(random_state=42)),
    ]
)