### Chapter 2 Problems
This notebook attempts to solve the problems in Chapter 2 of "Aurélien Géron - Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_ Concepts, Tools, and Techniques to Build Intelligent Systems-O'Reilly Media (2022)".

**Question 1**

Try a support vector machine regressor(sklearn.svm.SVR) with various hyperparameters, such as kernel = "linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross- validation, or else it will take hours. Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [15]:
#Import Chapter 2 dependencies

from pathlib import Path
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import urllib.request
import numpy as np
from zlib import crc32
import sklearn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from scipy import stats
from scipy.stats import randint
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
#load the data
def loadHousingData():
    #a tarball is an archive file (.tgz extension)
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok= True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = loadHousingData()

Data Exploration was performed in Chapter 2.

## Create a Good Test Set

In [3]:
#create categories for the target value so that we can create a stratified test sample
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = [0,1.5,3,4.5,6,np.inf],
                               labels=[1,2,3,4,5])

#create stratified test sample
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2,
                                               stratify=housing["income_cat"], random_state=42)

#now get rid of the 'income cat' category
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)


### Copy the training set, separate the labels

In [12]:
housing = strat_train_set.drop("median_house_value", axis= 1)
housing_labels = strat_train_set["median_house_value"].copy() #need to copy since otherwise it is just a reference to the original!

## Preprocessing

### Creating the Column Transformer

In [13]:
"""
Store the numerical and categorical column names in an array.

For numerical data, impute missing values with the median and then standardize values.
For categorical data, impute missing values with the mode and then convert into one-hot representation.""" 

num_columns = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]
cat_columns = ["ocean_proximity"]

default_num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler())
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("binary_rep", OneHotEncoder(handle_unknown="ignore"))
])

#create a pipeline for log transforms
log_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='median')),
    ("log_transform", FunctionTransformer(np.log, feature_names_out='one-to-one')),
    ("scaling", StandardScaler())
])

#create a pipeline for ratio data

#first function takes in an n x 2 pandas dataframe and computes a column of ratios
def column_ratio(X):
    return X[:,[0]]/X[:,[1]]

#function should add ratio to the name
def ratio_name(functionTransformer, featute_names_in):
    #will append __ratio after the name in the triplet
    return ["ratio"]

#include use of function transformner
ratio_pipeline = Pipeline([
    ("impute", SimpleImputer()),
    ("transform", FunctionTransformer(column_ratio, feature_names_out=ratio_name)),
    ("scale", StandardScaler())
])

#create clustering algorithm

#to do this, create custom clustering class that inherits from BaseEstimator and TransformerMixin, and includes fit and transform methods.
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters 
        self.gamma = gamma 
        self.random_state = random_state
        
    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state) 
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

#instantiate
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)

#create the preprocessing column transformer
#remember that transformation pipelines fit transform every transformer until the final step where it performs the method called
preprocessing = ColumnTransformer([
    ('bedrooms', ratio_pipeline, ["total_bedrooms", "total_rooms"]),
    ('rooms_per_house', ratio_pipeline, ["total_rooms", "households"]),
    ('people_per_house', ratio_pipeline, ["population", "households"]),
    ('log', log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ('geo', cluster_simil, ['longitude', 'latitude']),
    ('cat', cat_pipeline, cat_columns),
], remainder= default_num_pipeline)

## Training the model

In [19]:
# Create the model transformer pipeline by combining the preprocessing pipeline with the model

svm_reg = Pipeline([
    ("preprocessing",preprocessing),
    ("svr", SVR())
    ])

In [33]:
#create hyperparameter grid

param_grid = [{"preprocessing__geo__n_clusters":[5,10],
               "svr__kernel": ["linear", "rbf"],
               "svr__C": [1,10,100,1000,10000]}]

In [34]:
#now train on first 5k instances and use 3-fold cv. this requires GridSearchCV

grid_search = GridSearchCV(svm_reg,
                           param_grid=param_grid,
                           cv=3,
                           scoring='neg_root_mean_squared_error')

#iloc is optimized
grid_search.fit(housing.iloc[:5000],housing_labels.iloc[:5000])

  grid_search.fit(housing[:5000],housing_labels[:5000])
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()

In [35]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_svr__C,param_svr__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.720689,0.043351,0.206734,0.004678,5,1,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-118546.506102,-120612.100558,-114547.0792,-117901.895287,2517.639549,16
1,0.90197,0.02923,0.88833,0.024317,5,1,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-120339.81586,-122569.575159,-116196.997506,-119702.129508,2640.38121,19
2,0.718304,0.032916,0.199409,0.002792,5,10,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-106487.318722,-107283.172653,-102174.870629,-105315.120668,2244.136597,12
3,0.86917,0.004986,0.853899,0.018301,5,10,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-119494.96809,-121700.848319,-115398.29459,-118864.703666,2611.317779,17
4,0.65296,0.008921,0.192631,0.000193,5,100,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-81529.077923,-81496.842335,-77581.59176,-80202.504006,1853.311547,8
5,0.859666,0.005604,0.838094,0.002779,5,100,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-112336.386939,-114397.921108,-107981.616627,-111571.974892,2674.632078,13
6,0.660335,0.005671,0.193501,0.000918,5,1000,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-75467.881715,-74514.852363,-70737.884562,-73573.539547,2042.509997,6
7,0.850783,0.000478,0.831211,0.000495,5,1000,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-84213.898929,-84557.9401,-79311.851508,-82694.563512,2396.058772,10
8,0.852724,0.086674,0.191892,6e-05,5,10000,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-76155.133303,-72101.723724,-68144.923634,-72133.926887,3270.233684,4
9,0.866325,0.00423,0.8347,0.002061,5,10000,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-63494.134591,-61471.322153,-59843.026014,-61602.827586,1493.456555,2


In [36]:
grid_search.best_params_

{'preprocessing__geo__n_clusters': 10, 'svr__C': 10000, 'svr__kernel': 'rbf'}