## Chapter 2 Problems
This notebook attempts to solve the problems in Chapter 2 of "Aurélien Géron - Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_ Concepts, Tools, and Techniques to Build Intelligent Systems-O'Reilly Media (2022)".

### Question 1

Try a support vector machine regressor(sklearn.svm.SVR) with various hyperparameters, such as kernel = "linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross- validation, or else it will take hours. Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [2]:
#Import Chapter 2 dependencies

from pathlib import Path
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import urllib.request
import numpy as np
from zlib import crc32
import sklearn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from scipy import stats
from scipy.stats import randint, loguniform
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsRegressor

In [3]:
#load the data
def loadHousingData():
    #a tarball is an archive file (.tgz extension)
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok= True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = loadHousingData()

Data Exploration was performed in Chapter 2.

## Create a Good Test Set

In [4]:
#create categories for the target value so that we can create a stratified test sample
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = [0,1.5,3,4.5,6,np.inf],
                               labels=[1,2,3,4,5])

#create stratified test sample
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2,
                                               stratify=housing["income_cat"], random_state=42)

#now get rid of the 'income cat' category
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)


### Copy the training set, separate the labels

In [5]:
housing = strat_train_set.drop("median_house_value", axis= 1)
housing_labels = strat_train_set["median_house_value"].copy() #need to copy since otherwise it is just a reference to the original!

## Preprocessing

### Creating the Column Transformer

In [6]:
"""
Store the numerical and categorical column names in an array.

For numerical data, impute missing values with the median and then standardize values.
For categorical data, impute missing values with the mode and then convert into one-hot representation.""" 

num_columns = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]
cat_columns = ["ocean_proximity"]

default_num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler())
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("binary_rep", OneHotEncoder(handle_unknown="ignore"))
])

#create a pipeline for log transforms
log_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='median')),
    ("log_transform", FunctionTransformer(np.log, feature_names_out='one-to-one')),
    ("scaling", StandardScaler())
])

#create a pipeline for ratio data

#first function takes in an n x 2 pandas dataframe and computes a column of ratios
def column_ratio(X):
    return X[:,[0]]/X[:,[1]]

#function should add ratio to the name
def ratio_name(functionTransformer, featute_names_in):
    #will append __ratio after the name in the triplet
    return ["ratio"]

#include use of function transformner
ratio_pipeline = Pipeline([
    ("impute", SimpleImputer()),
    ("transform", FunctionTransformer(column_ratio, feature_names_out=ratio_name)),
    ("scale", StandardScaler())
])

#create clustering algorithm

#to do this, create custom clustering class that inherits from BaseEstimator and TransformerMixin, and includes fit and transform methods.
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters 
        self.gamma = gamma 
        self.random_state = random_state
        
    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state) 
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

#instantiate
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)

#create the preprocessing column transformer
#remember that transformation pipelines fit transform every transformer until the final step where it performs the method called
preprocessing = ColumnTransformer([
    ('bedrooms', ratio_pipeline, ["total_bedrooms", "total_rooms"]),
    ('rooms_per_house', ratio_pipeline, ["total_rooms", "households"]),
    ('people_per_house', ratio_pipeline, ["population", "households"]),
    ('log', log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ('geo', cluster_simil, ['longitude', 'latitude']),
    ('cat', cat_pipeline, cat_columns),
], remainder= default_num_pipeline)

## Training the model

In [6]:
# Create the model transformer pipeline by combining the preprocessing pipeline with the model

svm_reg = Pipeline([
    ("preprocessing",preprocessing),
    ("svr", SVR())
    ])

In [7]:
#create hyperparameter grid

param_grid = [{"preprocessing__geo__n_clusters":[5,10],
               "svr__kernel": ["linear", "rbf"],
               "svr__C": [1,10,100,1000,10000]}]

In [8]:
#now train on first 5k instances and use 3-fold cv. this requires GridSearchCV

grid_search = GridSearchCV(svm_reg,
                           param_grid=param_grid,
                           cv=3,
                           scoring='neg_root_mean_squared_error')

#iloc is optimized
grid_search.fit(housing.iloc[:5000],housing_labels.iloc[:5000])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

In [9]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_svr__C,param_svr__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.847599,0.086908,0.21963,0.013494,5,1,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-118546.506102,-120612.100558,-114547.0792,-117901.895287,2517.639549,16
1,0.962206,0.033072,0.915518,0.028414,5,1,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-120339.81586,-122569.575159,-116196.997506,-119702.129508,2640.38121,19
2,0.963136,0.145002,0.285903,0.04786,5,10,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-106487.318722,-107283.172653,-102174.870629,-105315.120668,2244.136597,12
3,0.955092,0.054588,0.874542,0.030368,5,10,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-119494.96809,-121700.848319,-115398.29459,-118864.703666,2611.317779,17
4,0.669746,0.023526,0.197281,0.005325,5,100,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-81529.077923,-81496.842335,-77581.59176,-80202.504006,1853.311547,8
5,0.875185,0.013786,0.83446,0.005037,5,100,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-112336.386939,-114397.921108,-107981.616627,-111571.974892,2674.632078,13
6,0.68394,0.024404,0.197872,0.000416,5,1000,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-75467.881715,-74514.852363,-70737.884562,-73573.539547,2042.509997,6
7,0.887338,0.003966,0.877463,0.034791,5,1000,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-84213.898929,-84557.9401,-79311.851508,-82694.563512,2396.058772,10
8,0.877776,0.108995,0.202789,0.001075,5,10000,linear,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-76155.133303,-72101.723724,-68144.923634,-72133.926887,3270.233684,4
9,0.884113,0.019446,0.839139,0.013645,5,10000,rbf,"{'preprocessing__geo__n_clusters': 5, 'svr__C'...",-63494.134591,-61471.322153,-59843.026014,-61602.827586,1493.456555,2


In [10]:
print(grid_search.best_params_) #best hyperparameter of c is the highest value so you want to try even higher values
print(-grid_search.best_score_) #worse than random forest


{'preprocessing__geo__n_clusters': 10, 'svr__C': 10000, 'svr__kernel': 'rbf'}
60967.19463699844


### Question 2. Change the Grid Search to a Randomised Search

In [11]:
#first, create the parameter distributions and then RandomizedSearchCV class
param_distribs = [{"preprocessing__geo__n_clusters":randint(low=5,high=10),
               "svr__kernel": ["linear", "rbf"],
               "svr__C": randint(low=1, high=100000)}]

random_search = RandomizedSearchCV(svm_reg,
                                   param_distributions=param_distribs,
                                   cv=3,
                                   n_iter=10,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42)

In [12]:
#fit/train
random_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

In [13]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_svr__C,param_svr__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.852358,0.137161,0.235708,0.038654,8,861,linear,"{'preprocessing__geo__n_clusters': 8, 'svr__C'...",-75074.376267,-74420.38107,-70702.74377,-73399.167036,1925.262103,10
1,2.020122,0.217794,0.192976,0.011222,7,76821,linear,"{'preprocessing__geo__n_clusters': 7, 'svr__C'...",-80211.586479,-70942.299835,-66359.09864,-72504.328318,5762.106361,8
2,1.937011,0.192861,0.199507,0.005078,6,82387,linear,"{'preprocessing__geo__n_clusters': 6, 'svr__C'...",-79400.92776,-71399.884631,-66882.266869,-72561.02642,5176.253814,9
3,1.14844,0.152642,0.863562,0.04045,7,87499,rbf,"{'preprocessing__geo__n_clusters': 7, 'svr__C'...",-58016.449007,-55902.631714,-54849.184327,-56256.08835,1316.963701,2
4,0.97018,0.008085,0.863606,0.007759,9,44132,rbf,"{'preprocessing__geo__n_clusters': 9, 'svr__C'...",-58307.228408,-56376.618902,-55346.865317,-56676.904209,1227.073988,3
5,1.810604,0.189035,0.183628,0.000339,7,67222,linear,"{'preprocessing__geo__n_clusters': 7, 'svr__C'...",-79509.787345,-70937.77189,-66388.779821,-72278.779685,5439.910296,7
6,0.960619,0.007305,0.867125,0.031616,6,59736,rbf,"{'preprocessing__geo__n_clusters': 6, 'svr__C'...",-58645.380071,-56730.183996,-55490.132601,-56955.232222,1297.91671,4
7,0.969514,0.061324,0.948053,0.096627,6,5312,rbf,"{'preprocessing__geo__n_clusters': 6, 'svr__C'...",-67335.82148,-65665.772607,-62926.681191,-65309.425093,1817.574792,5
8,1.276362,0.123114,0.969084,0.066249,9,83105,rbf,"{'preprocessing__geo__n_clusters': 9, 'svr__C'...",-57587.820616,-55575.65152,-54345.21317,-55836.228436,1336.550538,1
9,1.190626,0.01323,0.201494,0.002904,6,28694,linear,"{'preprocessing__geo__n_clusters': 6, 'svr__C'...",-78140.743461,-71432.247547,-67204.103837,-72259.031615,4502.976829,6


### Question 3: SelectFromModel transformer
Add a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [14]:
svm_reg_selected = Pipeline([
    ("preprocessing", preprocessing),
    ("selector", SelectFromModel(RandomForestRegressor(random_state=42,), threshold=0.005)), #base the feature importance off a random forest regressor
    ("svr", SVR(C=random_search.best_params_["svr__C"],
                kernel=random_search.best_params_["svr__kernel"]))
    ])

In [15]:
#now find the cv score using this pipeline
selector_rmses = -cross_val_score(svm_reg_selected,
                                  housing.iloc[:5000],
                                  housing_labels.iloc[:5000],
                                  cv=3,
                                  scoring='neg_root_mean_squared_error')
pd.Series(selector_rmses).describe()
#seems like the selector did not significantly help

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


count        3.000000
mean     56047.769764
std       1448.908317
min      54667.291034
25%      55293.368954
50%      55919.446875
75%      56738.009130
max      57556.571385
dtype: float64

### Question 4: Custom Transformer with k-Nearest Neighbours
- Create a custom transformer that trains a k-nearest neighbors regressor (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs the model’s predictions in its transform() method. 
- Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.

In [16]:
class KNNSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbours=5, weights = "distance"):
        self.n_neighbours = n_neighbours
        self.weights = weights

    # create a knn property that fits to the data
    def fit(self, X, y=None):
        self.knn = KNeighborsRegressor(n_neighbors=self.n_neighbours, weights=self.weights)
        self.knn.fit(X, y)
        return self

    # output the model's predictions
    def transform(self, X):
        return pd.DataFrame(self.knn.predict(X))

knn_simil = KNNSimilarity()

In [17]:
#by adding the knn_simil class to the transformer and making it act on longitude and latitude, the model will account for the median housing price of the nearest districts
preprocessing_q4 = ColumnTransformer([
    ('bedrooms', ratio_pipeline, ["total_bedrooms", "total_rooms"]),
    ('rooms_per_house', ratio_pipeline, ["total_rooms", "households"]),
    ('people_per_house', ratio_pipeline, ["population", "households"]),
    ('log', log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ('geo', cluster_simil, ['longitude', 'latitude']),
    ('knn', knn_simil, ['longitude', 'latitude']),
    ('cat', cat_pipeline, cat_columns),
], remainder= default_num_pipeline)

In [18]:
#new model pipeline
#potentially use sklearn clone to recreate processing pipeline with unfitted estimators

svr_knn = Pipeline([
    ('preprocessing', preprocessing_q4),
    ('svr', SVR(C=random_search.best_params_["svr__C"],
                kernel=random_search.best_params_["svr__kernel"]))
])

In [20]:
#check error with this new feature
svr_knn_rmses = -cross_val_score(svr_knn,
                                 X=housing.iloc[:5000],
                                 y=housing_labels.iloc[:5000],
                                 cv=3,
                                 scoring='neg_root_mean_squared_error',
                                 )
pd.DataFrame(svr_knn_rmses).describe()
#scores appear to be notably worse when adding in knn regression

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,0
count,3.0
mean,59758.355925
std,1204.688343
min,58849.225002
25%,59075.17169
50%,59301.118378
75%,60212.921387
max,61124.724395


### Question 5: Automatically explore preparation options using RandomizedSearchCV

In [23]:
# check different combinations of hyperparameters to see if the knn performance improves

knn_param_distribs = [{"preprocessing__knn__n_neighbours":randint(low=1, high=30),
                    "svr__C": loguniform(20, 200000)}]

knn_ran_search = RandomizedSearchCV(svr_knn,
                               knn_param_distribs,
                               scoring='neg_root_mean_squared_error',
                               n_iter=5,
                               cv=3)

In [24]:
knn_ran_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
knn_ran_search.best_params_

{'preprocessing__knn__n_neighbours': 8, 'svr__C': 14176.39288400731}

### Question 6: Implement the StandardScalerClone class from scratch

- Try to implement the Standard Scaler Clone class again from scratch.
- Then, add support for the inverse_transform() method: executing scaler.inverse_transform(scaler.fit_transform(X)) should return an array very close to X. 
- Then add support for feature names: set feature_names_in_ in the fit() method if the input is a DataFrame. This attribute should be a NumPy array of column names. 
- Lastly, implement the get_feature_names_out() method: it should have one optional input_features=None argument. If passed, the method should check that its length matches n_features_in_, and it should match feature_names_in_ if it is defined; then input_features should be returned. If input_features is None, then the method should either return feature_names_in_ if it is defined or np.array(["x0", "x1", ...]) with length n_features_in_ otherwise.

In [20]:
class StandardScalerClone(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self,X, y=None): #including y=None is a standard for estimator fit() methods, even when not used
        
        X = check_array(X) #check that X is an array with finite float values. check array is automatically numeric

        #pandas mean axis = 0  means over different indices. axis = 1 means over different columns
        self.mean_ = X.mean(axis=0)
        self.sd_ = X.std(axis=0)

        #support for feature_names_in_ if the input is a df. should be an np array of column names
        self.n_features_in_ = X.shape[1] #how many columns in the dataframe
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.array(X.columns)
        return self

    def transform(self, X):
        check_is_fitted(self) #checks if the scaler is fitted by looking for fitted attributes (ending in an underscore)
        X = check_array(X)       
        X = (X - self.mean_)/(self.sd_)
        return X
    
    def inverse_transform(self, X):
        X = check_array(X)
        X = (X * self.sd_) + self.mean_
        return X
    
    def get_feature_names_out(self, input_features=None): 
        #is the input_features parameter passed?
        if input_features == None:
            #if feature_names_in_ is not defined then return the xn array
            return getattr(self, "feature_names_in_",
                np.array([f"x{i}" for i in range(self.n_features_in_)]))

        #check if feature_names_in_ is defined, check if number of col names is equal to no of input features
        if (self.feature_names_in_ != None) and len(input_features) == self.n_features_in: #instead of == None you can use the hasattr() function

            #check if feature_names_in_ is the same as the input features
            if (self.feature_names_in_ == input_features):
                return input_features

        #to prevent unnecessary nesting you can check for the opposite of the required condition and raise an error - "guarding"
        

In [21]:
#now we want to see if the estimator is valid according to the scikit learn API
from sklearn.utils.estimator_checks import check_estimator
check_estimator(StandardScalerClone())

In [26]:
np.random.seed(42)
X = np.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)
X_scaled_unscaled = scaler.inverse_transform(X_scaled)