#Try SVR on housing dataset with diff params and kernels

In [7]:
#get the data

import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + '/housing.tgz'


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  if not os.path.isdir(housing_path):
    os.makedirs(housing_path)
  tgz_path = os.path.join(housing_path, 'housing.tgz')
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()

#read to pd dataframe
import pandas as pd

def read_data(path):
  return pd.read_csv(path+'/housing.csv')

fetch_housing_data()
housing = read_data(HOUSING_PATH)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [13]:
#Stratified train test split based on median income
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

housing['income_cat'] = pd.cut(housing['median_income'], bins=[0.0, 1.5, 3, 4.5, 6, np.inf],
                               labels=[1, 2, 3, 4, 5])
housing['income_cat'].value_counts()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_id, test_id in split.split(housing, housing['income_cat']):
  train_stf, test_stf = housing.iloc[train_id], housing.iloc[test_id]

print(train_stf.count())
print(test_stf.count())

train_stf.drop(['income_cat'], axis=1, inplace=True)
test_stf.drop(['income_cat'], axis=1, inplace=True)

longitude             16512
latitude              16512
housing_median_age    16512
total_rooms           16512
total_bedrooms        16354
population            16512
households            16512
median_income         16512
median_house_value    16512
ocean_proximity       16512
income_cat            16512
dtype: int64
longitude             4128
latitude              4128
housing_median_age    4128
total_rooms           4128
total_bedrooms        4079
population            4128
households            4128
median_income         4128
median_house_value    4128
ocean_proximity       4128
income_cat            4128
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [14]:
#Separate attr and labels
housing_x = train_stf.drop(['median_house_value'], axis=1)
housing_y = train_stf['median_house_value'].copy()

In [36]:
#Create new features
from sklearn.base import BaseEstimator, TransformerMixin


class ExtraAttributes(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_room=True, extra_attr_id=[]):
    self.add_bedrooms_per_room = add_bedrooms_per_room
    self.extra_attr_id = extra_attr_id
  def fit(self, X, y=None):
    return self
  def transform(self, X, y=None):

    rooms_per_hholds = X[:, self.extra_attr_id[0]] / X[:, self.extra_attr_id[3]]
    pop_per_hholds = X[:, self.extra_attr_id[2]] / X[:, self.extra_attr_id[3]]
    if self.add_bedrooms_per_room:
      brooms_per_room = X[:, self.extra_attr_id[1]] / X[:, self.extra_attr_id[0]]
      return np.c_[X, rooms_per_hholds, pop_per_hholds, brooms_per_room]
    else:
      return np.c_[X, rooms_per_hholds, pop_per_hholds]


# add_attr = ExtraAttributes(add_bedrooms_per_room=False)
# housing_extra_attr = add_attr.transform(housing_x.values)

# housing_extra_attr = pd.DataFrame(housing_extra_attr,
#                                   columns=list(housing_x.columns)+['rooms_per_hholds',
#                                                              'pop_per_hholds'],
#                                   index=housing_x.index)
# housing_extra_attr.head()




In [37]:

#Categorical and numerical features
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

cat_attr = ['ocean_proximity']
num_attr = housing_x.drop(cat_attr, axis=1).columns

#Create pipeline for numerical features: Impute, Add attributes, scale

#Create new features from these features
extra_attr_from = ['total_rooms', 'total_bedrooms', 'population', 'households']
rooms_id, brooms_id, pop_id, hholds_id = [housing.columns.get_loc(c) 
                                              for c in extra_attr_from]

num_pipeline = Pipeline([
                         ('imputer', SimpleImputer(strategy='median')),
                         ('attr_add', ExtraAttributes(add_bedrooms_per_room=False,
                                                      extra_attr_id=[rooms_id, 
                                                                     brooms_id, 
                                                                     pop_id,
                                                                     hholds_id])),
                          ('std_scaler', StandardScaler()),
                        ])

housing_num_tr = num_pipeline.fit_transform(housing_x[num_attr])

In [40]:
#Full pipeline for numerical and categorical features
#Use ColumnTransformer to apply different transformers to different attributes

from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
                                   ('num', num_pipeline, num_attr),
                                   ('cat', OneHotEncoder(), cat_attr)
                                  ])

housing_prep = full_pipeline.fit_transform(housing_x)

In [49]:
#Grid search the params with SVR

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = {'kernel': ['linear', 'rbf'],
              'C': [0.01, 1, 100],
              'gamma': [0.1, 1, 10]}

sv_reg = SVR()
grid_search = GridSearchCV(sv_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prep, housing_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 1, 100], 'gamma': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

In [50]:
grid_search.best_params_

{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}

In [53]:
grid_search.best_estimator_

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [57]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
  print(np.sqrt(-mean_score), params)

  

118855.96291060038 {'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'}
118920.06571320626 {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
118855.96291060038 {'C': 0.01, 'gamma': 1, 'kernel': 'linear'}
118922.62925681463 {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
118855.96291060038 {'C': 0.01, 'gamma': 10, 'kernel': 'linear'}
118923.15610877576 {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
112978.05831219374 {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
118635.66292102466 {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
112978.05831219374 {'C': 1, 'gamma': 1, 'kernel': 'linear'}
118877.21824455929 {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
112978.05831219374 {'C': 1, 'gamma': 10, 'kernel': 'linear'}
118922.10864227857 {'C': 1, 'gamma': 10, 'kernel': 'rbf'}
72038.42723220418 {'C': 100, 'gamma': 0.1, 'kernel': 'linear'}
98014.54557303373 {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
72038.42723220418 {'C': 100, 'gamma': 1, 'kernel': 'linear'}
115016.341977547 {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
72038.42723220418 {'C': 100

In [58]:
final_model = grid_search.best_estimator_

In [63]:
#Full pipeline

feat_train_predict_pipe = Pipeline([
                                    ('prep', full_pipeline),
                                    ('svm_reg', SVR(**grid_search.best_params_))
])

feat_train_predict_pipe.fit(housing_x, housing_y)

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                                     

In [65]:
from sklearn.metrics import mean_squared_error

pred = feat_train_predict_pipe.predict(housing_x[:5])
svr_mse = mean_squared_error(housing_y[:5], pred)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

58111.47784697697