## Exercício

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import expon, reciprocal, uniform
from sklearn. svm import LinearSVR, SVR, NuSVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
housing = pd.read_csv('dataset/housing/housing.csv')

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing['income_cat'] = pd.cut(housing['median_income'],
                          bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                          labels=[1, 2, 3, 4, 5])

In [5]:
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42, shuffle=True, stratify=housing['income_cat'])

In [6]:
train_set = train_set.drop('income_cat', axis=1)
test_set = test_set.drop('income_cat', axis=1)

In [7]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

In [8]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, rooms_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [9]:
x_train = train_set.drop('median_house_value', axis=1)
y_train = train_set['median_house_value'].copy()

In [10]:
x_test = test_set.drop('median_house_value', axis=1)
y_test = test_set['median_house_value'].copy()

In [11]:
x_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN


In [12]:
numeric_attributes = list(x_train.drop('ocean_proximity', axis=1))
categorical_attributes = ['ocean_proximity']

In [13]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('combined_attributes', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [14]:
prepare_columns_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_attributes),
    ('categoric', OneHotEncoder(), categorical_attributes),
])

### 1.

In [15]:
param_grid = [
    {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000.], 'epsilon': [0.1, 0.01]},
    {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300.], 'epsilon': [0.1, 0.01],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

In [16]:
svr = SVR()
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, verbose=2)

In [17]:
x = prepare_columns_pipeline.fit_transform(x_train)

In [None]:
grid_search.fit(x, y_train)

In [None]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

np.float64(70671.40921977101)

In [None]:
grid_search.best_params_

{'C': 300.0, 'kernel': 'linear'}

### 2.

In [None]:
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'epsilon': uniform(0.01, 1),
        'gamma': expon(scale=1.0),
    }

In [None]:
svr = SVR()
random_search = RandomizedSearchCV(svr, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42)

In [None]:
random_search.fit(x, y_train)

KeyboardInterrupt: 

In [None]:
negative_mse = random_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

In [None]:
random_search.best_params_

### 3.

### 4.

### 5.