#Add a Transformer to select most important features

#Build a pipeline to do everything

#Try Random Search

In [1]:
#get the data

import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + '/housing.tgz'


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  if not os.path.isdir(housing_path):
    os.makedirs(housing_path)
  tgz_path = os.path.join(housing_path, 'housing.tgz')
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()

#read to pd dataframe
import pandas as pd

def read_data(path):
  return pd.read_csv(path+'/housing.csv')

fetch_housing_data()
housing = read_data(HOUSING_PATH)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
#Train test split: Stratified

#Create income cat col for stratified split
import numpy as np

housing['income_cat'] = pd.cut(housing['median_income'], 
                               bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_id, test_id in split.split(housing, housing['income_cat']):
  train_stf, test_stf = housing.iloc[train_id], housing.iloc[test_id]

train_stf.info()

train_stf.drop('income_cat', axis=1, inplace=True)
test_stf.drop('income_cat', axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           16512 non-null  float64 
 1   latitude            16512 non-null  float64 
 2   housing_median_age  16512 non-null  float64 
 3   total_rooms         16512 non-null  float64 
 4   total_bedrooms      16354 non-null  float64 
 5   population          16512 non-null  float64 
 6   households          16512 non-null  float64 
 7   median_income       16512 non-null  float64 
 8   median_house_value  16512 non-null  float64 
 9   ocean_proximity     16512 non-null  object  
 10  income_cat          16512 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.4+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [114]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

housing_x = train_stf.drop('median_house_value', axis=1)
housing_y = train_stf['median_house_value'].copy()

#Cat and num attributes
num_attr = list(housing_x.drop('ocean_proximity', axis=1).columns)
cat_attr = ['ocean_proximity']

#Add extra attr

class ExtraAttributes(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_rooms=False, extra_attr_id=[]):
    self.add_bedrooms_per_rooms = add_bedrooms_per_rooms
    self.extra_attr_id = extra_attr_id
  
  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    if len(extra_attr_id)==0:
      return X
    else:
      rooms_per_hholds = X[:, self.extra_attr_id[0]] / X[:, self.extra_attr_id[3]]
      pop_per_hholds = X[:, self.extra_attr_id[2]] / X[:, self.extra_attr_id[3]]
      if self.add_bedrooms_per_rooms:
        brooms_per_room = X[:, self.extra_attr_id[1]] / X[:, self.extra_attr_id[0]]
        return np.c_[X, rooms_per_hholds, pop_per_hholds, brooms_per_room]
      else:
        return np.c_[X, rooms_per_hholds, pop_per_hholds]


#Create new features from these features
extra_attr_from = ['total_rooms', 'total_bedrooms', 'population', 'households']
extra_attr_id = [housing.columns.get_loc(c) for c in extra_attr_from]

#Feature pipeline

num_pipeline = Pipeline([
                         ('impute', SimpleImputer(strategy='median')),
                         ('add_attr', ExtraAttributes(add_bedrooms_per_rooms=False,
                                                      extra_attr_id=extra_attr_id)),
                         ('scale', StandardScaler())
])

feat_pipeline = ColumnTransformer([
                                   ('num_attr', num_pipeline, num_attr),
                                   ('cat_attr', OneHotEncoder(), cat_attr),
])

#Full pipeline

full_pipeline = Pipeline([
                          ('attr', feat_pipeline),
                          ('rf', RandomForestRegressor(random_state=42))
])



In [110]:
#Feature selection with grid search

class TopFeatures(BaseEstimator, TransformerMixin):
  def __init__(self, feature_importances, k):
    self.feature_importances = feature_importances
    self.k = k
  
  def fit(self, X, y=None):
    self.feature_id_ = np.sort(np.argpartition(
                                            np.array(self.feature_importances), 
                                                      -self.k)[-self.k:])
    return self

  def transform(self, X):
    return X[:, self.feature_id_]

In [111]:
#Cal feature imp

#Use Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon

params_dist = {
                'rf__n_estimators': [int(x) for x in range(1, 101)]
}

rnd_search = RandomizedSearchCV(full_pipeline, params_dist, n_iter=10, cv=5,
                                scoring='neg_mean_squared_error', verbose=2,
                                random_state=42)

rnd_search.fit(housing_x, housing_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] rf__n_estimators=84 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .............................. rf__n_estimators=84, total=  10.5s
[CV] rf__n_estimators=84 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.5s remaining:    0.0s


KeyboardInterrupt: ignored

In [79]:
(final_model['rf']).feature_importances_

array([6.06205078e-02, 5.75541743e-02, 4.70011896e-02, 1.43912873e-02,
       1.39528617e-02, 1.31327674e-02, 1.34810780e-02, 4.76372365e-01,
       3.49851220e-02, 1.24112595e-01, 1.32808848e-03, 1.39853233e-01,
       8.87505352e-05, 8.25668855e-04, 2.30031137e-03])

In [100]:
final_model = rnd_search.best_estimator_

feature_importances = final_model['rf'].feature_importances_
feature_importances

#Add names to features

extra_attribs = ["rooms_per_hhold", "pop_per_hhold"]

cat_encoder = final_model['attr'].named_transformers_['cat_attr']
cat_attr = list(cat_encoder.categories_[0])
attributes = num_attr + extra_attribs + cat_attr
sorted(zip(feature_importances, attributes), reverse=True)

[(0.47637236549786954, 'median_income'),
 (0.13985323260136334, 'INLAND'),
 (0.12411259450052231, 'pop_per_hhold'),
 (0.060620507801238156, 'longitude'),
 (0.05755417434153923, 'latitude'),
 (0.04700118959014651, 'housing_median_age'),
 (0.034985121961999044, 'rooms_per_hhold'),
 (0.01439128733172556, 'total_rooms'),
 (0.013952861720317988, 'total_bedrooms'),
 (0.013481078008234882, 'households'),
 (0.013132767401852577, 'population'),
 (0.002300311373304713, 'NEAR OCEAN'),
 (0.0013280884796346422, '<1H OCEAN'),
 (0.0008256688550985803, 'NEAR BAY'),
 (8.87505351528258e-05, 'ISLAND')]

In [121]:
#Full predict pipeline with feature selection
k=5
full_predict_pipeline = Pipeline([
                          ('attr', feat_pipeline),
                          ('feature_selection', TopFeatures(feature_importances, k)),
                          ('rf', RandomForestRegressor())
])

full_predict_pipeline.fit(housing_x, housing_y)

Pipeline(memory=None,
         steps=[('attr',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_attr',
                                                  Pipeline(memory=None,
                                                           steps=[('impute',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                                   

In [130]:
#Grid search
from sklearn.model_selection import GridSearchCV


#Grid params to be named as per estimator names; check with get_params().keys()
grid_params = {
                'attr__num_attr__impute__strategy': ['mean'],
               'feature_selection__k': list(range(5, len(feature_importances)))
}

grid_search = GridSearchCV(full_predict_pipeline, grid_params, cv=2,
                           scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_x, housing_y)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=5 ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=5, total=   3.3s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=5 ...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s


[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=5, total=   3.3s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=6 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=6, total=   3.8s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=6 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=6, total=   3.8s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=7 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=7, total=   4.5s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=7 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=7, total=   4.5s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=8 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=8, total=   5.2s
[CV] attr__num_attr__impute__strategy=mean, feature_selection__k=8 ...
[CV]  attr__num_attr__impute__strategy=mean, feature_selection__k=8, total= 

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.0min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('attr',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_attr',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('impute',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                        

In [129]:
full_predict_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'attr', 'feature_selection', 'rf', 'attr__n_jobs', 'attr__remainder', 'attr__sparse_threshold', 'attr__transformer_weights', 'attr__transformers', 'attr__verbose', 'attr__num_attr', 'attr__cat_attr', 'attr__num_attr__memory', 'attr__num_attr__steps', 'attr__num_attr__verbose', 'attr__num_attr__impute', 'attr__num_attr__add_attr', 'attr__num_attr__scale', 'attr__num_attr__impute__add_indicator', 'attr__num_attr__impute__copy', 'attr__num_attr__impute__fill_value', 'attr__num_attr__impute__missing_values', 'attr__num_attr__impute__strategy', 'attr__num_attr__impute__verbose', 'attr__num_attr__add_attr__add_bedrooms_per_rooms', 'attr__num_attr__add_attr__extra_attr_id', 'attr__num_attr__scale__copy', 'attr__num_attr__scale__with_mean', 'attr__num_attr__scale__with_std', 'attr__cat_attr__categories', 'attr__cat_attr__drop', 'attr__cat_attr__dtype', 'attr__cat_attr__handle_unknown', 'attr__cat_attr__sparse', 'feature_selection__feature_importances', 

In [131]:
grid_search.best_params_

{'attr__num_attr__impute__strategy': 'mean', 'feature_selection__k': 7}

In [135]:
np.array(feature_importances)[-5:]

array([1.32808848e-03, 1.39853233e-01, 8.87505352e-05, 8.25668855e-04,
       2.30031137e-03])