# Feature engineering
Use our existing features to develop new features, convert categorical values to one-hot vectors, and drop others. We will fit the transformations on the training data and then apply them, unchanged, to the test data.

In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
   
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Read in our test and training data

In [2]:
# Folder containing the train/test CSV files read below (relative path)
DATA_FOLDER = os.path.join('data', 'final')

In [3]:
# Build the file paths first, then load both splits.
# dtype=str: every column is read in as a string at this point.
train_csv_path = os.path.join(DATA_FOLDER, 'zoopla_train.csv')
test_csv_path = os.path.join(DATA_FOLDER, 'zoopla_test.csv')

zoopla_df_train = pd.read_csv(train_csv_path, dtype=str)
zoopla_df_test = pd.read_csv(test_csv_path, dtype=str)

In [4]:
# List the columns available in the training data
zoopla_df_train.columns

Index(['listing_id', 'parish', 'post_town', 'postcode', 'latitude',
       'longitude', 'num_bedrooms', 'num_bathrooms', 'price',
       'property_type_general', 'CURRENT_ENERGY_RATING_mode',
       'POTENTIAL_ENERGY_RATING_mode', 'TOTAL_FLOOR_AREA_median',
       'NUMBER_HABITABLE_ROOMS_mode', 'CONSTRUCTION_AGE_BAND_mode',
       'Index of Multiple Deprivation Decile', 'Income Decile',
       'Employment Decile', 'Education and Skills Decile',
       'Health and Disability Decile', 'Crime Decile',
       'Barriers to Housing and Services Decile', 'Living Environment Decile',
       'IDACI Decile', 'IDAOPI Decile', 'PROB_4BAND', 'diff_published_date',
       'last_published_year', 'last_published_month', 'first_published_year',
       'first_published_month'],
      dtype='object')

In [5]:
# Preview the first rows of the training data
zoopla_df_train.head()

Unnamed: 0,listing_id,parish,post_town,postcode,latitude,longitude,num_bedrooms,num_bathrooms,price,property_type_general,CURRENT_ENERGY_RATING_mode,POTENTIAL_ENERGY_RATING_mode,TOTAL_FLOOR_AREA_median,NUMBER_HABITABLE_ROOMS_mode,CONSTRUCTION_AGE_BAND_mode,Index of Multiple Deprivation Decile,Income Decile,Employment Decile,Education and Skills Decile,Health and Disability Decile,Crime Decile,Barriers to Housing and Services Decile,Living Environment Decile,IDACI Decile,IDAOPI Decile,PROB_4BAND,diff_published_date,last_published_year,last_published_month,first_published_year,first_published_month
0,62385013,"Hinckley and Bosworth, unparished area",Hinckley,LE10 0,52.5503,-1.386379,4,2,400000.0,Detached house,5.0,3.0,116.0,5.0,3.0,3,3,2,1,5,5,9,9,4,5,,0,2022,9,2022,9
1,62716145,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 9,52.525043,-1.515505,3,1,250000.0,Semi-detached house,4.0,2.0,97.051,4.0,5.0,6,6,6,4,4,7,6,8,5,9,,1,2022,11,2022,10
2,62579010,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 0,52.534294,-1.454733,3,1,290000.0,Semi-detached house,4.0,2.0,75.25,4.0,3.0,9,9,8,8,6,7,8,6,10,8,,2,2023,1,2022,10
3,62485357,"Hinckley and Bosworth, unparished area",Hinckley,LE10 1,52.544140000000006,-1.364725,4,2,550000.0,Detached house,5.0,3.0,179.0,8.0,1.0,5,5,6,3,3,4,10,4,5,5,,2,2022,12,2022,9
4,62478555,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 8,52.519398,-1.493818,3,0,235000.0,Semi-detached house,3.0,2.0,71.0,5.0,5.0,4,4,4,3,4,2,9,5,5,5,,4,2023,2,2022,9


Separate the variables into numeric (with a further split for the numeric variables we want converted to KMeans clusters), ordered discrete, and categorical groups.

In [6]:
# Continuous numeric features
numeric_cols = [
    'TOTAL_FLOOR_AREA_median',
    'diff_published_date',
]

# Coordinates, kept apart so they can be converted to KMeans clusters
numeric_kmeans_cols = [
    'latitude',
    'longitude',
]

# Ordered discrete features (list order determines output column order)
discrete_cols = [
    # property size
    'num_bedrooms',
    'num_bathrooms',
    # energy performance certificate fields
    'CURRENT_ENERGY_RATING_mode',
    'POTENTIAL_ENERGY_RATING_mode',
    'CONSTRUCTION_AGE_BAND_mode',
    # deprivation deciles
    'Index of Multiple Deprivation Decile',
    'Income Decile',
    'Employment Decile',
    'Education and Skills Decile',
    'Health and Disability Decile',
    'Crime Decile',
    'Barriers to Housing and Services Decile',
    'Living Environment Decile',
    'IDACI Decile',
    'IDAOPI Decile',
    # listing dates
    'last_published_year',
    'last_published_month',
    'first_published_year',
    'first_published_month',
]

# Unordered categorical features (one-hot encoded later)
categoric_cols = [
    'post_town',
    'parish',
    'postcode',
    'PROB_4BAND',
    'property_type_general',
]

### Transform the four types of variables separately in a pipeline
Set up pipelines. For null values, we use the median in the training data for numeric fields and the mode for discrete and categoric fields.

In [7]:
class KMeansCluster(BaseEstimator, TransformerMixin):

    """
    Replace each input row (here: latitude/longitude) with the index of its
    nearest KMeans cluster centre.

    Cluster centres are learned in ``fit`` and reused unchanged in
    ``transform``, so test data is assigned to the clusters found on the
    training data.

    Parameters
    ----------
    n_clusters : int, default=8
        Number of clusters, passed through to ``KMeans``.
    random_state : default=40
        Seed passed through to ``KMeans``. Exposed as a constructor parameter
        so it is visible to ``get_params``/``clone`` and can be changed
        without editing the class.
    """

    def __init__(self, n_clusters=8, random_state=40):
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the KMeans model on X (y is ignored) and return self."""
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans.fit(X)
        return self

    def transform(self, X):
        """Return a 2D column array holding the nearest-cluster index for each row of X."""
        # KMeans.transform returns each row's distance to every cluster centre;
        # argmin over axis 1 picks the closest centre.
        nearest_cluster = self.kmeans.transform(X).argmin(axis=1)
        # Downstream OneHotEncoder needs a 2D array, so reshape to a single column.
        return nearest_cluster.reshape(-1, 1)

In [8]:
# Continuous features: fill nulls with the median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median', add_indicator=False)),
    ('num_std_scaler', StandardScaler()),
])

# Coordinates: fill nulls with the median, assign each row to one of 8 KMeans
# clusters, then one-hot encode the cluster index (first level dropped).
numeric_kmeans_pipeline = Pipeline(steps=[
    ('num_kmeans_imputer', SimpleImputer(strategy='median', add_indicator=False)),
    ('num_kmeans_cluster', KMeansCluster(n_clusters=8)),
    ('num_kmeans_oh_encoder', OneHotEncoder(drop='first')),
])

# Ordered discrete features: fill nulls with the mode, then standardise.
discrete_pipeline = Pipeline(steps=[
    ('dis_imputer', SimpleImputer(strategy='most_frequent', add_indicator=False)),
    ('dis_std_scaler', StandardScaler()),
])

# Categorical features: fill nulls with the mode, then one-hot encode
# (first level of each feature dropped).
categoric_pipeline = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent', add_indicator=False)),
    ('cat_oh_encoder', OneHotEncoder(drop='first')),
])

Set up a column transformer to handle all four types of columns together

In [9]:
# Send each group of columns through its own pipeline. The transformed blocks
# are concatenated in the order listed here.
full_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, numeric_cols),
        ('numeric_kmeans', numeric_kmeans_pipeline, numeric_kmeans_cols),
        ('discrete', discrete_pipeline, discrete_cols),
        ('categoric', categoric_pipeline, categoric_cols),
    ]
)

Fit the pipeline to the training data and transform it. Then apply the already-fitted pipeline to the test data, without refitting.

In [10]:
# Fit every step (imputers, scalers, KMeans clusters, one-hot encoders) on the
# training data and transform it in the same call.
X_train = full_pipeline.fit_transform(zoopla_df_train)
# Transform the test data with the already-fitted pipeline; nothing is refitted here.
X_test = full_pipeline.transform(zoopla_df_test)

In [11]:
# Spot-check a single transformed training row (index 200)
X_train[200]

array([-0.39082098, -0.6995524 ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        , -0.01901166,
       -0.4819992 ,  1.60041207, -0.1517802 , -0.53041424, -0.78841033,
       -0.4761415 , -0.72076054,  1.31552378, -1.13654453, -1.34230752,
       -1.06366227, -0.15211694,  0.43378577, -0.18976236, -1.53953426,
        1.79859224, -0.57635398,  0.98786238,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ])

In [12]:
# Names of the one-hot columns produced for the categoric features.
# The x0..x4 prefixes follow the order of categoric_cols.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
full_pipeline.named_transformers_['categoric'].named_steps['cat_oh_encoder'].get_feature_names_out()

array(['x0_Nuneaton', 'x1_Hinckley and Bosworth, unparished area',
       'x1_Nuneaton and Bedworth, unparished area', 'x1_Other',
       'x1_Stoke Golding', 'x2_CV10 7', 'x2_CV10 8', 'x2_CV10 9',
       'x2_CV11 4', 'x2_CV11 5', 'x2_CV11 6', 'x2_CV11 7', 'x2_CV12 9',
       'x2_CV13 0', 'x2_CV13 6', 'x2_CV9 3', 'x2_LE10 0', 'x2_LE10 1',
       'x2_LE10 2', 'x2_LE10 3', 'x2_LE9 7', 'x3_Medium', 'x3_None',
       'x4_Detached house', 'x4_End terrace house', 'x4_Flat',
       'x4_Maisonette', 'x4_Other/Unknown', 'x4_Semi-detached house',
       'x4_Terraced house'], dtype=object)

In [13]:
# Inspect the fitted transformers as (name, fitted pipeline, columns) tuples
full_pipeline.transformers_

[('numeric',
  Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median')),
                  ('num_std_scaler', StandardScaler())]),
  ['TOTAL_FLOOR_AREA_median', 'diff_published_date']),
 ('numeric_kmeans',
  Pipeline(steps=[('num_kmeans_imputer', SimpleImputer(strategy='median')),
                  ('num_kmeans_cluster', KMeansCluster()),
                  ('num_kmeans_oh_encoder', OneHotEncoder(drop='first'))]),
  ['latitude', 'longitude']),
 ('discrete',
  Pipeline(steps=[('dis_imputer', SimpleImputer(strategy='most_frequent')),
                  ('dis_std_scaler', StandardScaler())]),
  ['num_bedrooms',
   'num_bathrooms',
   'CURRENT_ENERGY_RATING_mode',
   'POTENTIAL_ENERGY_RATING_mode',
   'CONSTRUCTION_AGE_BAND_mode',
   'Index of Multiple Deprivation Decile',
   'Income Decile',
   'Employment Decile',
   'Education and Skills Decile',
   'Health and Disability Decile',
   'Crime Decile',
   'Barriers to Housing and Services Decile',
   'Living Environment Decile',
   

In [14]:
# Same one-hot encoder, reached positionally: transformers_[3] is the
# 'categoric' entry, [1] is its fitted pipeline, [1] is the encoder step.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
full_pipeline.transformers_[3][1][1].get_feature_names_out()

array(['x0_Nuneaton', 'x1_Hinckley and Bosworth, unparished area',
       'x1_Nuneaton and Bedworth, unparished area', 'x1_Other',
       'x1_Stoke Golding', 'x2_CV10 7', 'x2_CV10 8', 'x2_CV10 9',
       'x2_CV11 4', 'x2_CV11 5', 'x2_CV11 6', 'x2_CV11 7', 'x2_CV12 9',
       'x2_CV13 0', 'x2_CV13 6', 'x2_CV9 3', 'x2_LE10 0', 'x2_LE10 1',
       'x2_LE10 2', 'x2_LE10 3', 'x2_LE9 7', 'x3_Medium', 'x3_None',
       'x4_Detached house', 'x4_End terrace house', 'x4_Flat',
       'x4_Maisonette', 'x4_Other/Unknown', 'x4_Semi-detached house',
       'x4_Terraced house'], dtype=object)