# Pipelines and Pandas

This is a short post about how to use Scikit-Learn Pipelines so that you have 'Pandas in, pandas out'. I'll build a small data pipeline on the Ames Iowa housing dataset. The first thing is to import the dataset, and inspect it! The data has 82 columns which include 23 nominal, 23 ordinal, 14 discrete, and 20 continuous variables (and 2 additional observation identifiers).

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
ames = pd.read_csv('ames.csv')

In [3]:
ames.head(10)

Unnamed: 0,Order,PID,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Street,Alley,Lot.Shape,Land.Contour,...,Pool.Area,Pool.QC,Fence,Misc.Feature,Misc.Val,Mo.Sold,Yr.Sold,Sale.Type,Sale.Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
5,6,527105030,60,RL,78.0,9978,Pave,,IR1,Lvl,...,0,,,,0,6,2010,WD,Normal,195500
6,7,527127150,120,RL,41.0,4920,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,213500
7,8,527145080,120,RL,43.0,5005,Pave,,IR1,HLS,...,0,,,,0,1,2010,WD,Normal,191500
8,9,527146030,120,RL,39.0,5389,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,236500
9,10,527162130,60,RL,60.0,7500,Pave,,Reg,Lvl,...,0,,,,0,6,2010,WD,Normal,189000


It's always a good idea to set an index on a DataFrame if you have one. In this case, the `PID` column is a unique identifier 

In [4]:
ames = ames.set_index('PID').copy()

In [5]:
ames.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Order,2930.0,1465.5,845.96247,1.0,733.25,1465.5,2197.75,2930.0
MS.SubClass,2930.0,57.387372,42.638025,20.0,20.0,50.0,70.0,190.0
Lot.Frontage,2440.0,69.22459,23.365335,21.0,58.0,68.0,80.0,313.0
Lot.Area,2930.0,10147.921843,7880.017759,1300.0,7440.25,9436.5,11555.25,215245.0
Overall.Qual,2930.0,6.094881,1.411026,1.0,5.0,6.0,7.0,10.0
Overall.Cond,2930.0,5.56314,1.111537,1.0,5.0,5.0,6.0,9.0
Year.Built,2930.0,1971.356314,30.245361,1872.0,1954.0,1973.0,2001.0,2010.0
Year.Remod.Add,2930.0,1984.266553,20.860286,1950.0,1965.0,1993.0,2004.0,2010.0
Mas.Vnr.Area,2907.0,101.896801,179.112611,0.0,0.0,0.0,164.0,1600.0
BsmtFin.SF.1,2929.0,442.629566,455.590839,0.0,0.0,370.0,734.0,5644.0


In [6]:
ames.shape

(2930, 81)

The main comment is that there's a lof of missing data, and that some columns should be dropped entirely (in particular Alley, Pool QC, Fence, Misc Feature, Fireplace QC). There's also a mixture of categorical and numerical features, which is a little tricky to handle. Luckily sklearn has the `FeatureUnion` and `Pipeline` objects to help us.

In [7]:
ames.isnull().sum()

Order                0
MS.SubClass          0
MS.Zoning            0
Lot.Frontage       490
Lot.Area             0
Street               0
Alley             2732
Lot.Shape            0
Land.Contour         0
Utilities            0
Lot.Config           0
Land.Slope           0
Neighborhood         0
Condition.1          0
Condition.2          0
Bldg.Type            0
House.Style          0
Overall.Qual         0
Overall.Cond         0
Year.Built           0
Year.Remod.Add       0
Roof.Style           0
Roof.Matl            0
Exterior.1st         0
Exterior.2nd         0
Mas.Vnr.Type        23
Mas.Vnr.Area        23
Exter.Qual           0
Exter.Cond           0
Foundation           0
                  ... 
Bedroom.AbvGr        0
Kitchen.AbvGr        0
Kitchen.Qual         0
TotRms.AbvGrd        0
Functional           0
Fireplaces           0
Fireplace.Qu      1422
Garage.Type        157
Garage.Yr.Blt      159
Garage.Finish      159
Garage.Cars          1
Garage.Area          1
Garage.Qual

First of all, let's build a transformer which drops the columns we suggest. A sklearn compatible transformer is a class which has to have two methods `fit` (which returns `self`), and `transform` (which can return whatever you want). It's a good idea to inherit from `sklearn.base.TransformerMixin` and `sklearn.base.BaseEstimator`. The general pattern of a Transformer is:

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class ExampleTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return do_something_to(self, X)

These are actually quite simple, and can be quite flexible. Later on, we might see a transformer that uses the `fit` method. Anyway, here's a column dropper. 

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LarsCV
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [10]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    '''
    Transformer to drop a list of cols
    '''
    
    def __init__(self, drop_cols):
        self._drop_cols = drop_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df = X.copy()
        df = df.drop(self._drop_cols, axis=1)
        return df

In [11]:
y = ames['SalePrice'].copy()
X = ames.drop('SalePrice', axis=1).copy()
X.columns

Index(['Order', 'MS.SubClass', 'MS.Zoning', 'Lot.Frontage', 'Lot.Area',
       'Street', 'Alley', 'Lot.Shape', 'Land.Contour', 'Utilities',
       'Lot.Config', 'Land.Slope', 'Neighborhood', 'Condition.1',
       'Condition.2', 'Bldg.Type', 'House.Style', 'Overall.Qual',
       'Overall.Cond', 'Year.Built', 'Year.Remod.Add', 'Roof.Style',
       'Roof.Matl', 'Exterior.1st', 'Exterior.2nd', 'Mas.Vnr.Type',
       'Mas.Vnr.Area', 'Exter.Qual', 'Exter.Cond', 'Foundation', 'Bsmt.Qual',
       'Bsmt.Cond', 'Bsmt.Exposure', 'BsmtFin.Type.1', 'BsmtFin.SF.1',
       'BsmtFin.Type.2', 'BsmtFin.SF.2', 'Bsmt.Unf.SF', 'Total.Bsmt.SF',
       'Heating', 'Heating.QC', 'Central.Air', 'Electrical', 'X1st.Flr.SF',
       'X2nd.Flr.SF', 'Low.Qual.Fin.SF', 'Gr.Liv.Area', 'Bsmt.Full.Bath',
       'Bsmt.Half.Bath', 'Full.Bath', 'Half.Bath', 'Bedroom.AbvGr',
       'Kitchen.AbvGr', 'Kitchen.Qual', 'TotRms.AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace.Qu', 'Garage.Type', 'Garage.Yr.Blt',
       'Gar

In [12]:
pipe = Pipeline([('dropper', ColumnDropper(['Alley', 'Pool.QC', 'Fence', 'Misc.Feature', 'Fireplace.Qu', 'Order']))])
X_trans = pipe.fit_transform(X)
X_trans.columns

Index(['MS.SubClass', 'MS.Zoning', 'Lot.Frontage', 'Lot.Area', 'Street',
       'Lot.Shape', 'Land.Contour', 'Utilities', 'Lot.Config', 'Land.Slope',
       'Neighborhood', 'Condition.1', 'Condition.2', 'Bldg.Type',
       'House.Style', 'Overall.Qual', 'Overall.Cond', 'Year.Built',
       'Year.Remod.Add', 'Roof.Style', 'Roof.Matl', 'Exterior.1st',
       'Exterior.2nd', 'Mas.Vnr.Type', 'Mas.Vnr.Area', 'Exter.Qual',
       'Exter.Cond', 'Foundation', 'Bsmt.Qual', 'Bsmt.Cond', 'Bsmt.Exposure',
       'BsmtFin.Type.1', 'BsmtFin.SF.1', 'BsmtFin.Type.2', 'BsmtFin.SF.2',
       'Bsmt.Unf.SF', 'Total.Bsmt.SF', 'Heating', 'Heating.QC', 'Central.Air',
       'Electrical', 'X1st.Flr.SF', 'X2nd.Flr.SF', 'Low.Qual.Fin.SF',
       'Gr.Liv.Area', 'Bsmt.Full.Bath', 'Bsmt.Half.Bath', 'Full.Bath',
       'Half.Bath', 'Bedroom.AbvGr', 'Kitchen.AbvGr', 'Kitchen.Qual',
       'TotRms.AbvGrd', 'Functional', 'Fireplaces', 'Garage.Type',
       'Garage.Yr.Blt', 'Garage.Finish', 'Garage.Cars', 'Garage.Area'

Another option, especially with columns with missing values, is to impute the value but to include a column telling the model where the imputed values are.

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin


class ImputeWithDummy(BaseEstimator, TransformerMixin):
    
    def __init__(self, cols_to_impute, strategy, fill='NA'):
        self.cols_to_impute = cols_to_impute
        self.strategy = strategy
        self.fill = fill
        
    def fit(self, X, y=None, **kwargs):
        if self.strategy == 'mean':
            self.fill = X.mean()
        elif self.strategy == 'median':
            self.fill = X.median()
        elif self.strategy == 'mode':
            self.fill = X.mode().iloc[0]
        elif self.strategy == 'fill':
            if type(self.fill) is list and type(X) is pd.DataFrame:
                self.fill = dict([(cname, v) for cname,v in zip(X.columns, self.fill)])
        return self
    
    def transform(self, X):
        df = X.copy()
        for col in self.cols_to_impute:
            df['{}_missing'.format(col)] = df[col].isnull().astype(int)
            df[col] = df[col].fillna(self.fill[col])
        return df
    
X = pd.read_csv('ames.csv')
     
imputer = ImputeWithDummy(['Alley'], strategy='mode')

X_transformed = imputer.fit_transform(X)

X_transformed[['Alley', 'Alley_missing']].head()

Unnamed: 0,Alley,Alley_missing
0,Grvl,1
1,Grvl,1
2,Grvl,1
3,Grvl,1
4,Grvl,1


Of course, you should always read the data documentation (https://ww2.amstat.org/publications/jse/v19n3/decock/datadocumentation.txt), and there you'll see for `Alley` that NaN means `No Alley Access`, and that we don't need to any imputation at all!

In [14]:
class NaNImpute(BaseEstimator, TransformerMixin):
    
    def __init__(self, cols, fill_vals):
        self.cols = cols
        self.fill_vals = fill_vals
        
    def fit(self, X, y=None, **kwargs):
        return self
    
    def transform(self, X):
        df = X.copy()
        for i, col in enumerate(self.cols):
            df[col].fillna(self.fill_vals[i])

The other thing we'll need to consider is that some columns will need to be converted to numeric features first, before an estimator can be fitted. First we'll fit an imputer, and then encode.

In [15]:
class DummyEncoding(BaseEstimator, TransformerMixin):

    def __init__(self, columns=None):

        self.columns = columns

    def fit(self, X, y=None, **kwargs):
        return self
    
    def transform(self, X, y=None, **kwargs):
        return pd.get_dummies(X, columns=self.columns, drop_first=True)

In [16]:
impute_cols = ['Alley', 'Pool.QC', 'Fence', 'Misc.Feature', 'Fireplace.Qu']
pipe = Pipeline([('impute', ImputeWithDummy(impute_cols, strategy='mode')), ('encode', DummyEncoding(impute_cols))])

X = pd.read_csv('ames.csv')

X_trans = pipe.fit_transform(X)
X_trans.columns

Index(['Order', 'PID', 'MS.SubClass', 'MS.Zoning', 'Lot.Frontage', 'Lot.Area',
       'Street', 'Lot.Shape', 'Land.Contour', 'Utilities', 'Lot.Config',
       'Land.Slope', 'Neighborhood', 'Condition.1', 'Condition.2', 'Bldg.Type',
       'House.Style', 'Overall.Qual', 'Overall.Cond', 'Year.Built',
       'Year.Remod.Add', 'Roof.Style', 'Roof.Matl', 'Exterior.1st',
       'Exterior.2nd', 'Mas.Vnr.Type', 'Mas.Vnr.Area', 'Exter.Qual',
       'Exter.Cond', 'Foundation', 'Bsmt.Qual', 'Bsmt.Cond', 'Bsmt.Exposure',
       'BsmtFin.Type.1', 'BsmtFin.SF.1', 'BsmtFin.Type.2', 'BsmtFin.SF.2',
       'Bsmt.Unf.SF', 'Total.Bsmt.SF', 'Heating', 'Heating.QC', 'Central.Air',
       'Electrical', 'X1st.Flr.SF', 'X2nd.Flr.SF', 'Low.Qual.Fin.SF',
       'Gr.Liv.Area', 'Bsmt.Full.Bath', 'Bsmt.Half.Bath', 'Full.Bath',
       'Half.Bath', 'Bedroom.AbvGr', 'Kitchen.AbvGr', 'Kitchen.Qual',
       'TotRms.AbvGrd', 'Functional', 'Fireplaces', 'Garage.Type',
       'Garage.Yr.Blt', 'Garage.Finish', 'Garage.Cars

In [17]:
class TypeSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, dtype):
        self.dtype = dtype
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

In [18]:
X = pd.read_csv('ames.csv')

In [19]:
select = TypeSelector('float64')

In [20]:
select.fit_transform(X).head()

Unnamed: 0,Lot.Frontage,Mas.Vnr.Area,BsmtFin.SF.1,BsmtFin.SF.2,Bsmt.Unf.SF,Total.Bsmt.SF,Bsmt.Full.Bath,Bsmt.Half.Bath,Garage.Yr.Blt,Garage.Cars,Garage.Area
0,141.0,112.0,639.0,0.0,441.0,1080.0,1.0,0.0,1960.0,2.0,528.0
1,80.0,0.0,468.0,144.0,270.0,882.0,0.0,0.0,1961.0,1.0,730.0
2,81.0,108.0,923.0,0.0,406.0,1329.0,0.0,0.0,1958.0,1.0,312.0
3,93.0,0.0,1065.0,0.0,1045.0,2110.0,1.0,0.0,1968.0,2.0,522.0
4,74.0,0.0,791.0,0.0,137.0,928.0,0.0,0.0,1997.0,2.0,482.0


In [21]:
select.fit_transform(X).columns

Index(['Lot.Frontage', 'Mas.Vnr.Area', 'BsmtFin.SF.1', 'BsmtFin.SF.2',
       'Bsmt.Unf.SF', 'Total.Bsmt.SF', 'Bsmt.Full.Bath', 'Bsmt.Half.Bath',
       'Garage.Yr.Blt', 'Garage.Cars', 'Garage.Area'],
      dtype='object')

In [22]:
select = TypeSelector('int64')
select.fit_transform(X).head()

Unnamed: 0,Order,PID,MS.SubClass,Lot.Area,Overall.Qual,Overall.Cond,Year.Built,Year.Remod.Add,X1st.Flr.SF,X2nd.Flr.SF,...,Wood.Deck.SF,Open.Porch.SF,Enclosed.Porch,X3Ssn.Porch,Screen.Porch,Pool.Area,Misc.Val,Mo.Sold,Yr.Sold,SalePrice
0,1,526301100,20,31770,6,5,1960,1960,1656,0,...,210,62,0,0,0,0,0,5,2010,215000
1,2,526350040,20,11622,5,6,1961,1961,896,0,...,140,0,0,0,120,0,0,6,2010,105000
2,3,526351010,20,14267,6,6,1958,1958,1329,0,...,393,36,0,0,0,0,12500,6,2010,172000
3,4,526353030,20,11160,7,5,1968,1968,2110,0,...,0,0,0,0,0,0,0,4,2010,244000
4,5,527105010,60,13830,5,5,1997,1998,928,701,...,212,34,0,0,0,0,0,3,2010,189900


In [23]:
select.fit_transform(X).columns

Index(['Order', 'PID', 'MS.SubClass', 'Lot.Area', 'Overall.Qual',
       'Overall.Cond', 'Year.Built', 'Year.Remod.Add', 'X1st.Flr.SF',
       'X2nd.Flr.SF', 'Low.Qual.Fin.SF', 'Gr.Liv.Area', 'Full.Bath',
       'Half.Bath', 'Bedroom.AbvGr', 'Kitchen.AbvGr', 'TotRms.AbvGrd',
       'Fireplaces', 'Wood.Deck.SF', 'Open.Porch.SF', 'Enclosed.Porch',
       'X3Ssn.Porch', 'Screen.Porch', 'Pool.Area', 'Misc.Val', 'Mo.Sold',
       'Yr.Sold', 'SalePrice'],
      dtype='object')

From inspecting these columns, we can see that a few categorical columns have slipped through disguised as numerical columns - in particular `Overall.Qual1`, `Overall.Cond`, `Year.Built`, `Year.Remod.Add`, `Mo.Sold`, `Yr.Sold`. 

In [24]:
float_cols = ['Lot.Frontage', 
              'Mas.Vnr.Area', 
              'BsmtFin.SF.1', 
              'BsmtFin.SF.2', 
              'Bsmt.Unf.SF',
              'Total.Bsmt.SF',
              'Garage.Cars',
              'Garage.Area'
             ]

int_cols = ['MS.SubClass', 
            'Lot.Area',
            'X1st.Flr.SF',
            'X2nd.Flr.SF', 
            'Low.Qual.Fin.SF', 
            'Gr.Liv.Area', 
            'Full.Bath',
            'Half.Bath', 
            'Bedroom.AbvGr', 
            'Kitchen.AbvGr', 
            'TotRms.AbvGrd',
            'Fireplaces', 
            'Wood.Deck.SF', 
            'Open.Porch.SF', 
            'Enclosed.Porch',
            'X3Ssn.Porch', 
            'Screen.Porch', 
            'Pool.Area', 
            'Misc.Val'
           ]

Finally, we need some way to deal with ordinal features:

  - Lot Shape (Ordinal): General shape of property
  - Utilities (Ordinal): Type of utilities available
  - Land Slope (Ordinal): Slope of property
  - Overall Qual (Ordinal): Rates the overall material and finish of the house
  - Overall Cond (Ordinal): Rates the overall condition of the house
  - Exter Qual (Ordinal): Evaluates the quality of the material on the exterior
  - Exter Cond (Ordinal): Evaluates the present condition of the material on the exterior
  - Bsmt Qual (Ordinal): Evaluates the height of the basement
  - Bsmt Cond (Ordinal): Evaluates the general condition of the basement
  - Bsmt Exposure	(Ordinal): Refers to walkout or garden level walls
  - BsmtFin Type 1	(Ordinal): Rating of basement finished area
  - BsmtFin Type 2	(Ordinal): Rating of basement finished area (if multiple types)
  - HeatingQC (Ordinal): Heating quality and condition
  - Electrical (Ordinal): Electrical system
  - FireplaceQu (Ordinal): Fireplace quality
  - Garage Finish (Ordinal)	: Interior finish of the garage
  - Garage Qual (Ordinal): Garage quality
  - Garage Cond (Ordinal): Garage condition
  - Paved Drive (Ordinal): Paved driveway
  - Pool QC (Ordinal): Pool quality
  - Fence (Ordinal): Fence quality

To do this we could use the OrdinalEncoder from http://contrib.scikit-learn.org/categorical-encoding/, which will be included in sklearn in a future release  - but I have trouble getting this to work with Pandas. Another choice is just to write our own, which is what I'll do.

In [25]:
X = pd.read_csv('ames.csv')
X.columns

Index(['Order', 'PID', 'MS.SubClass', 'MS.Zoning', 'Lot.Frontage', 'Lot.Area',
       'Street', 'Alley', 'Lot.Shape', 'Land.Contour', 'Utilities',
       'Lot.Config', 'Land.Slope', 'Neighborhood', 'Condition.1',
       'Condition.2', 'Bldg.Type', 'House.Style', 'Overall.Qual',
       'Overall.Cond', 'Year.Built', 'Year.Remod.Add', 'Roof.Style',
       'Roof.Matl', 'Exterior.1st', 'Exterior.2nd', 'Mas.Vnr.Type',
       'Mas.Vnr.Area', 'Exter.Qual', 'Exter.Cond', 'Foundation', 'Bsmt.Qual',
       'Bsmt.Cond', 'Bsmt.Exposure', 'BsmtFin.Type.1', 'BsmtFin.SF.1',
       'BsmtFin.Type.2', 'BsmtFin.SF.2', 'Bsmt.Unf.SF', 'Total.Bsmt.SF',
       'Heating', 'Heating.QC', 'Central.Air', 'Electrical', 'X1st.Flr.SF',
       'X2nd.Flr.SF', 'Low.Qual.Fin.SF', 'Gr.Liv.Area', 'Bsmt.Full.Bath',
       'Bsmt.Half.Bath', 'Full.Bath', 'Half.Bath', 'Bedroom.AbvGr',
       'Kitchen.AbvGr', 'Kitchen.Qual', 'TotRms.AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace.Qu', 'Garage.Type', 'Garage.Yr.Blt',
    

In [26]:
ord_cols     = ['Lot.Shape',
                'Utilities',
                'Land.Slope',
                'Overall.Qual',
                'Overall.Cond',
                'Exter.Qual', 
                'Exter.Cond',
                'Bsmt.Qual',
                'Bsmt.Cond', 
                'Bsmt.Exposure', 
                'BsmtFin.Type.1', 
                'BsmtFin.SF.1',
                'Heating.QC',
                'Electrical',
                'Fireplace.Qu',
                'Garage.Finish',
                'Garage.Qual',
                'Garage.Cond',
                'Paved.Drive',
                'Pool.QC',
                'Fence',
               ]

cat_cols = ['MS.SubClass',
            'MS.Zoning',
            'Street',
            'Alley',
            'Land.Contour',
            'Lot.Config',
            'Neighborhood',
            'Condition.1',
            'Condition.2',
            'Bldg.Type',
            'House.Style',
            'Roof.Style',
            'Exterior.1st', 
            'Exterior.2nd',
            'Mas.Vnr.Type',
            'Foundation',
            'Heating',
            'Central.Air',
            'Garage.Type',
            'Misc.Feature',
            'Sale.Type',
            'Sale.Condition'
]

In [27]:
class OrdinalEncoding(BaseEstimator, TransformerMixin):
    
    def __init__(self, col, scale_map=None, drop_original=False):
        self.col = col
        self.scale_map = scale_map
        self.drop_original = drop_original
        
    def fit(self, X, y=None, **kwargs):
        if self.scale_map is None:
            keys = X[self.col].unique()
            vals = list(range(len(keys)))
            self.scale_map = dict(zip(keys, vals))
        return self
    
    def transform(self, X):
        df = X.copy()
        df['{}_scaled'.format(self.col)] = df[self.col].replace(self.scale_map)
        if self.drop_original:
            df = df.drop(self.col, axis=1)
        return df

In [28]:
X = pd.read_csv('ames.csv')

In [29]:
scale_map = {'Reg':3, 'IR1':2, 'IR2':1, 'IR3':0}

In [30]:
encoder = OrdinalEncoding('Lot.Shape', scale_map, drop_original=True)
X_trans = encoder.transform(X)

In [31]:
encoder = OrdinalEncoding('Lot.Shape', drop_original=False)
X_trans = encoder.fit_transform(X)

In [32]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    Select columns from pandas dataframe by specifying a list of column names
    '''
    def __init__(self, col_names):
        self.col_names = col_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.col_names]

In [33]:
class Scale(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.scaler = StandardScaler()
        self.cols = []
        self.index = []
        
    def fit(self, X, y=None, **kwargs):
        self.scaler.fit(X)
        self.cols = X.columns
        self.index = X.index
        return self
        
    def transform(self, X):
        df = X.copy()
        df = self.scaler.transform(df)
        print(df.shape)
        df = pd.DataFrame(df, columns=self.cols, index=self.index)
        return df

In [81]:
class CategoricalEncode(BaseEstimator, TransformerMixin):
    
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **kwargs):
        return self
    
    def transform(self, X):
        df = X.copy()
        for col in self.cols:
            df[col] = df[col].astype('category')
        return df

In [34]:
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from scipy import sparse


class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, weight, X, y,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, weight, X)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [132]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, LabelBinarizer
from category_encoders import OneHotEncoder

numerical_cols = int_cols + float_cols

pipe = Pipeline([
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        ('numericals', Pipeline([
             ('selector', DataFrameSelector(numerical_cols)),
             ('imputer', ImputeWithDummy(numerical_cols, strategy='mean')),
             ('scaling', StandardScaler())
        ])),
        
        ('categoricals', Pipeline([
             ('selector', DataFrameSelector(cat_cols)),
             ('imputer', ImputeWithDummy(cat_cols, strategy='mode')),
             ('categorical_encoding', OneHotEncoder(cat_cols))
        ])),x
    ])), 
])  

In [133]:
ames = pd.read_csv('ames.csv')

y = ames['SalePrice']

X = ames.drop(['SalePrice', 'PID', 'Order'], axis=1)

X_trans = pipe.fit_transform(ames)

AttributeError: 'DataFrame' object has no attribute 'unique'

In [None]:
np.any(np.isnan(X_trans))

In [None]:
from sklearn.linear_model import LarsCV

In [None]:
import warnings
warnings.filterwarnings("ignore")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
model = Pipeline([('pipeline', pipe), ('clf', LarsCV())])
model.fit(X_train, y_train)

In [None]:
model.steps[-1][1].coef_.shape

In [None]:
pipe.fit_transform(X_test).shape

In [None]:
pipe.fit_transform(X_train).shape

In [None]:
preds = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

mean_squared_error(preds, y_test)

In [134]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def rmse_cv(model, X, y):
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=3))
    return(rmse)

In [135]:
rmse_cv(model, X, y)

array([ 37112.71783827,  37112.0438335 ,  39275.64714113])

In [136]:
def get_score(prediction, labels):    
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(rmse_cv))

In [137]:
get_score(preds, y_test)

R2: 0.7554860856119494
RMSE: <function rmse_cv at 0x116b1e268>
