In [332]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import mean_absolute_error

# Pipelines
Pipelines are a simple way to keep your data preprocessing and modeling code organized. Specifically, a pipeline
bundles preprocessing and modeling steps so you can use the whole bundle as if it were a single step.

Many data scientists hack together models without pipelines, but pipelines have some important benefits. Those include:

    1.  Cleaner Code: Accounting for data at each step of preprocessing can get messy. With a pipeline, you won't need
    to manually keep track of your training and validation data at each step.
    2.  Fewer Bugs: There are fewer opportunities to misapply a step or forget a preprocessing step.
    3.  Easier to Productionize: It can be surprisingly hard to transition a model from a prototype to something
    deployable at scale. We won't go into the many related concerns here, but pipelines can help.
    4.  More Options for Model Validation: You will see an example in the next tutorial, which covers cross-validation.

In [333]:
# Load the full Melbourne housing dataset; the path is relative, so it resolves against the working directory.
melbourne_df = pd.read_csv('data/Melbourne_housing_FULL.csv')

Remove rows with a missing target, separate the target from the predictors, and put None instead of NaN.

In [334]:
# Bare last expression: the notebook renders the preview as a rich table instead of wrapped plain text.
melbourne_df.head()

       Suburb             Address  Rooms Type      Price Method SellerG  \
0  Abbotsford       68 Studley St      2    h        NaN     SS  Jellis   
1  Abbotsford        85 Turner St      2    h  1480000.0      S  Biggin   
2  Abbotsford     25 Bloomburg St      2    h  1035000.0      S  Biggin   
3  Abbotsford  18/659 Victoria St      3    u        NaN     VB  Rounds   
4  Abbotsford        5 Charles St      3    h  1465000.0     SP  Biggin   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/09/2016       2.5    3067.0  ...       1.0  1.0     126.0           NaN   
1  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
2  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
3  4/02/2016       2.5    3067.0  ...       2.0  1.0       0.0           NaN   
4  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   

   YearBuilt         CouncilArea Lattitude  Longtitude             R

In the first step we use the target column as the reference: every row whose target value is missing is dropped.

The subset parameter limits the dropping process to the columns passed to it. dropna(subset=['Price']) will drop
ONLY the rows that have a NaN in the Price column.

Basically, we want to clean the original DataFrame (drop rows) with respect to the Price column.

We should clean up the DataFrame with respect to the target data (drop the rows with NaN in the target column),
because the feature data will be imputed and one-hot encoded in later steps.

inplace=True means that the changes are made on the DataFrame itself. In that case the method returns None, so an
assignment such as new_df = df.drop(inplace=True) does not give you a new DataFrame.

In [335]:
# Drop only the rows whose target (Price) is missing; inplace=True modifies melbourne_df itself.
melbourne_df.dropna(axis=0, subset=['Price'], inplace=True)

Set feature matrix X to the DataFrame except Price column and target array y to Price.

In [336]:
# The target is the sale price; every remaining column is a candidate predictor.
y = melbourne_df['Price']
X = melbourne_df.drop(columns=['Price'])

In [337]:
# 80/20 hold-out split; the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=37,
)

Cardinality is defined as the number of unique values in a categorical column.

We need to select the low-cardinality columns (number of unique labels <= 15); the threshold is convenient but arbitrary.

Numerical columns are all numerical columns except Price.
Categorical columns are all categorical columns.
Features are numerical + categorical columns (except Price!).

In [338]:
# Partition the predictor columns by dtype, reading the dtypes straight from X.
numerical_columns = [name for name, dtype in X.dtypes.items() if is_numeric_dtype(dtype)]
categorical_columns = [name for name, dtype in X.dtypes.items() if not is_numeric_dtype(dtype)]

In [339]:
# Number of distinct labels (cardinality) for each categorical column.
frequency_labels = {name: X[name].nunique() for name in categorical_columns}
print(frequency_labels)

{'Suburb': 345, 'Address': 26751, 'Type': 3, 'Method': 5, 'SellerG': 349, 'Date': 78, 'CouncilArea': 33, 'Regionname': 8}


In [340]:
# Keep only the categorical columns with at most 15 distinct labels; these are the ones that get one-hot encoded.
low_cardinality_columns = [name for name in categorical_columns if X[name].nunique() <= 15]
print(low_cardinality_columns)

['Type', 'Method', 'Regionname']


We need to leave only the categorical columns with low cardinality.

Remember to copy during assignment, not to pass the objects by reference!

In [341]:
# Final feature list: numeric columns first, followed by the low-cardinality categorical columns.
feature_columns = [*numerical_columns, *low_cardinality_columns]

In [342]:
# Restrict both splits to the selected features; .copy() makes each result an independent DataFrame.
X_train = X_train.loc[:, feature_columns].copy()
X_test = X_test.loc[:, feature_columns].copy()

# Step 1: Define Preprocessing Steps
Similar to how a pipeline bundles together preprocessing and modeling steps, we use the ColumnTransformer class to
bundle together different preprocessing steps. The code below:

    1.  imputes missing values in numerical data, and
    2.  imputes missing values and applies a one-hot encoding to categorical data.

In [343]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


Preprocessing for numerical data (using SimpleImputer)

In [344]:
# Numeric branch: replace missing values with a constant. No fill_value is passed, so the imputer's default applies.
# NOTE(review): scikit-learn documents that default as 0 for numeric data — confirm 0 is an acceptable placeholder
# for columns such as YearBuilt or Lattitude.
numerical_transformer = SimpleImputer(strategy='constant')

Preprocessing for categorical data (using SimpleImputer and OneHotEncoder)

In [345]:
# Categorical branch: fill gaps with the most frequent label, then one-hot encode the result.
# handle_unknown='ignore' is set so that labels not seen during fit are encoded without raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

Bundle preprocessing for numerical and categorical data

In [346]:
# Route each column group to its own transformer: numeric columns go to the constant imputer,
# low-cardinality categorical columns go to the impute-then-encode pipeline.
preprocessor = ColumnTransformer([
    ('numerical', numerical_transformer, numerical_columns),
    ('categorical', categorical_transformer, low_cardinality_columns),
])

# Step 2: Define The Model

In [347]:
# Random forest regressor with 150 trees; random_state is fixed so repeated runs are reproducible.
melbourne_model = RandomForestRegressor(n_estimators=150, random_state=37)

# Step 3: Create and Evaluate the Pipeline
Finally, we use the Pipeline class to define a pipeline that bundles the preprocessing and modeling steps. There are a
few important things to notice:

    With the pipeline, we preprocess the training data and fit the model in a single line of code. (In contrast,
    without a pipeline, we have to do imputation, one-hot encoding, and model training in separate steps. This becomes
    especially messy if we have to deal with both numerical and categorical variables!)

    With the pipeline, we supply the unprocessed features in X_test to the predict() command, and the pipeline
    automatically preprocesses the features before generating predictions. (However, without a pipeline, we have to
    remember to preprocess the held-out data before making predictions.)

In [348]:
# Full workflow as one estimator: preprocessing first, then the random forest model.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', melbourne_model),
])

In [349]:
# Fit the preprocessing steps and the model on the training split in a single call.
# As the last expression in the cell, the fitted pipeline's repr becomes the cell output.
my_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical',
                                                  SimpleImputer(strategy='constant'),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
                                                   'Landsize', 'BuildingArea',
                                                   'YearBuilt', 'Lattitude',
                                                   'Longtitude',
                                                   'Propertycount']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                          

In [350]:
# Pass the unprocessed test features; the pipeline applies its fitted preprocessing before predicting.
y_pred = my_pipeline.predict(X_test)

In [351]:
# Mean absolute error between the held-out prices and the pipeline's predictions.
score = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error:\n', score)


Mean Absolute Error:
 168200.83781076694
