## ------Create a reproducible process for transforming the raw feature set into model data-------

Deature engineering and missing value imputation, which include
* filtering features
* encoding missing values with a unique value (for that column)
* encoding categorical features for use in common ML algorithms

In [1]:
import pandas as pd
import inspect

In [21]:
# Reading the merged raw data

df = pd.read_csv('/Users/sonalichaudhari/Desktop/Projects/Zillow/data/train_merged_data.csv',low_memory=False)
print(df.shape)
df.head()

(90275, 60)


Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016-01-01,1.0,,,2.0,3.0,,4.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016-01-01,1.0,,,3.0,2.0,,4.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


### Separating the target column from the train data

In [3]:
# Splitting data into train data and corresponding target
df = pd.read_csv('/Users/sonalichaudhari/Desktop/Projects/Zillow/data/train_merged_data.csv',low_memory=False)

train = df
y = df.pop('logerror')

In [4]:
train.shape

(90275, 59)

In [5]:
"""
A set of helper functions for the Kaggle/Zillow case study for Data Science Dream Job
"""
def get_data(dset):
    
    """Create the training dataset (2016) or the test dataset (2017)

    Keyword arguments:
    dset -- a string in {train, test}
    
    Returns:
    a tuple of pandas dataframe (X) and pandas series (y)
    """
    
    return train, y

The above helper function will have mutiple functions called inside it. So will be using this function after we combine the entire data pipeline code.

### Outline of the Preprocessor Object

In [6]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None):
        
        self.cols_to_filter = cols_to_filter
    
    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        
        return self
    
    def transform(self, X, y=None):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        return X_new

### Developing step by step the Preprocessor Object

### 1. Dealing with columns from previous steps and parcelid

* rawcensustractandblock
* censustractandblock
* propertyzoningdesc
* regionidneighborhood
* regionidzip

In [7]:
cols = ['rawcensustractandblock','censustractandblock','propertyzoningdesc','regionidneighborhood','regionidzip','parcelid']
train = train.drop(cols,axis =1)
train.head()

Unnamed: 0,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear
0,2016-01-01,1.0,,,2.0,3.0,,4.0,2.0,,...,1959.0,,,122754.0,360170.0,2015.0,237416.0,6735.88,,
1,2016-01-01,,,,3.5,4.0,,,3.5,,...,2014.0,,,346458.0,585529.0,2015.0,239071.0,10153.02,,
2,2016-01-01,1.0,,,3.0,2.0,,4.0,3.0,,...,1940.0,,,61994.0,119906.0,2015.0,57912.0,11484.48,,
3,2016-01-02,1.0,,,2.0,2.0,,4.0,2.0,,...,1987.0,,,171518.0,244880.0,2015.0,73362.0,3048.74,,
4,2016-01-02,,,,2.5,4.0,,,2.5,,...,1981.0,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,


### 2.  Dealing with Date columns 

In [8]:
datecols=['transactiondate']

In [9]:
for x in datecols:
    train[x+"_month"] = train['transactiondate'].apply(lambda x: int(x[5:7]))
    train[x+"_year"]  = train['transactiondate'].apply(lambda x: int(x[ :4]))
    train = train.drop(x, axis=1)
    
train.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,transactiondate_month,transactiondate_year
0,1.0,,,2.0,3.0,,4.0,2.0,,,...,,122754.0,360170.0,2015.0,237416.0,6735.88,,,1,2016
1,,,,3.5,4.0,,,3.5,,,...,,346458.0,585529.0,2015.0,239071.0,10153.02,,,1,2016
2,1.0,,,3.0,2.0,,4.0,3.0,,,...,,61994.0,119906.0,2015.0,57912.0,11484.48,,,1,2016
3,1.0,,,2.0,2.0,,4.0,2.0,,,...,,171518.0,244880.0,2015.0,73362.0,3048.74,,,1,2016
4,,,,2.5,4.0,,,2.5,,,...,,169574.0,434551.0,2015.0,264977.0,5488.96,,,1,2016


### 3.  Encoding Categorical Columns
To be flexible during modeling, we can encode each categorical feature with a set of binary features in a process called "Dummy Coding."

In [10]:
all_a = train.columns
numerical_columns = train._get_numeric_data().columns
categorical_columns = list(set(train.columns) - set(numerical_columns))

In [11]:
categorical_columns

['taxdelinquencyflag',
 'fireplaceflag',
 'hashottuborspa',
 'propertycountylandusecode']

In [12]:
train.shape

(90275, 54)

In [13]:
train = pd.get_dummies(train,columns= categorical_columns, dummy_na=True)

In [14]:
len(train.columns)

134

### 4. Dealing with Null values
An easy strategy :All the numeric variables are positive and encode missing values with a -1. 
Let's check that assumption:

In [16]:
train.loc[:,train.isna().sum()>0].min()

airconditioningtypeid              1.00
architecturalstyletypeid           2.00
basementsqft                     100.00
buildingclasstypeid                4.00
buildingqualitytypeid              1.00
calculatedbathnbr                  1.00
decktypeid                        66.00
finishedfloor1squarefeet          44.00
calculatedfinishedsquarefeet       2.00
finishedsquarefeet12               2.00
finishedsquarefeet13            1056.00
finishedsquarefeet15             560.00
finishedsquarefeet50              44.00
finishedsquarefeet6              257.00
fireplacecnt                       1.00
fullbathcnt                        1.00
garagecarcnt                       0.00
garagetotalsqft                    0.00
heatingorsystemtypeid              1.00
lotsizesquarefeet                167.00
poolcnt                            1.00
poolsizesum                       28.00
pooltypeid10                       1.00
pooltypeid2                        1.00
pooltypeid7                        1.00


In [18]:
# Filling all Null Values by -1
train = train.fillna(-1)
train.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,propertycountylandusecode_1720,propertycountylandusecode_1722,propertycountylandusecode_200,propertycountylandusecode_34,propertycountylandusecode_38,propertycountylandusecode_6050,propertycountylandusecode_73,propertycountylandusecode_8800,propertycountylandusecode_96,propertycountylandusecode_nan
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0


## Combining all the above steps of preprocessing into the Preproessor object

In [19]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None, datecols=None):
        
        self.cols_to_filter = cols_to_filter
        self.datecols = datecols
        self.was_fit = False
    
    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""
        
        self.was_fit = True
        
        # filter
        X_new = X.drop(self.cols_to_filter, axis=1)
        
        categorical_features = X_new.dtypes[X_new.dtypes == 'object'].index
        self.categorical_features = [x for x in categorical_features if 'date' not in x]
        
        dummied = pd.get_dummies(X_new, columns=self.categorical_features, dummy_na=True)
        self.colnames = dummied.columns
        del dummied
        
        return self
    
    def transform(self, X, y=None):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        
        if not self.was_fit:
            raise Error("need to fit preprocessor first")
        
        # filter
        X_new = X.drop(self.cols_to_filter, axis=1)

        # dummy code
        X_new = pd.get_dummies(X_new, columns=self.categorical_features, dummy_na=True)
        newcols = set(self.colnames) - set(X_new.columns)

        for x in newcols:
            X_new[x] = 0
            
        X_new = X_new[self.colnames]
                
        # fill NA after we dummy code
        X_new = X_new.fillna(-1)
        
        if self.datecols:
            for x in self.datecols:
                X_new[x + '_month'] = pd.to_datetime(X_new[x]).apply(lambda x: x.month)
                X_new[x + '_year'] = pd.to_datetime(X_new[x]).apply(lambda x: x.year)
                X_new = X_new.drop(x, axis=1)
        
        return X_new
    
    def fit_transform(self, X, y=None):
        """fit and transform wrapper method, used for sklearn pipeline"""

        return self.fit(X).transform(X)

In [22]:
train = df
y = df.pop('logerror')

In [23]:
p=preprocessor(cols_to_filter = ['rawcensustractandblock', 'censustractandblock', 'propertyzoningdesc', 
                                 'regionidneighborhood', 'regionidzip', 'parcelid'], datecols=['transactiondate'])
p.fit(train)

<__main__.preprocessor at 0x11b408e10>

In [24]:
train_transformed = p.transform(train)

In [26]:
train_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,propertycountylandusecode_73,propertycountylandusecode_8800,propertycountylandusecode_96,propertycountylandusecode_nan,fireplaceflag_True,fireplaceflag_nan,taxdelinquencyflag_Y,taxdelinquencyflag_nan,transactiondate_month,transactiondate_year
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
