# Preprocessing

In [22]:
import pandas as pd

In [23]:
import sys
import inspect
sys.path.insert(0, './modules')


# now read in the new function
from HelperFunctions import get_data
print(inspect.getsource(get_data))

def get_data(dset):
    
    """Create the training dataset (2016) or the test dataset (2017)

    Keyword arguments:
    dset -- a string in {train, test}
    
    Returns:
    a tuple of pandas dataframe (X) and pandas series (y)
    """
    
    year = {'train':2016, 'test':2017}[dset]
    
    train = read_in_dataset('train_{0}'.format(year))
    properties = read_in_dataset('properties_{0}'.format(year))
    merged = merge_dataset(train, properties)
    
    if dset == 'train':
        merged = filter_duplicate_parcels(merged)
    
    y = merged.pop('logerror')
    return merged, y



In [24]:
X_train,y_train = get_data(dset='train')

In [25]:
X_train.head()

Unnamed: 0,parcelid,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,2016-01-01,1.0,,,2.0,3.0,,4.0,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,2016-01-01,,,,3.5,4.0,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,2016-01-01,1.0,,,3.0,2.0,,4.0,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,2016-01-02,1.0,,,2.0,2.0,,4.0,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,2016-01-02,,,,2.5,4.0,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


## Create the preprocessor object

In [27]:
#The Skeleton used 
class preprocessor:
    
    def __init__(self, cols_to_filter=None):
        
        self.cols_to_filter = cols_to_filter
    
    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        
        return self
    
    def transform(self, X, y=None):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        return X_new

### Filter column

In [28]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None):
        
        self.cols_to_filter = cols_to_filter
    
    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        
        return self
    
    def transform(self, X, y=None):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        
        #filter
        X_new = X.drop(self.cols_to_filter, axis=1)
        
        return X_new

In [29]:
#removing parcelid by using preprocessor 
p=preprocessor(cols_to_filter = ['parcelid'])
p.transform(X_train).head()

Unnamed: 0,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,2016-01-01,1.0,,,2.0,3.0,,4.0,2.0,,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,2016-01-01,,,,3.5,4.0,,,3.5,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,2016-01-01,1.0,,,3.0,2.0,,4.0,3.0,,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,2016-01-02,1.0,,,2.0,2.0,,4.0,2.0,,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,2016-01-02,,,,2.5,4.0,,,2.5,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


### Deal with Datetime Columns

In [30]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None, datecols=None):
        
        self.cols_to_filter = cols_to_filter
        self.datecols = datecols
    
    def fit(self, X):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        
        return self
    
    def transform(self, X):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        
        X_new = X.drop(self.cols_to_filter, axis=1)
        
        if self.datecols:
            for x in self.datecols:
                X_new[x + '_month'] = pd.to_datetime(X_new[x]).apply(lambda x: x.month)
                X_new[x + '_year'] = pd.to_datetime(X_new[x]).apply(lambda x: x.year)
                X_new = X_new.drop(x, axis=1)
        
        return X_new

In [31]:
p=preprocessor(cols_to_filter = ['parcelid'], datecols=['transactiondate'])
X_train_transformed = p.transform(X_train)

In [32]:
X_train_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_month,transactiondate_year
0,1.0,,,2.0,3.0,,4.0,2.0,,,...,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0,1,2016
1,,,,3.5,4.0,,,3.5,,,...,346458.0,585529.0,2015.0,239071.0,10153.02,,,,1,2016
2,1.0,,,3.0,2.0,,4.0,3.0,,,...,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0,1,2016
3,1.0,,,2.0,2.0,,4.0,2.0,,,...,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0,1,2016
4,,,,2.5,4.0,,,2.5,,,...,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0,1,2016


## Define the imputation strategy

In [33]:
X_train.loc[:, X_train.isna().sum() > 0].min()

airconditioningtypeid                    1
architecturalstyletypeid                 2
basementsqft                           100
buildingclasstypeid                      4
buildingqualitytypeid                    1
calculatedbathnbr                        1
decktypeid                              66
finishedfloor1squarefeet                44
calculatedfinishedsquarefeet             2
finishedsquarefeet12                     2
finishedsquarefeet13                  1056
finishedsquarefeet15                   560
finishedsquarefeet50                    44
finishedsquarefeet6                    257
fireplacecnt                             1
fullbathcnt                              1
garagecarcnt                             0
garagetotalsqft                          0
hashottuborspa                        True
heatingorsystemtypeid                    1
lotsizesquarefeet                      167
poolcnt                                  1
poolsizesum                             28
pooltypeid1

SO, we can impute a -1 in place of missing value. Since, in training data there is no value less than ZERO.

In [34]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None, datecols=None):
        
        self.cols_to_filter = cols_to_filter
        self.datecols = datecols
    
    def fit(self, X):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        
        return self
    
    def transform(self, X):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        
        X_new = X.drop(self.cols_to_filter, axis=1)
        X_new = X_new.fillna(-1)
        
        if self.datecols:
            for x in self.datecols:
                X_new[x + '_month'] = pd.to_datetime(X_new[x]).apply(lambda x: x.month)
                X_new[x + '_year'] = pd.to_datetime(X_new[x]).apply(lambda x: x.year)
                X_new = X_new.drop(x, axis=1)
        
        return X_new

In [35]:
p=preprocessor(cols_to_filter = ['parcelid'], datecols=['transactiondate'])
X_train_transformed = p.transform(X_train)

In [36]:
X_train_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_month,transactiondate_year
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,122754.0,360170.0,2015.0,237416.0,6735.88,-1,-1.0,60371070000000.0,1,2016
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,346458.0,585529.0,2015.0,239071.0,10153.02,-1,-1.0,-1.0,1,2016
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,61994.0,119906.0,2015.0,57912.0,11484.48,-1,-1.0,60374640000000.0,1,2016
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,171518.0,244880.0,2015.0,73362.0,3048.74,-1,-1.0,60372960000000.0,1,2016
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,169574.0,434551.0,2015.0,264977.0,5488.96,-1,-1.0,60590420000000.0,1,2016


In [37]:
assert all(train_X_transformed.isna().sum() == 0) # nice!

## Encoding Categorical/Discrete Features

we can encode each categorical feature with a set of binary features in a process called "Dummy Coding."

In [68]:
class preprocessor:
    
    def __init__(self, cols_to_filter=None, datecols=None):
        
        self.cols_to_filter = cols_to_filter
        self.datecols = datecols
        self.was_fit = False
    
    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""
        
        # learn from the training data and return the class itself. 
        # allows you to chain fit and predict methods like 
        
        # > p = preprocessor()
        # > p.fit(X).transform(X)
        self.was_fit = True
        X_new = X.drop(self.cols_to_filter, axis=1)
        categorical_features = X_new.dtypes[X_new.dtypes == 'object'].index
        self.categorical_features = [x for x in categorical_features if 'date' not in x]
        
        dummies = pd.get_dummies(X_new, columns = self.categorical_features, dummy_na = True)
        self.colnames = dummies.columns
        del dummies
        return self
    
    def transform(self, X):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        
        if not self.was_fit:
            raise Error("need to fit preprocessor first")
        
        #filter
        X_new = X.drop(self.cols_to_filter, axis=1)
        
        X_new = pd.get_dummies(X_new, columns=self.categorical_features, dummy_na=True)
        newcols = set(self.colnames) - set(X_new.columns)
        for x in newcols:
            X_new[x] = 0
        X_new = X_new[self.colnames]
        
        #fill na after dummy code
        X_new = X_new.fillna(-1)
        
        if self.datecols:
            for x in self.datecols:
                X_new[x + '_month'] = pd.to_datetime(X_new[x]).apply(lambda x: x.month)
                X_new[x + '_year'] = pd.to_datetime(X_new[x]).apply(lambda x: x.year)
                X_new = X_new.drop(x, axis=1)
        
        return X_new
    def fit_transform(self, X, y= None):
        #fit and tansform method
        return self.fit(X).transform(X)

In [69]:
p=preprocessor(cols_to_filter = ['rawcensustractandblock', 'censustractandblock', 'propertyzoningdesc', 
                                 'regionidneighborhood', 'regionidzip', 'parcelid'], datecols=['transactiondate'])
p.fit(X_train)

<__main__.preprocessor at 0x195f920f7c8>

In [70]:
X_train_transformed = p.transform(X_train)

In [71]:
train_X_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_month,transactiondate_year
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,122754.0,360170.0,2015.0,237416.0,6735.88,-1,-1.0,60371070000000.0,1,2016
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,346458.0,585529.0,2015.0,239071.0,10153.02,-1,-1.0,-1.0,1,2016
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,61994.0,119906.0,2015.0,57912.0,11484.48,-1,-1.0,60374640000000.0,1,2016
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,171518.0,244880.0,2015.0,73362.0,3048.74,-1,-1.0,60372960000000.0,1,2016
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,169574.0,434551.0,2015.0,264977.0,5488.96,-1,-1.0,60590420000000.0,1,2016


In [72]:
assert all(train_X_transformed.isna().sum() == 0) # nice!