# Kaggle Zillow Preprocessing Final

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

%matplotlib inline

### Import Data

In [3]:
df_train = pd.read_csv('../data/train_2016_v2.csv')
prop = pd.read_csv('../data/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_train = pd.merge(df_train, prop, on='parcelid', how='left')

sample.rename(index=str, columns={'ParcelId': 'parcelid'}, inplace=True)
df_test = sample.merge(prop, on='parcelid', how='left')

print(df_test.shape)
print(df_train.shape)

(2985217, 64)
(90275, 60)


## Create new features

### Create features from 'transactiondate'

In [None]:
df_train['transactiondate'] =  pd.to_datetime(df_train['transactiondate'])
df_train['transaction_year'] = df_train.transactiondate.dt.year.astype(np.int16)
df_train['transaction_month'] = df_train.transactiondate.dt.month.astype(np.int8)
df_train['transaction_day'] = df_train.transactiondate.dt.weekday.astype(np.int8)

### Create features from 'rawcensustractandblock'
BLOCKID:  15-character code that is the concatenation of fields consisting of the 2-character state FIPS code, the 3-character county FIPS code, the 6-character census tract code, and the 4-character tabulation block code.

In [None]:
df_train['rawcensustractandblock_states'] = df_train.rawcensustractandblock.astype(str).apply(lambda x: x[:1]).astype(np.int8)
df_train['rawcensustractandblock_countries'] = df_train.rawcensustractandblock.astype(str).apply(lambda x: x[1:4]).astype(np.int8)
df_train['rawcensustractandblock_tracts'] = df_train.rawcensustractandblock.astype(str).apply(lambda x: x[4:11]).astype(np.float64)
df_train['rawcensustractandblock_blocks'] = df_train.rawcensustractandblock.astype(str).apply(lambda x: x[11:]).astype(np.int8)

### Create extra features

In [None]:
#--- how old is the house? ---
df_train['house_age'] = 2017 - df_train['yearbuilt']

#--- how many rooms are there? ---  
df_train['tot_rooms'] = df_train['bathroomcnt'] + df_train['bedroomcnt']

#--- does the house have A/C? ---
df_train['AC'] = np.where(df_train['airconditioningtypeid']>0, 1, 0)

#--- Does the house have a deck? ---
df_train['deck'] = np.where(df_train['decktypeid']>0, 1, 0)
df_train.drop('decktypeid', axis=1, inplace=True)

#--- does the house have a heating system? ---
df_train['heating_system'] = np.where(df_train['heatingorsystemtypeid']>0, 1, 0)

#--- does the house have a garage? ---
df_train['garage'] = np.where(df_train['garagecarcnt']>0, 1, 0)

#--- does the house come with a patio? ---
df_train['patio'] = np.where(df_train['yardbuildingsqft17']>0, 1, 0)

#--- does the house have a pool?
df_train['pooltypeid10'] = df_train.pooltypeid10.astype(np.int8)
df_train['pooltypeid7'] = df_train.pooltypeid7.astype(np.int8)
df_train['pooltypei2'] = df_train.pooltypeid2.astype(np.int8)
df_train['pool'] = df_train['pooltypeid10'] | df_train['pooltypeid7'] | df_train['pooltypeid2'] 

#--- does the house have all of these? -> spa/hot-tub/pool, A/C, heating system , garage, patio
df_train['exquisite'] = df_train['pool'] + df_train['patio'] + df_train['garage'] + df_train['heating_system'] + df_train['AC'] 

#--- Features based on location ---
df_train['x_loc'] = np.cos(df_train['latitude']) * np.cos(df_train['longitude'])
df_train['y_loc'] = np.cos(df_train['latitude']) * np.sin(df_train['longitude'])
df_train['z_loc'] = np.sin(df_train['latitude'])


### MEMORY CONSUMPTION
#### Let us look into the memory consumption of our dataframe and see if we can reduce it efficiently.

In [None]:
#--- Memory usage of entire dataframe ---
mem = df_train.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

In [None]:
#--- List of columns that cannot be reduced in terms of memory size ---
count = 0
for col in df_train.columns:
    if df_train[col].dtype == object:
        count+=1
        print (col)
print('There are {} columns that cannot be reduced'.format(count))   

In [None]:
count = 0
for col in df_train.columns:
    if df_train[col].dtype != object:
        if((col != 'logerror')|(col != 'yearbuilt')|(col != 'xloc')|(col != 'yloc')|(col != 'zloc')):
            if ((df_train[col].max() < 255) & (df_train[col].min() > -255)):
                count+=1
                df_train[col] = df_train[col].astype(np.int8)
                print (col)
            if ((df_train[col].max() > 255) & (df_train[col].min() > -255)
               & (df_train[col].max() < 65535) & (df_train[col].min() > 0)):
                count+=1
                df_train[col] = df_train[col].astype(np.int16)
                print (col)
            if ((df_train[col].max() > 65535) & (df_train[col].min() > 0)
               & (df_train[col].max() < 4294967295) & (df_train[col].min() > 0)):
                count+=1
                df_train[col] = df_train[col].astype(np.int8)
                print (col)
print('There are {} columns reduced'.format(count)) 

In [None]:
#--- Let us check the memory consumed again ---
mem = df_train.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")