# 2.0 Combine Data from ```prop.csv```

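The data have now been filtered to the parcel numbers in the existing dataset. The end goal is a dataset with one row for each parcel, on which to build the model. **Imputation will occur in a separate notebook!** (Note that the merge loop below already applies a first-pass fill of missing values using neighborhood medians.)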

However, we should not forget to explore the existing data for trends and insights along the way, and to engineer features as we see fit.

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# render matplotlib figures inline in the notebook output
%matplotlib inline
# use seaborn's 'notebook' plotting context for all figures
sns.set_context('notebook')

# Project root: the current working directory with its last two path
# components removed.
path = '/'.join(os.getcwd().split('/')[:-2])
print(path)

# Read tci_1_0.csv (parsing `Date` as datetimes) and keep only the columns
# this notebook works with.
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = pd.read_csv(
    path+'/data/model_data/tci_1_0.csv',
    parse_dates=['Date'],
    dtype={'PIN': str},
)[cols]

/Volumes/Dropbox/largetransfer/luc/carter


In [3]:
# (rows, columns) of the survey frame after the column selection above
tci.shape

(113132, 6)

In [4]:
# Dates to build a dataset for: 23 timestamps, each three month-starts after
# the previous one, beginning on 1 October 2009.
rng = pd.date_range(start='10/1/2009', periods=23, freq='3MS')

In [5]:
# Residential characteristics
res = pd.read_csv(path+'/data/clean_data/res.csv')

# Ordinal code for each quality label found in the `cqual` column
# (1 = 'Very poor' ... 12 = 'Excellent+').
cquals = {'Very poor': 1, 'Very poor+': 2, 'Poor': 3, 'Poor+': 4, 'Average': 5, 'Average+': 6,
          'Good': 7, 'Good+': 8, 'Very good': 9, 'Very good+': 10, 'Excellent': 11, 'Excellent+': 12}
# quality
# An unmapped label is an error (KeyError), exactly as a direct dict lookup
# would be; checking up front lets the encoding itself be a vectorized map.
unknown_cquals = set(res['cqual'].dropna().unique()) - set(cquals)
if unknown_cquals:
    raise KeyError('unmapped cqual labels: %s' % list(unknown_cquals))
# Series.map leaves missing `cqual` values as NaN in `cqual_num`.
res['cqual_num'] = res['cqual'].map(cquals)
# occupied
occup_dict = {'1 family':1, '2 family':2,'3 family':3,'4+family':4, '/':1}
def occup_fun(x):
    """Return the numeric code for occupancy label ``x``.

    Labels listed in ``occup_dict`` map to their stored value; every other
    value (including missing ones) is coded as 1.
    """
    return occup_dict.get(x, 1)
# Recode the occupancy labels to numbers, then key the table by parcel.
res = res.assign(occup=res['occup'].apply(occup_fun)).set_index('parcel')

In [6]:
## THINGS THAT CAN BE DONE TO ENTIRE DATASETS
# Property characteristics: read the file and discard rows that are exact
# duplicates of an earlier row.
prop = pd.read_csv(path+'/data/clean_data/main_prop_all_years.csv').drop_duplicates()

def parse_comma(x):
    """Convert a comma-grouped number to ``int`` where possible.

    Parameters
    ----------
    x : object
        A string such as ``'1,234'``, a float, or any other value.

    Returns
    -------
    object
        ``int`` for strings (after removing commas) and for non-NaN floats;
        NaN and values of any other type are returned unchanged.

    Raises
    ------
    ValueError
        If a string is not an integer literal once its commas are removed.
    """
    # isinstance (rather than an exact type comparison) also covers subclasses
    # such as numpy.str_ and numpy.float64, which would otherwise slip through
    # unconverted.
    if isinstance(x, str):
        return int(x.replace(',', ''))
    if isinstance(x, float) and not np.isnan(x):
        return int(x)
    return x
# Property size, total usable area: convert comma-grouped strings to integers
for size_col in ['propsize', 'totusabl']:
    prop[size_col] = prop[size_col].apply(parse_comma)

# Condition value: ordinal code for each condition label
condition_value = {
    'Unsound': 0, 'Sound value (c)': 1, 'Very poor': 2, 'Poor': 3, 'Fair': 4,
    'Average': 5, 'Good': 6, 'Very good': 7, 'Excellent': 8,
}

def _condition_code(label):
    """Look up the code for ``label``; a falsy label (e.g. '') gets 5 ('Average')."""
    return condition_value[label] if label else 5

# no need to sort by neighborhood, all have median of average
# Only rows with a non-null condition are coded; the rest stay NaN in cond_val.
has_condition = prop['condition'].notnull()
prop['cond_val'] = prop.loc[has_condition, 'condition'].apply(_condition_code)

# style
styles = ['Bungalow','CAP','Colonial','Ranch']
def get_style(x):
    """Return ``x`` when it is one of ``styles``; otherwise return 'Other'."""
    return x if x in styles else 'Other'
# every style outside the `styles` list (missing values included) becomes 'Other'
prop['style_filtered'] = prop['style'].apply(get_style)

# owner occupied value, median value for all neighborhoods is 1
def get_own(x):
    """Encode the owner-occupied flag: 0 for 'No', 1 for 'Yes' and for anything else."""
    return 0 if x == 'No' else 1
# 1/0 owner-occupancy flag; any value other than 'No' (missing included) becomes 1
prop['ownerocc_value'] = prop['ownerocc'].apply(get_own)
# year built
def get_year(x):
    """Convert ``x`` to an integer year, or NaN when that is not possible.

    Parameters
    ----------
    x : object
        Raw year-built value (number, numeric string, or missing).

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # ValueError: non-numeric strings and NaN; TypeError: None and other
    # non-numeric objects; OverflowError: +/-inf. None of these should abort
    # the whole column conversion.
    except (ValueError, TypeError, OverflowError):
        return np.nan
# integer year built; values that cannot be converted become NaN
prop['yrbuilt_filtered'] = prop['yrbuilt'].apply(get_year)

# key the table by parcel and order its rows by tax year
prop = prop.set_index('parcel').sort_values('taxyr')

In [7]:
# Column selections and fill rules shared by every date in `rng`.
res_cols = ['cqual_num','occup','rnumstor']
prop_cols = ['propsize','totusabl','tmktval','mktbldg','mktland','cond_val',
             'style_filtered','ownerocc_value','totbldgs','yrbuilt_filtered']
# numeric columns whose gaps are filled with the neighborhood (SPA_NAME) median
median_cols = ['propsize','totusabl','tmktval','mktbldg','mktland','yrbuilt_filtered',
               'cqual_num','occup','rnumstor','cond_val']
# columns in which a recorded 0 is treated as missing as well
zero_missing_cols = ['propsize','totusabl','tmktval','mktbldg','mktland']

# for each date
for date in rng:
    print(date)
    # characteristics come from the year preceding the date's calendar year
    data_year = date.year - 1
    # merge residential characteristics
    df = pd.merge(tci, res.loc[res.year == data_year, res_cols],
                  how='left', left_on='parcel', right_index=True)
    # merge property characteristics
    df = pd.merge(df, prop.loc[prop.taxyr == data_year, prop_cols],
                  how='left', left_on='parcel', right_index=True)
    # fill null values with medians from neighborhood
    for var in median_cols:
        zero_is_missing = var in zero_missing_cols
        # Values that may contribute to the median: non-negative ones, and
        # strictly positive ones where 0 is itself a placeholder. Including
        # the zeros that are about to be replaced would drag the median
        # toward 0 (and can make the replacement value 0 again).
        usable = (df[var] > 0) if zero_is_missing else (df[var] >= 0)
        # get the medians by neighborhood
        median_val = df.loc[usable].groupby('SPA_NAME')[var].median()
        # Per-row neighborhood median; NaN where the neighborhood has no
        # usable value, so such rows stay null instead of raising KeyError.
        fill = df['SPA_NAME'].map(median_val)
        # apply medians to null (and, where applicable, zero) values
        needs_fill = df[var].isnull()
        if zero_is_missing:
            needs_fill = needs_fill | (df[var] == 0)
        # .values makes the assignment positional, independent of index labels
        df.loc[needs_fill, var] = fill[needs_fill].values
    # Owner occupancy
    df['ownerocc_value'] = df['ownerocc_value'].fillna(1)
    # Style
    df['style_filtered'] = df['style_filtered'].fillna('Other')
    # Total buildings
    df['totbldgs'] = df['totbldgs'].fillna(1)
    # Output: one CSV per date, named by year and month
    df.to_csv(path+'/data/model_data/tci_2_0_'+str(date.year)+'_'+str(date.month)+'.csv', index=False)

2009-10-01 00:00:00
2010-01-01 00:00:00
2010-04-01 00:00:00
2010-07-01 00:00:00
2010-10-01 00:00:00
2011-01-01 00:00:00
2011-04-01 00:00:00
2011-07-01 00:00:00
2011-10-01 00:00:00
2012-01-01 00:00:00
2012-04-01 00:00:00
2012-07-01 00:00:00
2012-10-01 00:00:00
2013-01-01 00:00:00
2013-04-01 00:00:00
2013-07-01 00:00:00
2013-10-01 00:00:00
2014-01-01 00:00:00
2014-04-01 00:00:00
2014-07-01 00:00:00
2014-10-01 00:00:00
2015-01-01 00:00:00
2015-04-01 00:00:00


In [8]:
# `df` is the frame left over from the final loop iteration; check its
# dtypes and confirm that no column still contains nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 19 columns):
parcel              113132 non-null object
vacant              113132 non-null int64
Date                113132 non-null datetime64[ns]
Survey Category     113132 non-null object
Survey Grade        113132 non-null object
SPA_NAME            113132 non-null object
cqual_num           113132 non-null float64
occup               113132 non-null float64
rnumstor            113132 non-null float64
propsize            113132 non-null float64
totusabl            113132 non-null float64
tmktval             113132 non-null float64
mktbldg             113132 non-null float64
mktland             113132 non-null float64
cond_val            113132 non-null float64
style_filtered      113132 non-null object
ownerocc_value      113132 non-null float64
totbldgs            113132 non-null float64
yrbuilt_filtered    113132 non-null float64
dtypes: datetime64[ns](1), float64(12), int64(1), obj