# 2.0 Combine Data from ```prop.csv```

Data has now been filtered for the parcel numbers in the existing dataset. The end goal is to have a dataset with one row for each parcel on which to build the model. **Imputation will occur in a separate notebook!**

However, we can't forget to explore the existing data for trends or insight along the way, as well as engineer features as we see fit.

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np

# Load the survey table that every later cell merges features onto.
tci = pd.read_csv('original_data/tci.csv')
# Distinct parcel numbers present in the survey table.
ppns = set(tci['Parcel Number'])
def vacant(x):
    """Flag a survey category as a vacant structure.

    Returns 1 when ``x`` is 'Vacant Structure Open' or
    'Vacant Structure Secure', and 0 for every other value
    (including 'Vacant Lot' and missing values).
    """
    vacant_structures = ('Vacant Structure Open', 'Vacant Structure Secure')
    return 1 if x in vacant_structures else 0
# Binary flag: 1 for the two 'Vacant Structure ...' categories, 0 otherwise.
tci['vacant'] = tci.Category.apply(vacant)

t

In [54]:
# Positional rename of all six columns so the key is called 'parcel', matching
# the column name used in `prop`.
# NOTE(review): assumes 'Parcel Number' is the first column of tci.csv and that
# the frame has exactly these six columns in this order — confirm.
tci.columns = [u'parcel', u'House Number', u'Street Name', u'Category', u'Survey Date', u'vacant']

## Add data from property characteristics

In [55]:
# Property-characteristics records; a parcel can appear on several rows,
# which is why every feature below is reduced with groupby('parcel').
prop = pd.read_csv('clean_data/main_prop_tci.csv')
# Remove rows that are exact duplicates across all columns.
prop = prop.drop_duplicates()

In [56]:
# Look up the names of columns 5, 34 and 36.
# NOTE(review): presumably the columns pandas flagged with a mixed-dtype
# warning when reading the CSV — TODO confirm.
prop.columns[[5,34,36]]

Index([u'zip', u'glsflag', u'gtxyr'], dtype='object')

In [57]:
# Dtypes and non-null counts per column, to see which fields need cleaning.
prop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186908 entries, 0 to 186946
Data columns (total 65 columns):
parcel       186908 non-null object
taxyr        186856 non-null float64
pclass       186906 non-null object
nluc         184008 non-null float64
luc          186908 non-null object
zip          146654 non-null object
owner        186849 non-null object
propsize     182302 non-null object
front        186562 non-null float64
depth        184954 non-null float64
lotshape     152178 non-null object
totbldgs     186468 non-null float64
condition    164074 non-null object
units        163390 non-null float64
area         163992 non-null object
totusabl     181862 non-null object
yrbuilt      164060 non-null float64
style        160978 non-null object
rextwall     160959 non-null object
rooms        160930 non-null float64
bedrooms     160952 non-null float64
baths        160959 non-null float64
halfbath     160955 non-null float64
bval         185536 non-null object
lval         1

### Zip code

In [58]:
def filter_zip(x):
    """Normalise a zip value to the string form of its integer value.

    Anything ``int()`` accepts (e.g. 44104.0 or '44104') comes back as
    '44104'.  Values for which ``int()`` raises ValueError (e.g. '.')
    are returned unchanged.
    """
    try:
        return str(int(x))
    except ValueError:
        return x

In [59]:
# Normalise the non-null zips; assignment aligns on the index, so rows that
# were dropped by dropna() stay NaN.
prop['zip'] = prop['zip'].dropna().apply(filter_zip)
zips = set(prop['zip'].dropna())
# Discard the values that are not usable zip codes.
zips.difference_update(['.','0','4','441041'])
# One zip per parcel: the first of that parcel's rows whose zip is in `zips`.
prop_zips = prop[['parcel','zip']][prop['zip'].isin(zips)].groupby('parcel').first()

In [60]:
# Left merge on the parcel key (prop_zips is indexed by parcel): every tci row
# is kept, and parcels without a usable zip get NaN.
tci = pd.merge(tci, prop_zips, how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the merge.
print tci.shape

(13500, 7)


In [61]:
# Per zip: number of rows flagged vacant (sum) and total number of rows (len).
tci[['vacant','zip']].groupby('zip').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
zip,Unnamed: 1_level_2,Unnamed: 2_level_2
44104,475,3935
44105,237,1622
44106,3,20
44112,0,1
44114,0,1
44120,737,6637
44121,0,1
44126,1,11
44128,89,974
44129,1,1


### Property size

In [62]:
# Which Python types occur in propsize; mixed types mean it needs parsing.
set(prop['propsize'].apply(type))

{float, str}

In [63]:
def parse_comma(x):
    """Convert a comma-formatted number to an int.

    * ``str`` values have their commas removed and are passed to ``int()``
      (e.g. '8,155' -> 8155); a string that is still not an integer after
      that raises ValueError.
    * ``float`` values, including float subclasses such as ``numpy.float64``,
      are truncated with ``int()`` unless they are NaN, which is returned
      unchanged so missing values survive.
    * Any other value is returned unchanged.
    """
    # isinstance (rather than an exact type() comparison) so that str/float
    # subclasses such as numpy.float64 are converted too.
    if isinstance(x, str):
        return int(x.replace(',', ''))
    if isinstance(x, float) and not np.isnan(x):
        return int(x)
    return x

In [64]:
# Parse the comma-formatted sizes into numbers.
prop['propsize'] = prop['propsize'].apply(parse_comma)
# Sorting descending before groupby(...).first() keeps the largest recorded
# propsize for each parcel.
# NOTE(review): DataFrame.sort is the legacy spelling; current pandas releases
# only provide sort_values.
parcel_size = prop[prop['propsize'].notnull()][['parcel','propsize']].sort('propsize', ascending=False).groupby('parcel').first()

In [65]:
# Attach propsize by parcel; parcels with no non-null propsize get NaN.
tci = pd.merge(tci, parcel_size, how='left', left_on='parcel', right_index=True)
print tci.head()

       parcel House Number  Street Name            Category Survey Date  \
0  121-31-003         2427  WOODHILL RD  Occupied Structure  2014-03-18   
1  121-31-005          NaN          NaN          Vacant Lot  2014-03-18   
2  121-31-008          NaN          NaN          Vacant Lot  2014-03-18   
3  121-31-012         2458   BALDWIN RD          Vacant Lot  2014-03-18   
4  121-31-013         8685   BALDWIN RD          Vacant Lot  2014-03-18   

   vacant    zip  propsize  
0       0  44104      8155  
1       0    NaN       NaN  
2       0  44104      3605  
3       0  44104      5544  
4       0  44104         0  


In [66]:
# Number of rows missing BOTH propsize and zip. This alone does not show how
# many rows are missing only one of the two.
print len(tci[tci['propsize'].isnull() & tci['zip'].isnull()])

253


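So 253 parcels are missing both a property size and a zip code. Every parcel that is missing a property size is also missing a zip code (and a pclass, which is merged in below), provided the number of parcels missing a property size, `tci['propsize'].isnull().sum()`, is also 253.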

### pclass

In [67]:
# Distinct pclass values (NaN included).
set(prop['pclass'])

{nan,
 'Commerc. Exempt',
 'Commercial',
 'Exempt',
 'Highway',
 'Industrial',
 'Land bank',
 'Residential',
 'Residential Exempt'}

In [68]:
# One pclass per parcel: the first non-null value among that parcel's rows.
pclass = prop[prop['pclass'].notnull()][['parcel','pclass']].groupby('parcel').first()
tci = pd.merge(tci, pclass, how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 9)


In [69]:
# Per pclass: number of rows flagged vacant (sum) and total number of rows (len).
tci[['vacant','pclass']].groupby('pclass').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
Commerc. Exempt,0,5
Commercial,9,172
Exempt,8,318
Industrial,0,4
Land bank,3,501
Residential,1511,12122
Residential Exempt,12,123


### Total usable area

In [70]:
# Same treatment as propsize: parse the comma-formatted values, then keep the
# largest totusabl recorded for each parcel (descending sort + first()).
# NOTE(review): DataFrame.sort is the legacy spelling; current pandas releases
# only provide sort_values.
prop['totusabl'] = prop['totusabl'].apply(parse_comma)
usable_area = prop[prop['totusabl'].notnull()][['parcel','totusabl']].sort('totusabl', ascending=False).groupby('parcel').first()
tci = pd.merge(tci, usable_area, how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 10)


### Total Market Value

In [71]:
# One value per parcel: the median tmktval across all of that parcel's rows
# (not restricted to a single taxyr, unlike the features merged further down).
tmktval = prop[['tmktval','parcel']].groupby('parcel').median()
tci = pd.merge(tci, tmktval, how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 11)


In [72]:
# Preview the columns merged so far (zip, propsize, pclass, totusabl, tmktval).
tci.head()

Unnamed: 0,parcel,House Number,Street Name,Category,Survey Date,vacant,zip,propsize,pclass,totusabl,tmktval
0,121-31-003,2427.0,WOODHILL RD,Occupied Structure,2014-03-18,0,44104.0,8155.0,Residential,0.0,2600.0
1,121-31-005,,,Vacant Lot,2014-03-18,0,,,,,
2,121-31-008,,,Vacant Lot,2014-03-18,0,44104.0,3605.0,Residential,0.0,2800.0
3,121-31-012,2458.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,5544.0,Residential,1584.0,2550.0
4,121-31-013,8685.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,0.0,Residential,0.0,3400.0


### Condition

In [73]:
# Distinct condition labels (NaN included); these drive the ordinal mapping below.
set(prop['condition'])

{nan,
 'Average',
 'Fair',
 'Good',
 'Poor',
 'Sound value (c)',
 'Unsound',
 'Very good',
 'Very poor'}

In [74]:
# Preview: condition per parcel using only the rows with taxyr == 2013;
# first() yields NaN when none of a parcel's 2013 rows has a condition.
prop[prop.taxyr==2013][['parcel','condition']].groupby('parcel').first().head()

Unnamed: 0_level_0,condition
parcel,Unnamed: 1_level_1
121-31-003,
121-31-008,
121-31-012,
121-31-013,
121-31-015,Average


In [75]:
# Attach the taxyr == 2013 condition label by parcel (left merge keeps all tci rows).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','condition']].groupby('parcel').first(), how='left', left_on='parcel', right_index=True)

In [76]:
# Row count should be unchanged; one column added.
tci.shape

(13500, 12)

In [77]:
# Ordinal encoding of the condition labels, in the order used for the mapping
# below (0 = first entry, 7 = last entry):
# unsound
# very poor
# poor
# fair
# average
# good
# very good
# sound

condition_value = {'Unsound': 0, 'Very poor': 1, 'Poor': 2, 'Fair': 3, 'Average': 4, \
                   'Good': 5, 'Very good': 6, 'Sound value (c)': 7}
# Map only the non-null labels; rows without a condition keep NaN in
# condition_value. A label missing from the dict would raise KeyError.
prop.loc[prop['condition'].notnull(),'condition_value'] = prop.loc[prop['condition'].notnull(),'condition'].apply(lambda x: condition_value[x])
# Attach the taxyr == 2013 encoded condition by parcel.
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','condition_value']].groupby('parcel').first() \
               , how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 13)


In [78]:
# Residential parcels only: vacant count (sum) and row count (len) per condition_value.
tci.loc[tci.pclass=='Residential'][['vacant','condition_value']].groupby('condition_value').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
condition_value,Unnamed: 1_level_2,Unnamed: 2_level_2
0,81,123
1,126,219
2,330,909
3,586,3982
4,333,4531
5,36,857
6,2,25


In [79]:
# All parcels (no pclass filter): row count and vacant count per condition_value.
t = tci[['condition_value','vacant']].groupby('condition_value').agg([len, sum])
# Vacancy rate = vacant rows / all rows in each condition_value group.
t.vacant['sum']/t.vacant['len']

condition_value
0                  0.648000
1                  0.549784
2                  0.360814
3                  0.145607
4                  0.074114
5                  0.041712
6                  0.074074
7                  0.000000
dtype: float64

### Housing style

In [80]:
# Style labels that are kept as-is; get_style maps everything else to NaN.
styles = ['Bungalow','Cape Cod','Colonial','Ranch']

def get_style(x):
    """Return ``x`` when it is one of ``styles``; otherwise return NaN."""
    return x if x in styles else np.nan

# Keep only the style labels listed in `styles`; every other value becomes NaN.
prop['style_filtered'] = prop['style'].apply(get_style)

In [81]:
# Attach the taxyr == 2013 filtered style by parcel (first non-null per parcel).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','style_filtered']].groupby('parcel').first() \
               , how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 14)


In [82]:
# Residential parcels only: vacant count (sum) and row count (len) per style.
tci.loc[tci.pclass=='Residential'][['vacant','style_filtered']].groupby('style_filtered').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
style_filtered,Unnamed: 1_level_2,Unnamed: 2_level_2
Bungalow,35,218
Cape Cod,197,1526
Colonial,1216,8513
Ranch,45,354


In [83]:
# Overall share of rows flagged vacant, as a baseline for the per-group rates;
# float() forces true division.
sum(tci.vacant)/float(len(tci))

0.11518518518518518

### Owner occupancy

In [84]:
def get_own(x):
    """Encode an owner-occupancy label: 'Yes' -> 1, 'No' -> 0, anything else -> NaN."""
    if x == 'Yes':
        return 1
    return 0 if x == 'No' else np.nan
    
# Numeric owner-occupancy flag: 1 for 'Yes', 0 for 'No', NaN for anything else.
prop['ownerocc_value'] = prop['ownerocc'].apply(get_own)

In [85]:
# Attach the taxyr == 2013 owner-occupancy flag by parcel (first non-null per parcel).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','ownerocc_value']].groupby('parcel').first(), \
               how='left', left_on='parcel', right_index=True)

In [86]:
# Row count should be unchanged; one column added.
tci.shape

(13500, 15)

In [87]:
# Row count and vacant count per owner-occupancy flag (rows with NaN flag are
# excluded by groupby).
t = tci[['ownerocc_value','vacant']].groupby('ownerocc_value').agg([len, sum])
# Vacancy rate = vacant rows / all rows in each group.
t.vacant['sum']/t.vacant['len']

ownerocc_value
0                 0.148204
1                 0.084161
dtype: float64

### Total buildings

In [88]:
# Attach the taxyr == 2013 totbldgs value by parcel (first non-null per parcel).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','totbldgs']].groupby('parcel').first(), \
               how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 16)


In [89]:
# Preview the table with the condition, style, owner-occupancy and totbldgs columns added.
tci.head()

Unnamed: 0,parcel,House Number,Street Name,Category,Survey Date,vacant,zip,propsize,pclass,totusabl,tmktval,condition,condition_value,style_filtered,ownerocc_value,totbldgs
0,121-31-003,2427.0,WOODHILL RD,Occupied Structure,2014-03-18,0,44104.0,8155.0,Residential,0.0,2600.0,,,,0.0,
1,121-31-005,,,Vacant Lot,2014-03-18,0,,,,,,,,,,
2,121-31-008,,,Vacant Lot,2014-03-18,0,44104.0,3605.0,Residential,0.0,2800.0,,,,0.0,0.0
3,121-31-012,2458.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,5544.0,Residential,1584.0,2550.0,,,,0.0,0.0
4,121-31-013,8685.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,0.0,Residential,0.0,3400.0,,,,0.0,


### Year built

In [90]:
def get_year(x):
    """Coerce a year-built value to an int, or NaN when that is not possible.

    Returns ``int(x)`` when the conversion succeeds.  Returns NaN when
    ``int()`` rejects the value: ValueError (NaN floats, non-numeric
    strings), TypeError (e.g. None) or OverflowError (infinite floats).
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # Anything that cannot be read as an integer year is treated as missing.
        return np.nan

In [91]:
# Coerce yrbuilt to integer years; values int() cannot convert become NaN.
prop['yrbuilt_filtered'] = prop['yrbuilt'].apply(get_year)

In [92]:
# Attach the taxyr == 2013 year built by parcel (first non-null per parcel).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','yrbuilt_filtered']].groupby('parcel').first(), \
               how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
tci.shape

(13500, 17)

### Exterior Wall

In [93]:
# Attach the taxyr == 2013 rextwall value by parcel, unmodified (first non-null per parcel).
tci = pd.merge(tci, prop[prop.taxyr==2013][['parcel','rextwall']].groupby('parcel').first(), \
               how='left', left_on='parcel', right_index=True)
# Row count should be unchanged by the left merge.
print tci.shape

(13500, 18)


In [94]:
# Preview the final combined table before saving.
tci.head()

Unnamed: 0,parcel,House Number,Street Name,Category,Survey Date,vacant,zip,propsize,pclass,totusabl,tmktval,condition,condition_value,style_filtered,ownerocc_value,totbldgs,yrbuilt_filtered,rextwall
0,121-31-003,2427.0,WOODHILL RD,Occupied Structure,2014-03-18,0,44104.0,8155.0,Residential,0.0,2600.0,,,,0.0,,,
1,121-31-005,,,Vacant Lot,2014-03-18,0,,,,,,,,,,,,
2,121-31-008,,,Vacant Lot,2014-03-18,0,44104.0,3605.0,Residential,0.0,2800.0,,,,0.0,0.0,,
3,121-31-012,2458.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,5544.0,Residential,1584.0,2550.0,,,,0.0,0.0,,
4,121-31-013,8685.0,BALDWIN RD,Vacant Lot,2014-03-18,0,44104.0,0.0,Residential,0.0,3400.0,,,,0.0,,,


In [95]:
# Number of prop rows with taxyr == 2013, the subset most features above were
# taken from; compare with the tci row count to gauge coverage.
prop[prop.taxyr==2013].shape

(13344, 69)

In [96]:
# Save the combined table; index=False keeps the row index out of the CSV.
tci.to_csv('model_data/tci_with_prop.csv', index=False)