# 2.0 Combine Data from `prop.csv`

The data has now been filtered to the parcel numbers present in the existing dataset. The end goal is a dataset with one row per parcel on which to build the model. **Imputation will occur in a separate notebook!**

However, we can't forget to explore the existing data for trends or insight along the way, as well as engineer features as we see fit.

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

%matplotlib inline
sns.set_context('notebook')

# Project root = parent of the notebook's working directory.
# os.path.dirname is used instead of splitting on '/' so the result is also
# correct when the working directory uses a different path separator.
path = os.path.dirname(os.getcwd())
print(path)

/Volumes/Dropbox/largetransfer/luc/carter


In [26]:
# Load the survey table and keep only the columns the rest of this notebook
# builds on.
# NOTE(review): the dtype override names 'PIN', but the identifier column kept
# below is 'parcel' — confirm the intended column.
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = pd.read_csv(
    path+'/data/model_data/tci_1_0.csv',
    parse_dates=['Date'],
    dtype={'PIN': str},
)[cols]
tci.shape

(113132, 6)

## Clean and merge data from property characteristics

In [27]:
# prop13 = pd.read_csv(path+'/data/clean_data/main_prop_filtered14.csv')
# prop13 = prop13.drop_duplicates()
# prop13 = prop13.set_index('parcel')

In [28]:
# Read the filtered property characteristics, drop exact duplicate rows and
# collapse to one row per parcel (the result is indexed by 'parcel').
# NOTE(review): GroupBy.last() picks the last non-null value of each column
# independently, so a parcel with several rows can yield a composite row —
# confirm that this is the intended way to de-duplicate.
prop = (
    pd.read_csv(path+'/data/clean_data/main_prop_filtered.csv')
    .drop_duplicates()
    .groupby('parcel')
    .last()
)

### Property size

In [29]:
def parse_comma(x):
    """Convert a comma-formatted number to ``int`` where possible.

    Parameters
    ----------
    x : object
        A string such as ``'1,234'``, a float (Python or numpy), or anything
        else.

    Returns
    -------
    object
        * ``str``   -> ``int`` after removing thousands separators
          (``int`` raises ``ValueError`` for non-numeric text).
        * finite float -> truncated ``int``.
        * NaN / infinite floats and every other type -> returned unchanged.
    """
    if isinstance(x, str):
        return int(x.replace(',', ''))
    # isinstance (rather than an exact type comparison) also covers numpy
    # float scalars such as np.float32.
    if isinstance(x, (float, np.floating)):
        # NaN and +/-inf have no integer representation; leave them as-is so
        # missing values stay missing.
        return int(x) if np.isfinite(x) else x
    return x

# Normalise the raw lot sizes, attach them to the survey rows, and give any
# row still without a size the median of the sizes that are present.
prop['propsize'] = prop['propsize'].map(parse_comma)

tci = tci.merge(prop[['propsize']], how='left', left_on='parcel', right_index=True)
propsize_median = np.median(tci['propsize'].dropna())
tci['propsize'] = tci['propsize'].fillna(propsize_median)
print(tci.shape)

(113132, 7)


In [30]:
def reject_outliers(data, m=1):
    """Keep only the values lying strictly within ``m`` standard deviations of the mean.

    ``data`` must support arithmetic with a scalar and boolean-mask indexing
    (e.g. a numpy array); ``m`` scales the standard deviation used as cut-off.
    """
    centre = np.mean(data)
    spread = np.std(data)
    keep = np.abs(data - centre) < m * spread
    return data[keep]

### Total usable area

In [31]:
# Same treatment as the lot size: parse the comma-formatted values, join them
# onto the survey rows and fill unmatched rows with the overall median.
prop['totusabl'] = prop['totusabl'].map(parse_comma)
usable_area = prop[['totusabl']]

tci = tci.merge(usable_area, how='left', left_on='parcel', right_index=True)

totusabl_median = np.median(tci['totusabl'].dropna())
tci['totusabl'] = tci['totusabl'].fillna(totusabl_median)
print(tci.shape)

(113132, 8)


In [32]:
# List the columns of the property table.
prop.columns

Index(['Unnamed: 0', 'taxyr', 'pclass', 'nluc', 'luc', 'zip', 'owner',
       'propsize', 'front', 'depth', 'lotshape', 'totbldgs', 'condition14',
       'units', 'area', 'totusabl', 'yrbuilt', 'style', 'rextwall', 'rooms',
       'bedrooms', 'baths', 'halfbath', 'bval', 'lval', 'tval', 'ebval',
       'elval', 'etval', 'tbval', 'tlval', 'mktval', 'delyr', 'deltax',
       'glsflag', 'gtxdel', 'gtxyr', 'tmktval', 'mktland', 'mktbldg',
       'ownerocc', 'red25', 'homestd', 'intax', 'incom', 'inres', 'taxabate',
       'inpar', 'aluc', 'atval', 'abval', 'alval', 'tifluc', 'tiftval',
       'tifbval', 'tiflval', 'inhis', 'diffval', '_TYPE_', '_FREQ_',
       'chg_tval', 'PROP', 'LATITUDE', 'LONGITUDE', 'yearkey', 'condition13'],
      dtype='object')

In [33]:
# A usable area of 0 is replaced by the median of the positive areas in the
# same SPA.
# Only the 'totusabl' column is aggregated: running np.median over every
# column is wasteful and raises on non-numeric columns in recent pandas.
zero_area = tci['totusabl'] == 0
positive_area = tci.loc[tci['totusabl'] > 0]
median_usable = dict(positive_area.groupby('SPA_NAME')['totusabl'].median())
# An SPA with no positive area has no entry in the lookup; such rows fall back
# to the overall positive median instead of raising KeyError.
tci.loc[zero_area, 'totusabl'] = (
    tci.loc[zero_area, 'SPA_NAME']
    .map(median_usable)
    .fillna(positive_area['totusabl'].median())
)

### Total market value

In [34]:
# Attach the total market value and fill rows without a match with the
# median of the values that are present.
tci = tci.merge(prop[['tmktval']], how='left', left_on='parcel', right_index=True)

tmktval_median = np.median(tci['tmktval'].dropna())
tci['tmktval'] = tci['tmktval'].fillna(tmktval_median)
print(tci.shape)

(113132, 9)


In [35]:
# A market value of 0 is replaced by the median of the positive values in the
# same SPA.
# Only the 'tmktval' column is aggregated: running np.median over every column
# is wasteful and raises on non-numeric columns in recent pandas.
zero_value = tci['tmktval'] == 0
positive_value = tci.loc[tci['tmktval'] > 0]
median_val = dict(positive_value.groupby('SPA_NAME')['tmktval'].median())
# An SPA with no positive value has no entry in the lookup; such rows fall back
# to the overall positive median instead of raising KeyError.
tci.loc[zero_value, 'tmktval'] = (
    tci.loc[zero_value, 'SPA_NAME']
    .map(median_val)
    .fillna(positive_value['tmktval'].median())
)

### Condition

In [36]:
# Numeric encoding of the condition labels: 'Unsound' is 0,
# 'Sound value (c)' is 1, and the remaining labels run from 'Very poor' (2)
# up to 'Excellent' (8).
condition_value = {'Unsound': 0, 'Sound value (c)': 1, 'Very poor': 2, 'Poor': 3, 'Fair': 4, 'Average': 5,
                   'Good': 6, 'Very good': 7, 'Excellent': 8}

for year in ('13', '14'):
    label_col = 'condition' + year
    # Fail loudly (as the dictionary lookup would) if the data contains a
    # label the encoding does not know, but name the offending labels.
    unmapped = set(prop[label_col].dropna().unique()) - set(condition_value)
    if unmapped:
        raise KeyError('Unmapped %s labels: %s' % (label_col, sorted(map(str, unmapped))))
    # Missing labels stay NaN; float keeps the dtype the same whether or not
    # any label is missing.
    prop['condition_value' + year] = prop[label_col].map(condition_value).astype(float)

# One join brings in the raw labels and their numeric scores; column order
# matches four successive single-column merges.
tci = pd.merge(tci, prop[['condition13', 'condition14', 'condition_value13', 'condition_value14']],
               how='left', left_on='parcel', right_index=True)

print(tci.shape)

(113132, 13)


In [37]:
# Fill missing 2013 condition scores with the mean score of the row's SPA.
# Only the score column is aggregated: running np.mean over every column is
# wasteful and raises on non-numeric columns in recent pandas.
# NOTE(review): the `> 0` filter leaves scores of 0 ('Unsound') out of the
# mean — confirm this is intended and not carried over from the area/value cells.
missing13 = tci['condition_value13'].isnull()
scored13 = tci.loc[tci['condition_value13'] > 0]
mean_cond = dict(scored13.groupby('SPA_NAME')['condition_value13'].mean())
# SPAs without any usable score fall back to the overall mean instead of
# raising KeyError.
tci.loc[missing13, 'condition_value13'] = (
    tci.loc[missing13, 'SPA_NAME']
    .map(mean_cond)
    .fillna(scored13['condition_value13'].mean())
)

In [38]:
# Fill missing 2014 condition scores with the mean score of the row's SPA.
# Only the score column is aggregated: running np.mean over every column is
# wasteful and raises on non-numeric columns in recent pandas.
# NOTE(review): the `> 0` filter leaves scores of 0 ('Unsound') out of the
# mean — confirm this is intended and not carried over from the area/value cells.
missing14 = tci['condition_value14'].isnull()
scored14 = tci.loc[tci['condition_value14'] > 0]
mean_cond = dict(scored14.groupby('SPA_NAME')['condition_value14'].mean())
# SPAs without any usable score fall back to the overall mean instead of
# raising KeyError.
tci.loc[missing14, 'condition_value14'] = (
    tci.loc[missing14, 'SPA_NAME']
    .map(mean_cond)
    .fillna(scored14['condition_value14'].mean())
)

In [39]:
# Summarise the columns, non-null counts and dtypes of the combined table so far.
tci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 13 columns):
parcel               113132 non-null object
vacant               113132 non-null int64
Date                 113132 non-null datetime64[ns]
Survey Category      113132 non-null object
Survey Grade         113132 non-null object
SPA_NAME             113132 non-null object
propsize             113132 non-null float64
totusabl             113132 non-null float64
tmktval              113132 non-null float64
condition13          112559 non-null object
condition14          112595 non-null object
condition_value13    113132 non-null float64
condition_value14    113132 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1), object(6)
memory usage: 12.1+ MB


### Housing style

In [40]:
# Non-null counts of every column per housing style, ordered by the 'zip'
# count (descending); show the seven largest styles.
prop.groupby('style').count().sort_values('zip', ascending=False).head(7)

Unnamed: 0_level_0,Unnamed: 0,taxyr,pclass,nluc,luc,zip,owner,propsize,front,depth,...,_TYPE_,_FREQ_,chg_tval,PROP,LATITUDE,LONGITUDE,yearkey,condition13,condition_value13,condition_value14
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Colonial,67227,67227,67227,66841,67227,67227,67227,66430,67057,66505,...,0,0,67227,67227,67208,67208,67227,67182,67182,67227
CAP,27169,27169,27169,27060,27169,27169,27166,27090,27129,27095,...,0,0,27169,27169,27164,27164,27169,27163,27163,27169
Ranch,10896,10896,10896,10801,10896,10896,10896,10807,10873,10823,...,0,0,10896,10896,10895,10895,10896,10890,10890,10896
Bungalow,4860,4860,4860,4837,4860,4860,4860,4832,4843,4827,...,0,0,4860,4860,4860,4860,4860,4859,4859,4860
Townhouse,817,817,817,816,817,817,817,747,799,788,...,0,0,817,817,619,619,817,783,783,817
Split level,561,561,561,561,561,561,561,561,559,561,...,0,0,561,561,561,561,561,561,561,561
Contemporary,128,128,128,128,128,128,128,122,128,125,...,0,0,128,128,128,128,128,128,128,128


In [41]:
# Housing styles kept as their own category.
styles = [
    'Bungalow',
    'CAP',
    'Colonial',
    'Ranch',
]

def get_style(x):
    """Return ``x`` when it is one of ``styles``; otherwise return NaN."""
    return x if x in styles else np.nan

# Keep the style only where it is one of `styles`; every other entry becomes NaN.
prop['style_filtered'] = prop['style'].where(prop['style'].isin(styles))

In [42]:
# Join the filtered style onto the survey rows; rows without a kept style
# (including rows with no match) are labelled 'Other'.
tci = tci.merge(prop[['style_filtered']], how='left', left_on='parcel', right_index=True)
tci['style_filtered'] = tci['style_filtered'].fillna('Other')

print(tci.shape)

(113132, 14)


In [43]:
# Per filtered style: total of the 'vacant' column (sum) and number of rows (len).
tci[['vacant','style_filtered']].groupby('style_filtered').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
style_filtered,Unnamed: 1_level_2,Unnamed: 2_level_2
Bungalow,359,4860
CAP,1797,27169
Colonial,7044,67227
Other,319,2980
Ranch,519,10896


### Owner occupancy

In [44]:
def get_own(x):
    """Encode the owner-occupancy flag: ``'No'`` -> 0, anything else -> 1.

    ``'Yes'`` and every unrecognised or missing value both map to 1.
    """
    return 0 if x == 'No' else 1
    
# Numeric owner-occupancy flag for every property row.
prop['ownerocc_value'] = prop['ownerocc'].map(get_own)

In [45]:
# Join the owner-occupancy flag; rows without a match get 1, the same value
# get_own returns for unrecognised input.
tci = tci.merge(prop[['ownerocc_value']], how='left', left_on='parcel', right_index=True)
tci['ownerocc_value'] = tci['ownerocc_value'].fillna(1)
print(tci.shape)

(113132, 15)


### Latitude/Longitude

In [46]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on a sphere (coordinates specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to 6371
        (Earth radius in kilometers); use 3956 for miles.

    Returns
    -------
    float
        Great-circle distance between the two points.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = (radians(v) for v in (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make asin/sqrt raise a math domain error.
    a = min(1.0, max(0.0, a))
    return 2 * radius * asin(sqrt(a))

In [47]:
# Join the coordinates onto the survey rows; rows without a match keep NaN.
tci = tci.merge(prop[['LATITUDE', 'LONGITUDE']], how='left', left_on='parcel', right_index=True)
print(tci.shape)

(113132, 17)


In [48]:
# %%timeit
# dist = dict()
# for spa in set(tci.SPA_NAME):
#     dist[spa] = dict()
#     for i, parcel1 in enumerate(tci[tci.SPA_NAME==spa].parcel):
#         dist[spa][parcel1] = dict()
#         for j, parcel2 in enumerate(tci[tci.SPA_NAME==spa].parcel.iloc[i+1:]):
#             p1 = tci[tci.parcel==parcel1]
#             p2 = tci[tci.parcel==parcel2]
#             dist[spa][parcel1][parcel2] = haversine(p1.LONGITUDE, p1.LATITUDE, p2.LONGITUDE, p2.LATITUDE)
#             break
#         break
#     break

### Total buildings

In [49]:
# Join the building count; rows without a value are given 1.
tci = tci.merge(prop[['totbldgs']], how='left', left_on='parcel', right_index=True)
tci['totbldgs'] = tci['totbldgs'].fillna(1)
print(tci.shape)

(113132, 18)


### Year built

In [50]:
def get_year(x):
    """Convert ``x`` to an ``int`` year, or NaN when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw year value (string, number, ``None`` ...).

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # ValueError: non-numeric text or NaN; TypeError: None and other
    # non-convertible objects; OverflowError: infinite floats.
    except (ValueError, TypeError, OverflowError):
        return np.nan

In [51]:
# Parse the construction year (unparseable entries become NaN) and join it
# onto the survey rows.
prop['yrbuilt_filtered'] = prop['yrbuilt'].map(get_year)
tci = tci.merge(prop[['yrbuilt_filtered']], how='left', left_on='parcel', right_index=True)
print(tci.shape)

(113132, 19)


In [52]:
# Fill missing construction years with the median year of the row's SPA.
# Only the 'yrbuilt_filtered' column is aggregated: running np.median over
# every column is wasteful and raises on non-numeric columns in recent pandas.
missing_year = tci['yrbuilt_filtered'].isnull()
median_val = dict(tci.groupby('SPA_NAME')['yrbuilt_filtered'].median())
# SPAs without any known year fall back to the overall median instead of
# raising KeyError.
tci.loc[missing_year, 'yrbuilt_filtered'] = (
    tci.loc[missing_year, 'SPA_NAME']
    .map(median_val)
    .fillna(tci['yrbuilt_filtered'].median())
)

In [53]:
# Sanity check: shape of the rows whose construction year is still missing.
tci[tci.yrbuilt_filtered.isnull()].shape

(0, 19)

In [54]:
# Median construction year per SPA. Selecting the column before aggregating
# avoids computing a median for every column (which raises on non-numeric
# columns in recent pandas) and returns the same Series.
tci.groupby('SPA_NAME')['yrbuilt_filtered'].median()

SPA_NAME
Bellaire-Puritas           1950.0
Broadway-Slavic Village    1910.0
Brooklyn Centre            1908.0
Buckeye-Shaker Square      1920.0
Buckeye-Woodhill           1915.0
Central                    1997.0
Clark-Fulton               1900.0
Collinwood-Nottingham      1919.0
Cudell                     1910.0
Cuyahoga Valley            1944.5
Detroit Shoreway           1905.0
Downtown                   2007.0
Edgewater                  1920.0
Euclid-Green               1930.0
Fairfax                    1910.0
Glenville                  1915.0
Goodrich-Kirtland Pk       1895.0
Hopkins                    1952.0
Hough                      1905.0
Jefferson                  1928.0
Kamm's                     1942.0
Kinsman                    1920.0
Lee-Harvard                1949.0
Lee-Seville                1955.0
Mount Pleasant             1924.0
North Shore Collinwood     1924.0
Ohio City                  1900.0
Old Brooklyn               1928.0
St.Clair-Superior          1905.0
Stock

### Exterior Wall

In [55]:
# Join the exterior wall type. Missing values are set to 'Aluminum/Vinyl (r)';
# any value outside the three kept types is bucketed as 'Other'.
tci = tci.merge(prop[['rextwall']], how='left', left_on='parcel', right_index=True)
kept_walls = ['Aluminum/Vinyl (r)','Brick','Frame (r)']
wall_filled = tci['rextwall'].fillna('Aluminum/Vinyl (r)')
tci['rextwall'] = wall_filled.where(wall_filled.isin(kept_walls), 'Other')

print(tci.shape)

(113132, 20)


## Add demographic variables

In [56]:
# Read the demographic table with every column loaded as a string.
dem = pd.read_csv(path+'/data/clean_data/demographic.csv', dtype=str)

In [57]:
# Names assigned positionally to the columns of `dem`; the list must match the
# file's column count and order.
new_cols = ['parcel','NAME10','vacant_block','owner','renter', 'median_rent','property_crimes','burglaries','other_drug', 'part_one_crimes','part_two_crimes', \
'bachelors+','poverty_rate','median_hh_income', 'median_fam_income','white','black','asian','other','hispanic','young','middle','old']

# Subset of the renamed columns that is carried over into `tci`.
use_cols = ['vacant_block', 'NAME10','median_rent','property_crimes','burglaries', 'part_one_crimes','part_two_crimes', \
'bachelors+','poverty_rate','median_hh_income','white','black','asian','other','hispanic','young','middle','old']

# Replace the file's header with the names above.
dem.columns = new_cols

In [58]:
# Every object-typed column except the first is converted to numbers.
cols = dem.dtypes[dem.dtypes=='object'].index[1:]

for col in cols:
    # The literal text 'null' marks a missing value in the file.
    is_null_text = dem[col] == 'null'
    print(col, int(is_null_text.sum()))
    # Blank out the markers and convert the whole column at once so it ends up
    # as float64 rather than an object column holding Python floats.
    dem[col] = dem[col].mask(is_null_text).astype(float)

NAME10 0
vacant_block 487
owner 0
renter 0
median_rent 487
property_crimes 0
burglaries 0
other_drug 0
part_one_crimes 0
part_two_crimes 0
bachelors+ 487
poverty_rate 487
median_hh_income 487
median_fam_income 487
white 487
black 487
asian 487
other 487
hispanic 487
young 487
middle 487
old 487


In [59]:
# Index the demographics by parcel, then left-join the selected columns onto
# the survey rows.
dem = dem.set_index('parcel')
tci = tci.merge(dem[use_cols], how='left', left_on='parcel', right_index=True)

In [60]:
# Make each demographic column numeric and fill its gaps with the median of
# the row's SPA.
# NOTE(review): use_cols contains 'NAME10', which looks like an identifier
# rather than a measure; it is converted and median-filled like the others —
# confirm this is intended.
for col in use_cols:
    tci[col] = tci[col].astype(float)
    missing = tci[col].isnull()
    # Aggregate only this column: running np.median over the whole frame is
    # wasteful and raises on non-numeric columns in recent pandas.
    median_val = dict(tci.groupby('SPA_NAME')[col].median())
    # SPAs without any value fall back to the overall median instead of
    # raising KeyError.
    tci.loc[missing, col] = (
        tci.loc[missing, 'SPA_NAME']
        .map(median_val)
        .fillna(tci[col].median())
    )
#     tci.loc[tci[col].isnull(),col] = np.median(tci.loc[tci[col].notnull(),col])

In [61]:
# Non-null counts of every column for each 'NAME10' value.
tci.groupby('NAME10').count()

Unnamed: 0_level_0,parcel,vacant,Date,Survey Category,Survey Grade,SPA_NAME,propsize,totusabl,tmktval,condition13,...,poverty_rate,median_hh_income,white,black,asian,other,hispanic,young,middle,old
NAME10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1011.01,301,301,301,301,301,301,301,301,301,299,...,301,301,301,301,301,301,301,301,301,301
1011.02,907,907,907,907,907,907,907,907,907,903,...,907,907,907,907,907,907,907,907,907,907
1012.00,388,388,388,388,388,388,388,388,388,379,...,388,388,388,388,388,388,388,388,388,388
1013.00,199,199,199,199,199,199,199,199,199,198,...,199,199,199,199,199,199,199,199,199,199
1014.00,643,643,643,643,643,643,643,643,643,643,...,643,643,643,643,643,643,643,643,643,643
1015.01,562,562,562,562,562,562,562,562,562,560,...,562,562,562,562,562,562,562,562,562,562
1016.03,663,663,663,663,663,663,663,663,663,661,...,663,663,663,663,663,663,663,663,663,663
1017.00,584,584,584,584,584,584,584,584,584,583,...,584,584,584,584,584,584,584,584,584,584
1018.00,653,653,653,653,653,653,653,653,653,649,...,653,653,653,653,653,653,653,653,653,653
1019.01,456,456,456,456,456,456,456,456,456,435,...,456,456,456,456,456,456,456,456,456,456


# Output

In [62]:
# Final check of columns, non-null counts and dtypes before writing the file.
tci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 38 columns):
parcel               113132 non-null object
vacant               113132 non-null int64
Date                 113132 non-null datetime64[ns]
Survey Category      113132 non-null object
Survey Grade         113132 non-null object
SPA_NAME             113132 non-null object
propsize             113132 non-null float64
totusabl             113132 non-null float64
tmktval              113132 non-null float64
condition13          112559 non-null object
condition14          112595 non-null object
condition_value13    113132 non-null float64
condition_value14    113132 non-null float64
style_filtered       113132 non-null object
ownerocc_value       113132 non-null float64
LATITUDE             112857 non-null float64
LONGITUDE            112857 non-null float64
totbldgs             113132 non-null float64
yrbuilt_filtered     113132 non-null float64
rextwall             113132 non-null

In [63]:
# Write the combined table to model_data/tci_2_0.csv without the index column.
tci.to_csv(path+'/data/model_data/tci_2_0.csv', index=False)