# 2.0 Combine Data from ```prop.csv```

The data has now been filtered to the parcel numbers in the existing dataset. The end goal is a dataset with one row per parcel on which to build the model. **Imputation will occur in a separate notebook!** (Note: a few simple median/mean fills for missing or zero values are nevertheless applied in the cells below.)

However, we should not forget to explore the existing data for trends and insights along the way, and to engineer features as we see fit.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for all figures.
sns.set_context('notebook')

# Project root = parent of the directory the notebook is run from.
# os.path.dirname is used instead of splitting the path on '/' so the lookup
# does not collapse to an empty string where the path separator is not '/'.
path = os.path.dirname(os.getcwd())
print(path)

# Survey data written by the previous step (tci_1_0.csv); only the columns
# needed downstream are kept.
tci = pd.read_csv(path+'/data/model_data/tci_1_0.csv', parse_dates=['Date'], dtype={'PIN': str})
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = tci[cols]
tci.shape

/Volumes/Dropbox/largetransfer/luc/carter




## Residential characteristics

In [3]:
# Load the residential table and key it by parcel so that later merges can
# join on the index (right_index=True).
res = pd.read_csv(path+'/data/clean_data/res.csv').set_index('parcel')

In [6]:
# Display the column names available in the residential table.
res.columns

Index(['property_number', 'bldgrecnum', 'linkid', 'whs_id', 'occup', 'style',
       'rnumstor', 'cqual', 'ryrbuilt', 'eyrbuilt', 'condtion', 'rextwall',
       'rrooftyp', 'roofmat', 'rbasetyp', 'basesqft', 'heat', 'air', 'attic',
       'rooms', 'bedrooms', 'baths', 'halfbath', 'plumbfix', 'gartype',
       'garcap', 'yrgarblt', 'garsize', 'livabase', 'liva1st', 'liva2nd',
       'livaupp', 'livatot', 'pctcompl', 'reinsp', 'flrloc', 'partywal', 'RCN',
       'RCNLD', 'override', 'incomeflag', 'phypctgood', 'funpctgood',
       'ecopctgood', 'resupdatedt', 'bfinsqft'],
      dtype='object')

In [91]:
# Ordinal encoding for the labels found in the `cqual` column: the labels are
# listed from lowest to highest and numbered from 1 to 12 in that order.
cqual_labels = [
    'Very poor', 'Very poor+',
    'Poor', 'Poor+',
    'Average', 'Average+',
    'Good', 'Good+',
    'Very good', 'Very good+',
    'Excellent', 'Excellent+',
]
cquals = {label: rank for rank, label in enumerate(cqual_labels, start=1)}

In [92]:
# Translate each quality label into its ordinal rank from `cquals`.
# Series.map works on the single column (no per-row lambda over the whole
# frame). Missing labels stay NaN, and a label that is not a key of `cquals`
# also becomes NaN instead of raising KeyError.
res['cqual_num'] = res['cqual'].map(cquals)

In [93]:
# Attach the selected residential columns to the survey rows. The left join
# keeps every survey row; parcels absent from `res` get NaN.
res_features = res[['cqual_num','occup','rnumstor']]
tci = tci.merge(res_features, how='left', left_on='parcel', right_index=True)

## Clean and merge data from property characteristics

In [94]:
# prop13 = pd.read_csv(path+'/data/clean_data/main_prop_filtered14.csv')
# prop13 = prop13.drop_duplicates()
# prop13 = prop13.set_index('parcel')

In [7]:
# Load the property table, drop exact duplicate rows, then collapse to one row
# per parcel (the result is indexed by parcel).
# NOTE(review): GroupBy.last() takes the last non-null value per column by
# default, so a collapsed row may combine values from different source rows.
# Confirm this is intended.
prop = (
    pd.read_csv(path+'/data/clean_data/main_prop_filtered.csv')
    .drop_duplicates()
    .groupby('parcel')
    .last()
)

### Property size

In [96]:
def parse_comma(x):
    """Convert a comma-formatted number such as '1,234' into an int.

    Parameters
    ----------
    x : object
        Raw cell value: a string, a float (possibly NaN), or anything else.

    Returns
    -------
    int
        For strings (commas removed) and for non-NaN floats (truncated
        toward zero).
    object
        `x` unchanged when it is NaN or of any other type.

    Raises
    ------
    ValueError
        If a string is not a valid integer once the commas are removed.
    """
    if isinstance(x, str):
        return int(x.replace(',', ''))
    # isinstance rather than `type(x) == float`, so that float subclasses such
    # as np.float64 scalars are converted too instead of passing through.
    if isinstance(x, float) and not np.isnan(x):
        return int(x)
    return x

# Normalise the property size values with parse_comma before joining.
prop['propsize'] = prop['propsize'].apply(parse_comma)

# Left join onto the survey rows, then fill sizes that are still missing with
# the median of the observed sizes.
tci = tci.merge(prop[['propsize']], how='left', left_on='parcel', right_index=True)
size_missing = tci['propsize'].isnull()
tci.loc[size_missing, 'propsize'] = np.median(tci.loc[~size_missing, 'propsize'])
print(tci.shape)

(113132, 10)


In [97]:
def reject_outliers(data, m=1):
    """Keep the entries of `data` within `m` standard deviations of its mean.

    Parameters
    ----------
    data : array-like supporting arithmetic and boolean-mask indexing
        Values to filter (e.g. a numpy array or pandas Series).
    m : number, default 1
        Width of the accepted band, in standard deviations.

    Returns
    -------
    Same type as `data`, holding only the entries whose absolute deviation
    from the mean is at most `m * std`.
    """
    deviation = abs(data - np.mean(data))
    # `<=` rather than `<`: when the data has zero spread (all values equal)
    # the threshold is 0, and a strict comparison would discard every value.
    return data[deviation <= m * np.std(data)]

### Total usable area

In [98]:
# Normalise the total usable area values with parse_comma.
prop['totusabl'] = prop['totusabl'].apply(parse_comma)
usable_area = prop[['totusabl']]

# Left join onto the survey rows; values still missing afterwards are filled
# with the median of the observed values.
tci = tci.merge(usable_area, how='left', left_on='parcel', right_index=True)
area_missing = tci['totusabl'].isnull()
tci.loc[area_missing, 'totusabl'] = np.median(tci.loc[~area_missing, 'totusabl'])
print(tci.shape)

(113132, 11)


In [99]:
# Replace zero usable area with the median of the positive values that share
# the row's SPA_NAME.
# The column is selected before aggregating, so non-numeric columns are never
# handed to the median (recent pandas versions raise on them instead of
# silently dropping them).
median_usable = tci.loc[tci['totusabl'] > 0].groupby('SPA_NAME')['totusabl'].median()
zero_area = tci['totusabl'] == 0
# map() leaves NaN for an SPA_NAME that has no positive value at all, rather
# than raising KeyError as a dict lookup would.
tci.loc[zero_area, 'totusabl'] = tci.loc[zero_area, 'SPA_NAME'].map(median_usable)

### Total market value

In [100]:
# Join `tmktval` onto the survey rows.
tci = pd.merge(tci, prop[['tmktval']], how='left', left_on='parcel', right_index=True)

# Parcels with no value after the join get the median of the observed values.
val_missing = tci['tmktval'].isnull()
tci.loc[val_missing, 'tmktval'] = np.median(tci.loc[~val_missing, 'tmktval'])

# Zero values are replaced with the median of the positive values sharing the
# row's SPA_NAME. Selecting the column before aggregating keeps non-numeric
# columns out of the median (recent pandas versions raise on them instead of
# silently dropping them).
median_val = tci.loc[tci['tmktval'] > 0].groupby('SPA_NAME')['tmktval'].median()
val_zero = tci['tmktval'] == 0
# map() yields NaN for an SPA_NAME with no positive value instead of KeyError.
tci.loc[val_zero, 'tmktval'] = tci.loc[val_zero, 'SPA_NAME'].map(median_val)
print(tci.shape)

(113132, 12)


In [101]:
# Join `tbval` onto the survey rows.
tci = pd.merge(tci, prop[['tbval']], how='left', left_on='parcel', right_index=True)

# Parcels with no value after the join get the median of the observed values.
val_missing = tci['tbval'].isnull()
tci.loc[val_missing, 'tbval'] = np.median(tci.loc[~val_missing, 'tbval'])

# Zero values are replaced with the median of the positive values sharing the
# row's SPA_NAME. Selecting the column before aggregating keeps non-numeric
# columns out of the median (recent pandas versions raise on them instead of
# silently dropping them).
median_val = tci.loc[tci['tbval'] > 0].groupby('SPA_NAME')['tbval'].median()
val_zero = tci['tbval'] == 0
# map() yields NaN for an SPA_NAME with no positive value instead of KeyError.
tci.loc[val_zero, 'tbval'] = tci.loc[val_zero, 'SPA_NAME'].map(median_val)
print(tci.shape)

In [None]:
# Join `tlval` onto the survey rows.
tci = pd.merge(tci, prop[['tlval']], how='left', left_on='parcel', right_index=True)

# Parcels with no value after the join get the median of the observed values.
val_missing = tci['tlval'].isnull()
tci.loc[val_missing, 'tlval'] = np.median(tci.loc[~val_missing, 'tlval'])

# Zero values are replaced with the median of the positive values sharing the
# row's SPA_NAME. Selecting the column before aggregating keeps non-numeric
# columns out of the median (recent pandas versions raise on them instead of
# silently dropping them).
median_val = tci.loc[tci['tlval'] > 0].groupby('SPA_NAME')['tlval'].median()
val_zero = tci['tlval'] == 0
# The result is written back to 'tlval'. The previous version assigned it to
# 'tbval', which overwrote that column and left the zeros in 'tlval' untouched.
# map() yields NaN for an SPA_NAME with no positive value instead of KeyError.
tci.loc[val_zero, 'tlval'] = tci.loc[val_zero, 'SPA_NAME'].map(median_val)
print(tci.shape)

### Condition

In [102]:
# Bring the raw condition labels onto the survey rows, one column per merge.
for label_col in ('condition13', 'condition14'):
    tci = pd.merge(tci, prop[[label_col]], how='left', left_on='parcel', right_index=True)

# Numeric score assigned to each condition label.
# NOTE(review): 'Sound value (c)' is scored 1 (just above 'Unsound'). Confirm
# that this placement in the ordering is intended.
condition_value = {
    'Unsound': 0,
    'Sound value (c)': 1,
    'Very poor': 2,
    'Poor': 3,
    'Fair': 4,
    'Average': 5,
    'Good': 6,
    'Very good': 7,
    'Excellent': 8,
}

# Score the labels on `prop` (rows without a label are left unset) and merge
# each score column onto the survey rows. Missing scores are not filled here.
for suffix in ('13', '14'):
    label_col = 'condition' + suffix
    score_col = 'condition_value' + suffix
    has_label = prop[label_col].notnull()
    prop.loc[has_label, score_col] = \
        prop.loc[has_label, label_col].apply(lambda label: condition_value[label])
    tci = pd.merge(tci, prop[[score_col]], how='left', left_on='parcel', right_index=True)

print(tci.shape)

(113132, 16)


In [103]:
# Fill missing condition scores with the mean score of the row's SPA_NAME.
# The mean uses every observed score: 'Unsound' is encoded as 0 and is a valid
# score, so it must not be filtered out with a `> 0` test (NaN is skipped by
# mean() anyway). Selecting the column before aggregating keeps non-numeric
# columns out of the mean (recent pandas versions raise on them instead of
# silently dropping them).
mean_cond = tci.groupby('SPA_NAME')['condition_value13'].mean()
cond_missing = tci['condition_value13'].isnull()
# map() yields NaN for an SPA_NAME with no observed score instead of KeyError.
tci.loc[cond_missing, 'condition_value13'] = tci.loc[cond_missing, 'SPA_NAME'].map(mean_cond)

In [104]:
# Fill missing condition scores with the mean score of the row's SPA_NAME.
# The mean uses every observed score: 'Unsound' is encoded as 0 and is a valid
# score, so it must not be filtered out with a `> 0` test (NaN is skipped by
# mean() anyway). Selecting the column before aggregating keeps non-numeric
# columns out of the mean (recent pandas versions raise on them instead of
# silently dropping them).
mean_cond = tci.groupby('SPA_NAME')['condition_value14'].mean()
cond_missing = tci['condition_value14'].isnull()
# map() yields NaN for an SPA_NAME with no observed score instead of KeyError.
tci.loc[cond_missing, 'condition_value14'] = tci.loc[cond_missing, 'SPA_NAME'].map(mean_cond)

In [105]:
# Summarise dtypes and non-null counts to check which columns still have gaps.
tci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 16 columns):
parcel               113132 non-null object
vacant               113132 non-null int64
Date                 113132 non-null datetime64[ns]
Survey Category      113132 non-null object
Survey Grade         113132 non-null object
SPA_NAME             113132 non-null object
cqual_num            111765 non-null float64
occup                111765 non-null object
rnumstor             111765 non-null float64
propsize             113132 non-null float64
totusabl             113132 non-null float64
tmktval              113132 non-null float64
condition13          112559 non-null object
condition14          112595 non-null object
condition_value13    113132 non-null float64
condition_value14    113132 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1), object(7)
memory usage: 14.7+ MB


### Housing style

In [106]:
# Housing styles kept as their own category; every other value is blanked out.
styles = ['Bungalow','CAP','Colonial','Ranch']

def get_style(x):
    """Return `x` when it is one of `styles`, otherwise NaN."""
    return x if x in styles else np.nan

# Keep only the styles listed in `styles`; any other value becomes NaN (see get_style).
prop['style_filtered'] = prop['style'].apply(get_style)

In [107]:
# Join the filtered style onto the survey rows; anything still missing
# (unlisted style or unmatched parcel) is labelled 'Other'.
tci = tci.merge(prop[['style_filtered']], how='left', left_on='parcel', right_index=True)
tci['style_filtered'] = tci['style_filtered'].fillna('Other')

print(tci.shape)

(113132, 17)


### Owner occupancy

In [108]:
def get_own(x):
    """Encode the owner-occupancy flag: 0 for 'No', 1 for 'Yes' and for any other value."""
    return 0 if x == 'No' else 1
    
# Encode owner occupancy as 0/1; get_own returns 1 for anything other than 'No'.
prop['ownerocc_value'] = prop['ownerocc'].apply(get_own)

In [109]:
# Join the owner-occupancy flag; parcels without a match default to 1.
tci = tci.merge(prop[['ownerocc_value']], how='left', left_on='parcel', right_index=True)
tci['ownerocc_value'] = tci['ownerocc_value'].fillna(1)
print(tci.shape)

(113132, 18)


### Latitude/Longitude

In [110]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, default 6371
        Sphere radius; the result is in the same unit. 6371 is the earth's
        radius in kilometers (use 3956 for miles).

    Returns
    -------
    float
        Distance along the sphere's surface, in the unit of `radius`.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return c * radius

In [111]:
# Join the parcel coordinates onto the survey rows; no fill is applied here,
# so unmatched parcels keep NaN coordinates.
coordinates = prop[['LATITUDE','LONGITUDE']]
tci = tci.merge(coordinates, how='left', left_on='parcel', right_index=True)
print(tci.shape)

(113132, 20)


### Total buildings

In [112]:
# Join the building count; parcels with no value are assumed to have 1 building.
tci = tci.merge(prop[['totbldgs']], how='left', left_on='parcel', right_index=True)
tci['totbldgs'] = tci['totbldgs'].fillna(1)
print(tci.shape)

(113132, 21)


### Year built

In [113]:
def get_year(x):
    """Convert `x` to an int, or return NaN when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw cell value (string, number, None, NaN, ...).

    Returns
    -------
    int or float
        `int(x)` when the conversion succeeds, otherwise `np.nan`.
    """
    try:
        return int(x)
    # ValueError: non-numeric strings and NaN. TypeError: None and other
    # unconvertible objects. OverflowError: infinite floats. All of them mean
    # "no usable year" rather than a reason to abort the whole apply().
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [114]:
# Convert the year-built values with get_year (unparseable values become NaN),
# then join the result onto the survey rows. Missing years are not filled here.
prop['yrbuilt_filtered'] = prop['yrbuilt'].apply(get_year)
year_built = prop[['yrbuilt_filtered']]
tci = tci.merge(year_built, how='left', left_on='parcel', right_index=True)
print(tci.shape)

(113132, 22)


In [115]:
# Fill missing build years with the median observed year of the row's SPA_NAME.
# median() already skips NaN. Selecting the column before aggregating keeps
# non-numeric columns out of the median (recent pandas versions raise on them
# instead of silently dropping them).
median_val = tci.groupby('SPA_NAME')['yrbuilt_filtered'].median()
year_missing = tci['yrbuilt_filtered'].isnull()
# map() yields NaN for an SPA_NAME with no observed year instead of KeyError.
tci.loc[year_missing, 'yrbuilt_filtered'] = tci.loc[year_missing, 'SPA_NAME'].map(median_val)

### Exterior Wall

In [116]:
# Exterior wall types kept as their own category; all others become 'Other'.
kept_walls = ['Aluminum/Vinyl (r)','Brick','Frame (r)']

# Join the wall type, default missing values to 'Aluminum/Vinyl (r)', then
# collapse every type outside `kept_walls` into 'Other'.
tci = tci.merge(prop[['rextwall']], how='left', left_on='parcel', right_index=True)
tci['rextwall'] = tci['rextwall'].fillna('Aluminum/Vinyl (r)')
tci['rextwall'] = tci['rextwall'].where(tci['rextwall'].isin(kept_walls), 'Other')

print(tci.shape)

(113132, 23)


## Add demographic variables

In [117]:
# Load the demographic table with every column read as a string (dtype=str);
# numeric conversion is done explicitly in a later cell.
dem = pd.read_csv(path+'/data/clean_data/demographic.csv', dtype=str)

In [118]:
# Replacement names for every column of the demographic table, in file order.
new_cols = [
    'parcel', 'NAME10', 'vacant_block', 'owner', 'renter',
    'median_rent', 'property_crimes', 'burglaries', 'other_drug',
    'part_one_crimes', 'part_two_crimes',
    'bachelors+', 'poverty_rate', 'median_hh_income',
    'median_fam_income', 'white', 'black', 'asian', 'other', 'hispanic',
    'young', 'middle', 'old',
]

# Subset of the demographic columns that is merged into the model dataset.
use_cols = [
    'vacant_block', 'NAME10', 'median_rent', 'property_crimes', 'burglaries',
    'part_one_crimes', 'part_two_crimes',
    'bachelors+', 'poverty_rate', 'median_hh_income',
    'white', 'black', 'asian', 'other', 'hispanic',
    'young', 'middle', 'old',
]

# Rename the columns positionally: new_cols must list one name per column, in
# the same order as the file.
# NOTE(review): a reordered CSV would be mislabelled silently. Verify the
# column order against demographic.csv.
dem.columns = new_cols

In [119]:
# Every object-typed column except the first one is converted to numbers.
cols = dem.dtypes[dem.dtypes=='object'].index[1:]

for col in cols:
    # The literal string 'null' marks a missing value.
    is_null_token = dem[col]=='null'
    dem.loc[is_null_token, col] = np.nan
    # Convert the remaining (non-missing) entries from text to float.
    has_value = dem[col].notnull()
    dem.loc[has_value, col] = dem.loc[has_value, col].apply(float)

In [120]:
# Key the demographic table by parcel, then left-join the selected columns
# onto the survey rows.
dem = dem.set_index('parcel')
demographics = dem[use_cols]
tci = tci.merge(demographics, how='left', left_on='parcel', right_index=True)

In [121]:
# For each demographic column: make it numeric, then fill missing values with
# the median observed value of the row's SPA_NAME.
for col in use_cols:
    tci[col] = tci[col].apply(float)
    # median() already skips NaN. Selecting the column before aggregating
    # keeps non-numeric columns out of the median (recent pandas versions
    # raise on them instead of silently dropping them).
    median_val = tci.groupby('SPA_NAME')[col].median()
    col_missing = tci[col].isnull()
    # map() yields NaN for an SPA_NAME with no observed value instead of KeyError.
    tci.loc[col_missing, col] = tci.loc[col_missing, 'SPA_NAME'].map(median_val)

# Output

In [122]:
# Final check of dtypes and non-null counts before writing the dataset out.
tci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 41 columns):
parcel               113132 non-null object
vacant               113132 non-null int64
Date                 113132 non-null datetime64[ns]
Survey Category      113132 non-null object
Survey Grade         113132 non-null object
SPA_NAME             113132 non-null object
cqual_num            111765 non-null float64
occup                111765 non-null object
rnumstor             111765 non-null float64
propsize             113132 non-null float64
totusabl             113132 non-null float64
tmktval              113132 non-null float64
condition13          112559 non-null object
condition14          112595 non-null object
condition_value13    113132 non-null float64
condition_value14    113132 non-null float64
style_filtered       113132 non-null object
ownerocc_value       113132 non-null float64
LATITUDE             112857 non-null float64
LONGITUDE            112857 non-null 

In [123]:
# Write the combined dataset to tci_2_0.csv; index=False leaves the row index out of the file.
tci.to_csv(path+'/data/model_data/tci_2_0.csv', index=False)