# Combine Data (not from prop)

The data have now been filtered down to the parcel numbers in the existing dataset. The end goal is a dataset with one row per parcel, on which the model will be built. **Imputation will occur in a separate notebook!**

However, we can't forget to explore the existing data for trends or insight along the way, as well as engineer features as we see fit.

In [41]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime as dt

%matplotlib inline
sns.set_context('notebook')


# Project root: two directory levels above the current working directory.
# os.path.dirname is separator-aware, whereas splitting on '/' produced an
# empty path on platforms whose getcwd() does not use forward slashes.
path = os.path.dirname(os.path.dirname(os.getcwd()))
print(path)

# Base table; 'Date' is parsed to datetime so it can be compared with the
# event dates of the other sources. Only the columns listed in `cols` are kept.
tci = pd.read_csv(path+'/data/model_data/tci_1_0.csv', parse_dates=['Date'], dtype={'PIN': str})
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = tci[cols]

# parcel -> Date lookup (the last row wins if a parcel repeats) and the set
# of parcel numbers present in the base table
dates = dict(zip(tci['parcel'], tci['Date']))
ppns = set(tci['parcel'])

/Volumes/Dropbox/largetransfer/luc/carter


## County Land Bank

In [56]:
lb = pd.read_csv(path+'/data/clean_data/county_lb.csv',parse_dates=['acq_dt'])
df = tci.copy()

# Every land bank parcel must have a date in `dates`; fail with the same
# KeyError a direct dict lookup would raise, so a key mismatch (e.g. a parcel
# column read with a different dtype) cannot silently zero out the flags.
unknown = ~lb['parcel'].isin(set(dates))
if unknown.any():
    raise KeyError(lb.loc[unknown, 'parcel'].iloc[0])

# filter land bank data by date: keep records acquired strictly before the
# parcel's date in `dates` (one vectorised lookup instead of a row-wise apply;
# a missing acq_dt compares False and is dropped, as before)
survey_dt = pd.to_datetime(lb['parcel'].map(dates))
lb2 = lb.loc[lb['acq_dt'] < survey_dt]

# has it been acquired by a land bank (1.0 / 0.0)
acquired_parcels = lb2.loc[lb2['disp_status'] == 'Acquired', 'parcel']
df['lb_acquired'] = df['parcel'].isin(acquired_parcels).astype(float)

# source of acquisition: 1.0 when a retained record has p_source 'Tax Foreclosure'
tax_fc_parcels = lb2.loc[lb2['p_source'] == 'Tax Foreclosure', 'parcel']
df['lb_tax_fc'] = df['parcel'].isin(tax_fc_parcels).astype(float)

## Tax bill

In [57]:
# tb2 = pd.read_csv(path+'/data/clean_data/taxbill_sep14.csv', dtype=object)
tb = pd.read_csv(path+'/data/clean_data/taxbill_may15.csv', dtype=object)

# Every column is read as object, so convert the amount columns to float.
amount_cols = ['TOTAL_NET_DELQ_BALANCE','GRAND_TOTAL_BALANCE','GRAND_TOTAL_OWED',
               'TAX_ASSESSED_LAND','TAX_MARKET_LAND','GRAND_TOTAL_PAID']
for col in amount_cols:
    tb[col] = tb[col].astype(float)

# create new variables, delinquent tax ratio and paid percent, fill with medians (0 and 1)
# NOTE: only undefined ratios (NaN, e.g. 0/0) are filled; a non-zero amount
# over GRAND_TOTAL_OWED == 0 gives +/-inf and is left unchanged.
tb['delq_total_ratio'] = (tb['TOTAL_NET_DELQ_BALANCE']/tb['GRAND_TOTAL_OWED']).fillna(0)
tb['paid_percent'] = (tb['GRAND_TOTAL_PAID']/tb['GRAND_TOTAL_OWED']).fillna(1)

# NOTE(review): this fills *every* remaining null with 0, including
# LENDER_PROCESS_TYPE, while parcels with no tax bill row get 'None' below --
# confirm that the two different "missing" markers are intended.
tb = tb.fillna(0)
# tb = tb.set_index('PROPERTY_NUMBER')
cols = ['TOTAL_NET_DELQ_BALANCE','TAX_ASSESSED_LAND','TAX_MARKET_LAND',\
        'LENDER_PROCESS_TYPE','GRAND_TOTAL_BALANCE','GRAND_TOTAL_PAID',\
        'GRAND_TOTAL_OWED','delq_total_ratio','paid_percent','PROPERTY_NUMBER']

# numeric subset of `cols` that receives the median fill after the merge
cols2 = ['TOTAL_NET_DELQ_BALANCE','TAX_ASSESSED_LAND','TAX_MARKET_LAND',\
         'GRAND_TOTAL_BALANCE','GRAND_TOTAL_PAID',\
         'GRAND_TOTAL_OWED','delq_total_ratio','paid_percent']

# tb = tb.sort_values('date')

# One row per PROPERTY_NUMBER: the last row in file order (no sort is applied).
df = pd.merge(df, tb[cols].groupby('PROPERTY_NUMBER').last(),\
              left_on='parcel', right_index=True,how='left')

# fill medians by neighborhood if possible, else by entire dataset
for var in cols2:
    # medians of the known values; neighborhoods with no known value are left
    # out so that their rows fall through to the dataset-wide median
    median_val = df.groupby('SPA_NAME')[var].median().dropna().to_dict()
    # Median of this column only. The previous fallback took np.median over
    # the whole frame (all columns, strings and dates included).
    overall_median = df[var].median()
    df[var] = df[var].fillna(df['SPA_NAME'].map(median_val)).fillna(overall_median)

# fill lender process type
df['LENDER_PROCESS_TYPE'] = df['LENDER_PROCESS_TYPE'].fillna('None')

## Postal vacancy

In [58]:
# NOTE: the next cell reads the same file again and repeats these steps.
# parse_dates=3 was dropped: read_csv only accepts booleans, lists and dicts
# for parse_dates, and the column is parsed explicitly below anyway.
pv = pd.read_csv(path+'/data/clean_data/postal_vacancy.csv', index_col=0)

# 'date' is text; year, month and day are taken from character positions
# 0-4, 5-7 and 8-10 and assembled into timestamps in one vectorised call.
date_parts = pd.DataFrame({
    'year': pv['date'].str[0:4].astype(int),
    'month': pv['date'].str[5:7].astype(int),
    'day': pv['date'].str[8:10].astype(int),
})
pv['date'] = pd.to_datetime(date_parts)

# most recent observations first
pv = pv.sort_values('date',ascending=False)

In [59]:
## Postal vacancy
# parse_dates=3 was dropped: read_csv only accepts booleans, lists and dicts
# for parse_dates, and the column is parsed explicitly below anyway.
pv = pd.read_csv(path+'/data/clean_data/postal_vacancy.csv', index_col=0)

# parse dates: 'date' is text; year, month and day come from character
# positions 0-4, 5-7 and 8-10 and are assembled in one vectorised call
date_parts = pd.DataFrame({
    'year': pv['date'].str[0:4].astype(int),
    'month': pv['date'].str[5:7].astype(int),
    'day': pv['date'].str[8:10].astype(int),
})
pv['date'] = pd.to_datetime(date_parts)
# pv = pv.loc[pv.apply(lambda x: x.date < dates[x.PARCEL], axis=1)]

# reshape
pv = pv.sort_values('date',ascending=False)
# distinct observation dates (groupby sorts its keys, so ascending)
pv_dates = pv.groupby('date').last().index
# wide table: one row per PARCEL, one column per date, 'N' where no value exists
pv2 = pv.pivot(index='PARCEL',columns='date').fillna('N').reset_index()
# drop the outer column level left by pivot so the columns are the dates
pv2.columns = pv2.columns.droplevel()
# Name the first column (the former PARCEL index) 'parcel'. Assigning a new
# column list replaces the in-place edit of `columns.values`, which mutates
# the Index's backing array behind pandas' back.
pv2.columns = ['parcel'] + list(pv2.columns[1:])

In [60]:
# get the number of postal vacancies in a row
def fun(x):
    """Count the trailing run of 'Y' flags for one parcel row.

    Parameters
    ----------
    x : pandas.Series
        One row of the wide postal-vacancy table: a 'parcel' entry plus one
        entry per date in the module-level ``pv_dates``.

    Returns
    -------
    int
        Number of consecutive 'Y' values, walking backwards from the latest
        date that is on or before ``dates[parcel]``. Returns 0 when no date
        qualifies or when the latest qualifying value is not 'Y'.

    Raises
    ------
    KeyError
        If the row's parcel is not a key of the module-level ``dates``.

    Notes
    -----
    Relies on ``pv_dates`` being in ascending order.
    """
    recent = pv_dates[pv_dates <= dates[x['parcel']]]
    # Select the qualifying columns once, by label, then walk them by position
    # from newest to oldest. (Integer keys through Series[...] on a
    # non-integer index are a deprecated positional fallback.)
    flags = x.loc[recent]
    total = 0
    for flag in flags.iloc[::-1]:
        if flag != 'Y':
            break
        total += 1
    return total
# consecutive-vacancy count for every parcel in the wide table
pv2['pv_count'] = pv2.apply(fun, axis=1)
pv3 = pv2.set_index('parcel')
# merge postal vacancies
# Assigning through map gives the same values as a left merge on the parcel
# key (pv3's index is unique because it comes from a pivot), but it overwrites
# the column instead of producing pv_count_x / pv_count_y when the cell is
# run a second time.
df['pv_count'] = df['parcel'].map(pv3['pv_count'])
# fill null values: parcels with no postal record get a count of 0
df['pv_count'] = df['pv_count'].fillna(0)
df.to_csv(path+'/data/model_data/tci_2_1_model.csv', index=False)

In [61]:
# Summarise the assembled frame (dtypes and non-null counts per column).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 18 columns):
parcel                    113132 non-null object
vacant                    113132 non-null int64
Date                      113132 non-null datetime64[ns]
Survey Category           113132 non-null object
Survey Grade              113132 non-null object
SPA_NAME                  113132 non-null object
lb_acquired               113132 non-null float64
lb_tax_fc                 113132 non-null float64
TOTAL_NET_DELQ_BALANCE    113132 non-null float64
TAX_ASSESSED_LAND         113132 non-null float64
TAX_MARKET_LAND           113132 non-null float64
LENDER_PROCESS_TYPE       113132 non-null object
GRAND_TOTAL_BALANCE       113132 non-null float64
GRAND_TOTAL_PAID          113132 non-null float64
GRAND_TOTAL_OWED          113132 non-null float64
delq_total_ratio          113132 non-null float64
paid_percent              113132 non-null float64
pv_count                  113132 non-nu