# Combine Data (not from prop)

Data has now been filtered for the parcel numbers in the existing dataset. The end goal is to have a dataset with one row for each parcel in which to build the model. **Imputation will occur in a separate notebook!**

However, we can't forget to explore the existing data for trends or insight along the way, as well as engineer features as we see fit.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime as dt

%matplotlib inline
sns.set_context('notebook')


path = '/'.join(os.getcwd().split('/')[:-2])
print(path)

tci = pd.read_csv(path+'/data/model_data/tci_1_0.csv', parse_dates=['Date'], dtype={'PIN': str})
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = tci[cols]

dates = dict(zip(tci.parcel, tci.Date))
ppns = set(tci['parcel'])

/Volumes/Dropbox/largetransfer/luc/carter




(113132, 6)

## County Land Bank

In [3]:
lb = pd.read_csv(path+'/data/clean_data/county_lb.csv')

In [4]:
lb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 1099
Data columns (total 16 columns):
parcel            1100 non-null object
disp_status       1100 non-null object
p_source          1084 non-null object
acq_dt            958 non-null object
disp_dt           773 non-null object
assessment        1043 non-null object
demo_status       257 non-null object
rehab_status      810 non-null object
public_status     970 non-null object
s_date            227 non-null object
cclrc_dba_date    57 non-null object
cclrc_dc_date     94 non-null object
ab_proceed_dt     68 non-null object
proceed_date      79 non-null object
cclrc_dcp_date    53 non-null object
out_type          835 non-null object
dtypes: object(16)
memory usage: 146.1+ KB


In [5]:
lb_ppns = set(lb[lb.disp_status=='Acquired'].parcel)

def get_lb(x, lb_ppns):
    if x in lb_ppns:
        return 1
    else:
        return 0

tci['lb_acquired'] = tci['parcel'].apply(lambda x: get_lb(x, lb_ppns))
tci['lb_tax_fc'] = tci['parcel'].apply(lambda x: get_lb(x,set(lb[lb.p_source=='Tax Foreclosure'].parcel)))

## Tax bill

In [9]:
# tb2 = pd.read_csv(path+'/data/clean_data/taxbill_sep14.csv', dtype=object)
tb = pd.read_csv(path+'/data/clean_data/taxbill_may15.csv', dtype=object)

In [10]:
tb['TOTAL_NET_DELQ_BALANCE'] = tb['TOTAL_NET_DELQ_BALANCE'].apply(float)
tb['GRAND_TOTAL_BALANCE'] = tb['GRAND_TOTAL_BALANCE'].apply(float)
tb['GRAND_TOTAL_OWED'] = tb['GRAND_TOTAL_OWED'].apply(float)

# create new variables, delinquent tax ratio and paid percent, fill with medians (0 and 1)
tb['delq_total_ratio'] = tb['TOTAL_NET_DELQ_BALANCE']/tb['GRAND_TOTAL_OWED']
tb['delq_total_ratio'] = tb['delq_total_ratio'].fillna(0)

tb['paid_percent'] = tb.GRAND_TOTAL_PAID.apply(float)/tb.GRAND_TOTAL_OWED.apply(float)
tb['paid_percent'] = tb['paid_percent'].fillna(1)

tb = tb.fillna(0)
# tb = tb.set_index('PROPERTY_NUMBER')
cols = ['TOTAL_NET_DELQ_BALANCE','TAX_ASSESSED_LAND','TAX_MARKET_LAND',\
        'LENDER_PROCESS_TYPE','GRAND_TOTAL_BALANCE','GRAND_TOTAL_PAID',\
        'GRAND_TOTAL_OWED','delq_total_ratio','paid_percent','PROPERTY_NUMBER']

cols2 = ['TOTAL_NET_DELQ_BALANCE','TAX_ASSESSED_LAND','TAX_MARKET_LAND',\
         'GRAND_TOTAL_BALANCE','GRAND_TOTAL_PAID',\
         'GRAND_TOTAL_OWED','delq_total_ratio','paid_percent']

tb = tb.sort_values('date')

df = pd.merge(df, tb.loc[tb.date<=date, cols].groupby('PROPERTY_NUMBER').last(),\
                  left_on='parcel', right_index=True,how='left')
    # fill medians by neighborhood if possible, else by entire dataset
    for var in cols2:
        median_val = dict(df.loc[df[var].notnull(),[var,'SPA_NAME']].groupby('SPA_NAME').agg(np.median)[var])
        df.loc[df[var].isnull(), var] = df.loc[df[var].isnull()]\
            .apply(lambda x: median_val[x.SPA_NAME] if x.SPA_NAME in median_val else np.median(df.loc[df[var].notnull()]), axis=1)
    # fill lender process type
    df.loc[df.LENDER_PROCESS_TYPE.isnull(),'LENDER_PROCESS_TYPE'] = 'None'


In [11]:
tci = pd.merge(tci, tb[cols], left_on='parcel', right_index=True,how='left')
tci.loc[tci.LENDER_PROCESS_TYPE==0,'LENDER_PROCESS_TYPE'] = 'None'
tb2 = tb2.rename(columns={'GRAND_TOTAL_BALANCE':'GRAND_TOTAL_BALANCE_14'})
tci = pd.merge(tci, tb2[['GRAND_TOTAL_BALANCE_14']], left_on='parcel', right_index=True,how='left')

del tb, tb2

## Postal vacancy

In [12]:
pv = pd.read_csv(path+'/data/clean_data/postal_vacancy.csv', parse_dates=3, index_col=0)
pv.date = pv.date.apply(lambda x: dt.datetime(int(x[0:4]),int(x[5:7]),int(x[8:10])))
pv = pv.sort_values('date',ascending=False)

In [15]:
# dates = list()
# tp = list()
# tn = list()
# fp = list()
# fn = list()
# for time in sorted(list(set(pv.date))):
#     dates.append(time)
#     pv2 = pd.merge(tci, pv[(pv.date==time)], how='left', left_on='parcel', right_on='PARCEL')
#     pv2['vindall'] = pv2['vindall'].fillna('N')
#     tem = pd.crosstab(pv2.vindall, pv2.vacant)
#     tp.append(tem[1]['Y'])
#     tn.append(tem[0]['N'])
#     fp.append(tem[0]['Y'])
#     fn.append(tem[1]['N'])
    
# # import matplotlib.dates as mdates
# fig, ax = plt.subplots()

# plt.plot(dates, tp)
# # plt.plot(dates, tn)
# plt.plot(dates, fp)
# plt.plot(dates, fn);

# x1,x2,y1,y2 = plt.axis()
# plt.axis([x1,x2,0,y2]);
# plt.legend(['Both predicted vacant','Vacant (only postal)','Vacant (only survey)'],'lower right');

# fig.autofmt_xdate()

# # fig.savefig(path+'/outputs/pv.png')

In [16]:
# pv2 = pd.merge(tci, pv[(pv.date==dt.datetime(2015,10,1))], how='left', left_on='parcel', right_on='PARCEL')
# pv2['vindall'] = pv2['vindall'].fillna('N')
# pd.crosstab(pv2.vindall, pv2.vacant)

In [17]:
# pv2 = pd.melt(pv.pivot(index='PARCEL',columns='date').fillna('N').reset_index(),id_vars='PARCEL',value_name='vindall')


In [20]:
def fun(x):
    total = 0
#     print(dates[x.parcel])
    recent = pv_dates[pv_dates<dates[x.parcel]]

    for i,j in enumerate(x[recent]):
        if x[recent][-i-1] == 'Y':
            total += 1
        else:
            return total
    return total

pv_dates = pv.groupby('date').last().index
pv2 = pv.pivot(index='PARCEL',columns='date').fillna('N').reset_index()
pv2.columns = pv2.columns.droplevel()
pv2.columns.values[0] = 'parcel'
pv2['pv_count'] = pv2.apply(fun, axis=1)
pv2 = pv2.set_index('parcel')

In [25]:
tci = pd.merge(tci, pv2[['pv_count']], how='left', left_on='parcel',right_index=True)
tci['pv_count'] = tci['pv_count'].fillna(0)

In [206]:
# tci.loc[tci.pv_count>0,'pv_count2'] = 1
# tci.loc[tci.pv_count==0,'pv_count2'] = 0

In [207]:
# tci.loc[tci.delq_total_ratio>0.7,'del_ratio'] = 1
# tci.loc[tci.delq_total_ratio<=0.7,'del_ratio'] = 0

In [208]:
# t = tci[(tci.pv_count2==1) & (tci.del_ratio==1)].vacant
# print(sum(t),len(t))

# t = tci[(tci.pv_count2==0) & (tci.del_ratio==1)].vacant
# print(sum(t),len(t))

# t = tci[(tci.pv_count2==1) & (tci.del_ratio==0)].vacant
# print(sum(t),len(t))

# t = tci[(tci.pv_count2==0) & (tci.del_ratio==0)].vacant
# print(sum(t),len(t))


In [26]:
cols = list(tci.columns)
cols.remove('vacant')
cols.remove('Date')

In [27]:
tci[cols].to_csv(path+'/data/model_data/tci_2_1.csv', index=False)

In [28]:
tci.columns

Index(['parcel', 'Date', 'vacant', 'lb_acquired', 'lb_tax_fc',
       'TOTAL_NET_DELQ_BALANCE', 'TAX_ASSESSED_LAND', 'TAX_MARKET_LAND',
       'LENDER_PROCESS_TYPE', 'GRAND_TOTAL_BALANCE', 'GRAND_TOTAL_PAID',
       'delq_total_ratio', 'paid_percent', 'GRAND_TOTAL_BALANCE_14',
       'pv_count'],
      dtype='object')

In [29]:
tci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 15 columns):
parcel                    113132 non-null object
Date                      113132 non-null datetime64[ns]
vacant                    113132 non-null int64
lb_acquired               113132 non-null int64
lb_tax_fc                 113132 non-null int64
TOTAL_NET_DELQ_BALANCE    113129 non-null float64
TAX_ASSESSED_LAND         113129 non-null object
TAX_MARKET_LAND           113129 non-null object
LENDER_PROCESS_TYPE       113129 non-null object
GRAND_TOTAL_BALANCE       113129 non-null float64
GRAND_TOTAL_PAID          113129 non-null object
delq_total_ratio          113129 non-null float64
paid_percent              113129 non-null float64
GRAND_TOTAL_BALANCE_14    113107 non-null float64
pv_count                  113132 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(3), object(5)
memory usage: 13.8+ MB
