In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime as dt
%matplotlib inline
sns.set_context('notebook')

# Project root: the current working directory with its last two
# '/'-separated components removed.
cwd_parts = os.getcwd().split('/')
path = '/'.join(cwd_parts[:-2])
print(path)

# Columns of the survey table that the rest of the notebook uses.
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = pd.read_csv(
    path+'/data/model_data/tci_1_0.csv',
    parse_dates=['Date'],
    dtype={'PIN': str},
)[cols]

/Volumes/Dropbox/largetransfer/luc/carter


In [7]:
# read data sets
# Foreclosure filings and sheriff's auctions: the third column (index 2) of
# each file is parsed as a date.
fc = pd.read_csv(
    path+'/data/clean_data/foreclosure_filings2.csv',
    parse_dates=[2],
)
sa = pd.read_csv(
    path+'/data/clean_data/sheriff_auction.csv',
    parse_dates=[2],
    encoding="ISO-8859-1",
)

# Transfers: every column is read as a string except the parsed 'mdate';
# rows are ordered newest first.
t = pd.read_csv(
    path+'/data/clean_data/transfers.csv',
    parse_dates=['mdate'],
    dtype=str,
).sort_values('mdate', ascending=False)

# Arms-length sales: all columns as strings, exact duplicate rows removed.
al = pd.read_csv(path+'/data/clean_data/armslength.csv', dtype=str).drop_duplicates()
# parse armslength dates
# Three-letter upper-case month abbreviation -> month number (1-12).
months = {
    abbr: number
    for number, abbr in enumerate(
        ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC'],
        start=1,
    )
}

def parse_date(x):
    """Convert a 'DDMONYYYY' string such as '01JAN2006' to a datetime.

    Characters 0-1 are read as the day, characters 2-4 are looked up in
    `months`, and everything from character 5 onwards is read as the year.
    Raises ValueError for a non-numeric day/year and KeyError for an
    abbreviation that is not a key of `months`.
    """
    day_txt, month_txt, year_txt = x[:2], x[2:5], x[5:]
    day = int(day_txt)
    month = months[month_txt]
    year = int(year_txt)
    return dt.datetime(year=year, month=month, day=day)

# Add a parsed 'date' column built from the raw 'mdate' strings and order
# the arms-length sales from oldest to newest.
al = al.assign(date=al['mdate'].apply(parse_date)).sort_values('date')

In [8]:
def parse_amount(x):
    """Return `x` as a float after dropping its first character and commas.

    The value is converted to a string first; the leading character is
    removed unconditionally (whatever it is), then every ',' is stripped,
    so '$1,234.50' becomes 1234.5.
    """
    text = str(x)
    without_prefix = text[1:]
    digits = without_prefix.replace(',', '')
    return float(digits)

In [9]:
# Snapshot dates: 23 month-start dates, three months apart, from 2009-10-01.
rng = pd.date_range('10/1/2009', periods=23, freq='3MS')

# Transfers and arms-length sales are only *counted* after this date; it is
# also used as the stand-in "last event" date for parcels without any
# recorded transfer / sale.
_BASELINE = dt.datetime(2006, 1, 1)


def _add_days_since(frame, date, last_event, name):
    """Left-join onto `frame` a column `name` holding whole days from each
    parcel's `last_event` timestamp to `date`.

    `last_event` is a Series of timestamps indexed by parcel id (one row per
    parcel). Parcels of `frame` missing from it get NaN in `name`.
    """
    days = (date - pd.to_datetime(last_event)).dt.days.to_frame(name)
    return pd.merge(frame, days, left_on='parcel', right_index=True, how='left')


def _fill_with_longest_gap(days, fallback):
    """Replace missing values in `days` with (largest observed value + 1).

    If no value is observed at all, `fallback` is used instead of raising.
    """
    longest = days.max()
    fill = fallback if pd.isnull(longest) else longest + 1
    return days.fillna(fill)


for date in rng:
    df = tci.copy()

    # Days / years between the baseline and this snapshot.
    baseline_days = (date - _BASELINE).days
    per = baseline_days / 365.0

    # ------------------------------------------------------------------
    # foreclosures: keep filings before the snapshot, one row per case
    # (the last row of each case when ordered by filing date).
    # ------------------------------------------------------------------
    fc_copy = (fc.loc[fc.filedate < date]
                 .sort_values('filedate')
                 .groupby('caseno').last())

    # is it an active foreclosure (1.0 / 0.0)
    active_parcels = fc_copy.loc[fc_copy.status == 'Active', 'parcel']
    df['active_fc'] = df.parcel.isin(active_parcels).astype(float)

    # has it had a foreclosure filed in the last 1 / 2 years (1.0 / 0.0)
    for years in (1, 2):
        window_start = date - pd.DateOffset(years=years)
        recent_parcels = fc_copy.loc[fc_copy.filedate > window_start, 'parcel']
        df['fc_%dyr' % years] = df.parcel.isin(recent_parcels).astype(float)

    # days since last foreclosure filing; parcels without one get the
    # longest observed gap + 1.
    # NOTE(review): when no parcel has a filing before `date`, the gap
    # falls back to days since the 2006 baseline - confirm that is the
    # desired sentinel.
    last_fc = fc_copy.groupby('parcel')['filedate'].max()
    df = _add_days_since(df, date, last_fc, 'fc_days')
    df['fc_days'] = _fill_with_longest_gap(df['fc_days'], baseline_days)

    # ------------------------------------------------------------------
    # sheriff's auction
    # ------------------------------------------------------------------
    # Only auctions held before the snapshot date are used. The previous
    # version filtered into sa_copy but then read the unfiltered table,
    # leaking future auctions into the feature (and producing negative
    # day counts).
    sa_copy = sa.loc[sa.salesdt < date]
    last_sa = sa_copy.groupby('parcel')['salesdt'].max()
    df = _add_days_since(df, date, last_sa, 'sa_days')
    df['sa_days'] = _fill_with_longest_gap(df['sa_days'], baseline_days)

    # ------------------------------------------------------------------
    # transfers
    # ------------------------------------------------------------------
    t_copy = t.loc[t.mdate < date]

    # days since most recent transfer; parcels without one are treated as
    # if their last transfer happened on the baseline date. max() is used
    # so the result does not depend on how `t` happens to be sorted.
    last_transfer = t_copy.groupby('PROPERTY_NUMBER')['mdate'].max()
    df = _add_days_since(df, date, last_transfer, 't_days')
    df['t_days'] = df['t_days'].fillna(baseline_days).astype('int64')

    # days since most recent transfer with sheriff's deed type (after the
    # baseline only); same baseline sentinel for parcels without one.
    t_recent = t_copy.loc[t_copy.mdate > _BASELINE]
    last_shf_deed = (t_recent.loc[t_recent.DEED_TYPE == 'Sheriffs Deed']
                             .groupby('PROPERTY_NUMBER')['mdate'].max())
    df = _add_days_since(df, date, last_shf_deed, 't_shf_deed')
    df['t_shf_deed'] = df['t_shf_deed'].fillna(baseline_days).astype('int64')

    # average number of transfers per year since the baseline
    t_counts = t_recent.groupby('PROPERTY_NUMBER')['mdate'].count()
    df = pd.merge(df, t_counts.to_frame('t_num'),
                  left_on='parcel', right_index=True, how='left')
    df['t_num'] = df['t_num'].fillna(0) / per

    # ------------------------------------------------------------------
    # armslength
    # ------------------------------------------------------------------
    al_copy = al.loc[al['date'] < date]

    # number of armslength sales per year since the baseline
    al_counts = (al_copy.loc[al_copy['date'] > _BASELINE]
                        .groupby('PROPERTY_NUMBER')['date'].count() / per)
    df = pd.merge(df, al_counts.to_frame('al_num'),
                  left_on='parcel', right_index=True, how='left')
    df['al_num'] = df['al_num'].fillna(0)

    # days since most recent armslength sale. Sales dated before the
    # baseline are NOT excluded here, so such parcels end up with a larger
    # value than the "no sale" sentinel (days since the baseline).
    last_sale = al_copy.groupby('PROPERTY_NUMBER')['date'].max()
    df = _add_days_since(df, date, last_sale, 'al_days')
    df['al_days'] = df['al_days'].fillna(baseline_days).astype('int64')

    # One CSV per snapshot, named by the snapshot's year and month.
    df.to_csv(path+'/data/model_data/tci_2_3_'+str(date.year)+'_'+str(date.month)+'.csv', index=False)

In [11]:
# Column names, non-null counts and dtypes of `df`, i.e. the frame built in
# the final iteration of the snapshot loop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 16 columns):
parcel             113132 non-null object
vacant             113132 non-null int64
Date               113132 non-null datetime64[ns]
Survey Category    113132 non-null object
Survey Grade       113132 non-null object
SPA_NAME           113132 non-null object
active_fc          113132 non-null float64
fc_1yr             113132 non-null float64
fc_2yr             113132 non-null float64
fc_days            113132 non-null float64
sa_days            113132 non-null float64
t_days             113132 non-null int64
t_shf_deed         113132 non-null int64
t_num              113132 non-null float64
al_num             113132 non-null float64
al_days            113132 non-null int64
dtypes: datetime64[ns](1), float64(7), int64(4), object(4)
memory usage: 14.7+ MB
