In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import datetime as dt
%matplotlib inline
sns.set_context('notebook')

# Project root = two directory levels above the notebook's working directory.
# os.path is used instead of splitting on a literal '/', so the result does not
# depend on the platform's path separator. `path` stays a plain str because
# later cells build file names with `path + '/data/...'`.
path = os.path.dirname(os.path.dirname(os.getcwd()))
print(path)

# Survey table; `Date` is parsed as a datetime. Only the columns listed in
# `cols` are kept for the rest of the notebook.
tci = pd.read_csv(os.path.join(path, 'data', 'model_data', 'tci_1_0.csv'),
                  parse_dates=['Date'], dtype={'PIN': str})
cols = ['parcel','vacant','Date','Survey Category','Survey Grade','SPA_NAME']
tci = tci[cols]

# Lookup helpers: the set of surveyed parcel numbers, and parcel -> survey date
# (if a parcel appears more than once, the last row's date wins).
ppns = set(tci['parcel'])
dates = dict(zip(tci.parcel, tci.Date))

print(tci.shape)

/Volumes/Dropbox/largetransfer/luc/carter
(113132, 6)


In [2]:
# Load the cleaned event tables from <path>/data/clean_data/.
# For the foreclosure and sheriff-auction files the column at position 2 is
# parsed as a date.
fc = pd.read_csv(path + '/data/clean_data/foreclosure_filings2.csv', parse_dates=[2])
sa = pd.read_csv(path + '/data/clean_data/sheriff_auction.csv',
                 encoding="ISO-8859-1", parse_dates=[2])

# Transfers: columns are read as text, `mdate` is parsed as a date, and the
# rows are ordered newest transfer first (later cells rely on this order).
t = (
    pd.read_csv(path + '/data/clean_data/transfers.csv', dtype=str, parse_dates=['mdate'])
    .sort_values('mdate', ascending=False)
)

# Arms-length sales: every column read as text, exact duplicate rows removed.
al = pd.read_csv(path + '/data/clean_data/armslength.csv', dtype=str).drop_duplicates()

# parse armslength dates
# Month abbreviation -> month number (JAN=1 ... DEC=12).
months = {
    abbr: number
    for number, abbr in enumerate(
        ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC'], 1)
}

def parse_date(x):
    """Convert a 'DDMONYYYY' string such as '01JAN2006' into a datetime.

    Characters 0-1 are read as the day, characters 2-4 are looked up in
    `months` (upper-case three-letter abbreviations), and everything from
    character 5 onward is read as the year.

    Raises ValueError if the day or year is not an integer and KeyError if
    the month abbreviation is not in `months`.
    """
    day, month, year = int(x[:2]), months[x[2:5]], int(x[5:])
    return dt.datetime(year, month, day)

# Add a parsed `date` column built from the raw `mdate` text, then order the
# arms-length sales from oldest to newest.
al = al.assign(date=al['mdate'].apply(parse_date)).sort_values('date')

def parse_amount(x):
    """Turn an amount such as '$1,234.50' into a float.

    The value is converted to text, its first character is discarded, and
    thousands separators (commas) are removed before the float conversion.
    """
    without_prefix = str(x)[1:]
    return float(without_prefix.replace(',', ''))

## Foreclosure data

In [3]:
df = tci.copy()

# foreclosures
# Keep only filings dated before the parcel's survey date. The survey date is
# looked up with a vectorised Series.map; a filing whose parcel is not in
# `dates` gets NaT, compares False and is dropped (a row-by-row dict lookup
# would raise KeyError for it instead).
fc_survey_date = pd.to_datetime(fc.parcel.map(dates))
fc_copy = fc.loc[fc.filedate < fc_survey_date].sort_values('filedate')
# One row per case: the last non-null value of each column in filing-date order.
fc_copy = fc_copy.groupby('caseno').last()
fc_copy = pd.merge(fc_copy, tci[['parcel','Date']].set_index('parcel'), how='left',left_on='parcel',right_index=True)

# is it an active foreclosure
# (stored as 1.0 / 0.0, matching the float flags produced by assign-then-fill)
active_parcels = fc_copy.loc[fc_copy.status=='Active','parcel']
df['active_fc'] = df.parcel.isin(active_parcels).astype(float)

# has it had a foreclosure in the last year / the last 2 years
for n_years in (1, 2):
    filed_recently = fc_copy.filedate > (fc_copy.Date - pd.DateOffset(years=n_years))
    df['fc_%dyr' % n_years] = df.parcel.isin(fc_copy.loc[filed_recently, 'parcel']).astype(float)

# days since last foreclosure
recent_fc = fc_copy.sort_values('filedate').groupby('parcel').last().reset_index()
fc_days = recent_fc[['parcel','filedate','Date']].copy()

# .dt.days is the vectorised way to get whole days from a timedelta column.
fc_days['fc_days'] = (fc_days.Date - fc_days.filedate).dt.days
fc_days = fc_days.set_index('parcel')
df = pd.merge(df, fc_days[['fc_days']], left_on='parcel',right_index=True, how='left')
# Parcels with no prior foreclosure get one day more than the largest observed gap.
df['fc_days'] = df['fc_days'].fillna(df['fc_days'].max() + 1)

In [4]:
# sheriff's auction

# Attach each parcel's survey Date to the auction records (left join on parcel).
sa_copy = pd.merge(sa, tci[['parcel','Date']].set_index('parcel'), how='left',left_on='parcel',right_index=True)

# Keep only auctions held before the survey date; rows without a survey Date
# compare False and are dropped.
sa_copy = sa_copy.loc[sa_copy.salesdt < sa_copy.Date]

# days since last sheriff's auction
recent_sa = sa_copy.sort_values('salesdt').groupby('parcel').last().reset_index()
sa_days = recent_sa[['parcel','salesdt','Date']].copy()
# .dt.days is the vectorised way to get whole days from a timedelta column.
sa_days['sa_days'] = (sa_days.Date - sa_days.salesdt).dt.days
sa_days = sa_days.set_index('parcel')
df = pd.merge(df, sa_days[['sa_days']], left_on='parcel',right_index=True, how='left')
# Parcels with no prior auction get one day more than the largest observed gap.
df['sa_days'] = df['sa_days'].fillna(df['sa_days'].max() + 1)

## Transfers

In [5]:
# Attach each parcel's survey Date to the transfer records (left join on parcel number).
t_copy = pd.merge(t, tci[['parcel','Date']].set_index('parcel'), how='left',left_on='PROPERTY_NUMBER',right_index=True)
# Keep transfers after 2000-01-01 and before the survey date.
t_copy = t_copy.loc[(t_copy.mdate < t_copy.Date) & (t_copy.mdate > dt.datetime(2000,1,1))]
# days since transfer
# `t` was sorted newest-first when it was loaded, so first() per parcel picks
# the most recent qualifying transfer.
days_since_transfer = t_copy.groupby('PROPERTY_NUMBER').first()
# .dt.days is the vectorised way to get whole days from a timedelta column.
days_since_transfer['t_days'] = (days_since_transfer.Date - days_since_transfer.mdate).dt.days
df = pd.merge(df, days_since_transfer[['t_days']], left_on='parcel', right_index=True,how='left')
# Parcels with no qualifying transfer get one day more than the largest observed gap.
df['t_days'] = df['t_days'].fillna(df['t_days'].max() + 1)


In [6]:
# days since transfer with sheriff's deed type
shf_start = dt.datetime(2006,1,1)
is_shf_deed = (t_copy.DEED_TYPE=='Sheriffs Deed') & (t_copy.mdate > shf_start)
# t_copy keeps the newest-first order of `t`, so first() is the most recent deed.
recent_t = t_copy[is_shf_deed].groupby('PROPERTY_NUMBER').first()
df = pd.merge(df, recent_t[['mdate']], how='left', left_on='parcel', right_index=True)
# Parcels without a sheriff's deed since 2006 are dated 2006-01-01, so their
# value becomes the number of days from 2006-01-01 to the survey date.
df['mdate'] = df['mdate'].fillna(pd.Timestamp(shf_start))
# .dt.days is the vectorised way to get whole days from a timedelta column.
df['mdate'] = (df.Date - df.mdate).dt.days
df = df.rename(columns={'mdate':'t_shf_deed'})

In [7]:
# average number of transfers per year since 2006
t_num_start = dt.datetime(2006,1,1)
# t_copy still holds transfers back to 2000; count only those after 2006-01-01
# so the numerator covers the same window as the denominator below.
transfers_since_start = t_copy.loc[t_copy.mdate > t_num_start]
transfer_counts = transfers_since_start.groupby('PROPERTY_NUMBER')['mdate'].count()
# Parcels with no transfer in the window get a count of 0.
df['t_num'] = df.parcel.map(transfer_counts).fillna(0)
# Years between 2006-01-01 and each parcel's survey date.
per = (df.Date - t_num_start).dt.days/365.0
df['t_num'] = df['t_num']/per

## Armslength sales

In [8]:
# Attach each parcel's survey Date to the arms-length sales (left join on the
# parcel number, so sales for parcels missing from tci keep a null Date).
survey_dates = tci.set_index('parcel')[['Date']]
al_copy = al.merge(survey_dates, how='left', left_on='PROPERTY_NUMBER', right_index=True)

In [9]:
# Keep only sales dated strictly before the parcel's survey date; rows with a
# null survey Date compare False and are dropped.
sold_before_survey = al_copy['date'] < al_copy['Date']
al_copy = al_copy[sold_before_survey]

In [10]:
# armslength
# Number of arms-length sales per year since 2006, one row per parcel.
#
# The per-parcel counts must be divided by a per-parcel number of years.
# Dividing by `per` does not do that: `per` is indexed by df row, and
# DataFrame / Series aligns the Series index against the frame's *columns*,
# producing one column per df row instead of a per-parcel rate. The years are
# therefore derived from the survey Date already carried by al_copy.
al_start = dt.datetime(2006,1,1)
recent_sales = al_copy.loc[al_copy.date > al_start, ['date','PROPERTY_NUMBER','Date']]
sales_by_parcel = recent_sales.groupby('PROPERTY_NUMBER')
# Years between 2006-01-01 and the parcel's survey date, indexed by parcel.
years_observed = (sales_by_parcel['Date'].first() - al_start).dt.days/365.0
# NOTE(review): this rebinds al_copy from the row-level sales frame to a
# per-parcel table with the single column 'date' (sales per year).
al_copy = sales_by_parcel[['date']].count().div(years_observed, axis=0)
# number of armslength sales per year since 2006
# df = pd.merge(df, al_copy, how='left', left_on='parcel', right_index=True)
# df = df.rename(columns = {'date':'al_num'})
# df.loc[df.al_num.isnull(),'al_num'] = 0

KeyboardInterrupt: 

In [None]:


# days since most recent sale since 2006
# Take the last `date` per parcel from al_copy and left-join it onto df.
# NOTE(review): this needs al_copy to still have a PROPERTY_NUMBER column
# (i.e. the row-level sales frame); no 2006 cut-off is applied here and the
# merged column keeps the name 'date' -- confirm the intended follow-up step.
last_sale = al_copy[['date','PROPERTY_NUMBER']].groupby('PROPERTY_NUMBER').last()
df = df.merge(last_sale, how='left', left_on='parcel', right_index=True)

In [None]:
# Arms-length sale features on tci: al_num, al_date, al_days.
#
# `months` / `parse_date` and the parsed, date-sorted `al['date']` column come
# from the cell that loads `al`, so they are not defined a second time here.
al_start = dt.datetime(2006,1,1)

# Keep only sales recorded before the parcel's survey date. The survey date is
# looked up with a vectorised Series.map; a sale whose parcel is not in `dates`
# gets NaT, compares False and is dropped (a row-by-row dict lookup would raise
# KeyError for it instead).
al_survey_date = pd.to_datetime(al.PROPERTY_NUMBER.map(dates))
al = al.loc[al.date < al_survey_date].sort_values('date')

recent_al = al.loc[al.date > al_start, ['date','PROPERTY_NUMBER']]
recent_al_dates = recent_al.groupby('PROPERTY_NUMBER')['date']

# Work on a copy and assign columns directly so that re-running the cell
# overwrites the three features instead of merging duplicate columns.
tci = tci.copy()

# number of sales per year since 2006
# (count of sales after 2006-01-01 divided by the years between 2006-01-01 and
# the survey date; parcels without such a sale get 0)
years_observed = (tci.Date - al_start).dt.days/365.0
tci['al_num'] = (tci.parcel.map(recent_al_dates.count())/years_observed).fillna(0)

# days since most recent sale since 2006
# (parcels without such a sale are dated 2006-01-01)
tci['al_date'] = pd.to_datetime(tci.parcel.map(recent_al_dates.max())).fillna(pd.Timestamp(al_start))
tci['al_days'] = (tci.Date - tci.al_date).dt.days


In [None]:
# Display the (rows, columns) shape of tci as a sanity check.
tci.shape

In [55]:
# Columns to export: every column of tci except 'vacant' and 'Date'
# (list.remove raises ValueError if either name is absent).
cols = tci.columns.tolist()
for dropped in ('vacant', 'Date'):
    cols.remove(dropped)
print(cols)

['parcel', 'active_fc', 'fc_1yr', 'fc_2yr', 'fc_days', 'sa_days', 't_days', 't_shf_deed', 't_num', 'al_num', 'al_date', 'al_days']


In [50]:
# Replace every remaining missing value in tci with 0 (applies to all columns).
tci = tci.fillna(0)

In [58]:
# Write the selected columns to <path>/data/model_data/tci_2_3.csv without the index.
# NOTE(review): the foreclosure / sheriff's-auction / transfer features are added
# to `df` in the cells above, while this exports `tci` -- confirm that `tci`
# carries those columns before relying on the written file.
tci[cols].to_csv(path+'/data/model_data/tci_2_3.csv',index=False)

In [57]:
# Summarise dtypes and non-null counts of the columns selected for export.
tci[cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113132 entries, 0 to 113131
Data columns (total 12 columns):
parcel        113132 non-null object
active_fc     113132 non-null float64
fc_1yr        113132 non-null float64
fc_2yr        113132 non-null float64
fc_days       113132 non-null float64
sa_days       113132 non-null float64
t_days        113132 non-null int64
t_shf_deed    113132 non-null float64
t_num         113132 non-null float64
al_num        113132 non-null float64
al_date       113132 non-null datetime64[ns]
al_days       113132 non-null int64
dtypes: datetime64[ns](1), float64(8), int64(2), object(1)
memory usage: 11.2+ MB
