# Analysis of vacancy and foreclosures, transactions, sheriff's auctions, and armslength sales

In [2]:
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import pandas as pd
import numpy as np
import os

%matplotlib inline

path = '/'.join(os.getcwd().split('/')[:-1])
print(path)

/Volumes/Dropbox/largetransfer/luc/carter


In [3]:
df = pd.read_csv(path+'/data/model_data/tci_2_0.csv', index_col = 0)
tci = pd.read_csv(path+'/data/model_data/tci_2_2.csv', parse_dates=['Date'], dtype={'Parcel ID':str})
tci = pd.merge(tci, df[['NAME10','SPA_NAME']], how='left', right_index=True, left_on='parcel')
ppns = set(tci['parcel'])
dates = dict(zip(tci.parcel, tci.Date))
print(tci.shape)

(113132, 39)


  interactivity=interactivity, compiler=compiler, result=result)


## Foreclosures

In [3]:
fc = pd.read_csv(path+'/data/clean_data/foreclosure_filings2.csv', parse_dates=[2])
fc = fc.sort_values('filedate').groupby('caseno').last()
fc = fc.rename(columns={'filedate':'date'}).reset_index()

fc.columns

Index(['caseno', 'parcel', 'status', 'date', 'defendant', 'parcel_address',
       'dateid', 'case_title', 'plaintiff', 'FORE', 'LATITUDE', 'LONGITUDE',
       'zip_fore'],
      dtype='object')

In [4]:
min(fc.date), max(fc.date)

(Timestamp('2006-01-03 00:00:00'), Timestamp('2015-11-06 00:00:00'))

In [5]:
fc_parcel = set(fc.parcel)
tci['fc'] = tci.apply(lambda x: x.parcel in fc_parcel, axis=1)
pd.crosstab(tci.fc, tci.vacant)

vacant,0,1
fc,Unnamed: 1_level_1,Unnamed: 2_level_1
False,79154,3836
True,23940,6202


In [6]:
sum(tci.vacant), sum(tci.fc)

(10038, 30142)

#### Has it been foreclosed before?

So of the 30,142 parcels that have been foreclosed between 1/3/2006 and 11/6/2015, 6,202 are currently vacant, of the 10,038 total residential vacancies as measured by TCI. This means that 3,836 parcels were determined to be vacancy but never have been foreclosed. 

In [7]:
fc_parcel = set(fc[fc.status=='Inactive'].parcel)
tci['fc'] = tci.apply(lambda x: x.parcel in fc_parcel, axis=1)
pd.crosstab(tci.fc, tci.vacant)

vacant,0,1
fc,Unnamed: 1_level_1,Unnamed: 2_level_1
False,79967,4135
True,23127,5903


In [8]:
fc_parcel = set(fc[fc.status=='Active'].parcel)
tci['fc'] = tci.apply(lambda x: x.parcel in fc_parcel, axis=1)
pd.crosstab(tci.fc, tci.vacant)

vacant,0,1
fc,Unnamed: 1_level_1,Unnamed: 2_level_1
False,101689,9455
True,1405,583


#### It is listed as an active or inactive foreclosure?

Active foreclosures make up about 9% of foreclosures, and active foreclosures are vacanty ~40% of the time as compared to ~20% of inactive foreclosures.

#### Plaintiff/defendant

The foreclosure dataset doesn't have plaintiffs, but it does have defendants. Not sure if there is information to mine from the defendants. 

#### Number of times foreclosed

A look at the number of times a parcel has been foreclosed and the likelihood that it is vacant. 5% of parcels that have not been foreclosed are vacant, as compared to ~23% of parcels that have been foreclosed at least once. 

In [9]:
tem = pd.merge(fc[['parcel','status']].groupby('parcel').count(), tci[['parcel','vacant']], how='right', left_index=True, right_on='parcel').fillna(0)
tem = tem[['status','vacant']].groupby('status').agg([sum,len])
tem['percent'] = tem.vacant['sum']/tem.vacant['len']
tem.T

Unnamed: 0,status,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0
vacant,sum,3836.0,4193.0,1579.0,330.0,81.0,14.0,4.0,1.0
vacant,len,82990.0,22045.0,6255.0,1422.0,326.0,71.0,15.0,8.0
percent,,0.046222,0.190202,0.252438,0.232068,0.248466,0.197183,0.266667,0.125


#### Broken down by SPAs

We can break down foreclosures and vacancies by SPAs to see how these relationships differ across neighborhoods. The first two columns, 'sum' and 'len' refer to the number of vacant properties that have been foreclosed (since 2006), and the number of properties that have been foreclosed (total since 2006). For reference is the number of parcels in the SPA and the number of vacant residential buildings in the SPA. 

Also included is the 'percent_true', which is the percent of foreclosed homes that are truely vacant, 'percent_foreclosed', the percent of the parcels in the SPA that have been foreclosed since 2006, 'vacancy_rate', the percent of parcels vacant according to TCI, and 'percent_vacancies_foreclosed', which is the percent of true vacancies that have been foreclosed since 2006.

**NOTE**: This is a count of all foreclosures.

The accuracy of using "has this been foreclosed before" as a variable ranges in accuracy from <10% to >40%. Neighborhoods with more wealth tend to have a lesser rate of foreclosure, and also tend to have vacancies that have a lesser rate of foreclosure, although that number ranges around 40-60%. This number, reflecting the percent of vacancies that have been foreclosed, is higher in east side neighborhoods. 

There are some interesting statistical notes, such as the high predictive power of foreclosures in North Shore Collinwood, which has a much lower vacancy rate than Collinwood-Nottingham, or much more vacancies with foreclosures as compared to Goodrich-Kirkland Pk.

**NOTE**: The best case scenario is to have 'percent_true' to be as high as possible. 'percent_vacancies_foreclosed' is better if it is higher, but that would imply that foreclosures and vacancies are more correlated, so there is a point to which that may be not true.

In [10]:
spas_fc = tci[tci.parcel.isin(fc.parcel)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_fc['number_of_parcels'] = tci.groupby('SPA_NAME').count().parcel
spas_fc['number_of_vacancies'] = tci.groupby('SPA_NAME').agg(sum).vacant
spas_fc['percent_true'] = spas_fc.vacant['sum']*1.0/spas_fc.vacant['len']
spas_fc['percent_foreclosed'] = 1.0*spas_fc.vacant['len']/spas_fc.number_of_parcels
spas_fc['vacancy_rate'] = spas_fc['number_of_vacancies']/spas_fc['number_of_parcels']
spas_fc['percent_vacancies_foreclosed'] = spas_fc.vacant['sum']/spas_fc['number_of_vacancies']

In [11]:
spas_fc.sort_values('percent_true')

Unnamed: 0_level_0,vacant,vacant,number_of_parcels,number_of_vacancies,percent_true,percent_foreclosed,vacancy_rate,percent_vacancies_foreclosed
Unnamed: 0_level_1,sum,len,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPA_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Downtown,0,2,39,1,0.0,0.051282,0.025641,0.0
Kamm's,91,1206,9059,155,0.075456,0.133127,0.01711,0.587097
Old Brooklyn,171,2242,11113,278,0.076271,0.201746,0.025016,0.615108
Jefferson,120,1510,6331,182,0.07947,0.238509,0.028747,0.659341
Bellaire-Puritas,103,1231,5132,151,0.083672,0.239867,0.029423,0.682119
Edgewater,15,149,1106,25,0.100671,0.13472,0.022604,0.6
Tremont,29,270,1976,80,0.107407,0.13664,0.040486,0.3625
West Boulevard,160,1485,5402,228,0.107744,0.274898,0.042207,0.701754
Central,32,292,734,68,0.109589,0.39782,0.092643,0.470588
Lee-Harvard,159,1333,4579,241,0.11928,0.291112,0.052632,0.659751


## Sheriff's Auctions

In [4]:
sa = pd.read_csv(path+'/data/clean_data/sheriff_auction.csv', parse_dates=[2], encoding="ISO-8859-1")

In [9]:
reo = pd.read_csv(path+'/data/extra_data/reo.csv', encoding="utf-8")
reo = reo.groupby('GRANTEE1').last()

In [24]:
temp = pd.merge(sa, reo, how='left',left_on='purchaser', right_index=True)

In [29]:
temp[temp.purchaser.notnull()].shape

(29638, 27)

In [32]:
temp[~temp.purchaser.isin(reo.index)].purchaser

0                                                      NaN
1                                                      NaN
2                                                      NaN
3                                                      NaN
4                                                      N/a
5                                           ROBERT RUSSELL
7                                                      NaN
8                                                      NaN
9                                                      NaN
10                                                     NaN
13                                                     NaN
15                                                     N/a
16                                                     N/a
17                                                     N/a
18       THE BANK OF NEW YORK MELLON FKA THE BANK OF NE...
19                                                     NaN
20       DEUTSCHE BANK TRUST COMPANY AMERICAS AS TRUSTE.

In [26]:
sa.shape

(43055, 25)

(6305083, 28)

In [10]:
reo

Unnamed: 0,GRANTEE1,stdgr1,finalgr1
0,"BANK ONE, N.A.",BANK ONE,LOCAL BANKS
1,WELLS FARGO BANK NA,WELLS FARGO,NONLOCAL BANKS
2,"BENEFICIAL OHIO, INC.",BENEFICIAL OHIO INC,FINANCIAL INSTITUTION
3,DEUTSCH BANK TRUST CO.,DEUTSCHE BANK TRS,NONLOCAL BANKS
4,SEC OF HUD,DEPARTMENT OF HOUSING AND URBAN DEV,GSE
5,FEDERAL NATIONAL MORTGAGE ASSOCIATION,FANNIE MAE,GSE
6,TCIC REO 2 LLC,"TCIF REO1,LLC",FINANCIAL INSTITUTION
7,HOMESALES INC,HOMESALES INC OF DELAWARE,FINANCIAL INSTITUTION
8,DEUTSCHE BANK TRUST COMPANY AMERICAS,DEUTSCHE BANK TRS,NONLOCAL BANKS
9,THIRD FEDERAL SAVINGS AND LOAN ASSOCIATION OF ...,THIRD FEDERAL BANK SAVINGS AND LOAN,LOCAL BANKS


In [25]:
purchaser = sa.groupby('scaseno').last().groupby('purchaser').count().sort_values('parcel',ascending=False)[['parcel']]
purchaser = purchaser.rename(columns={'parcel':'purchaser_count'})
sa = pd.merge(sa, purchaser, how='left', left_on='purchaser',right_index=True)
# sa = sa.sort_values('salesdt').groupby('parcel').last()
sa = sa.rename(columns={'salesdt':'date'})
# sa = sa[['date','plaintiff_count']]
# sa['type'] = 'sa'
# sa = sa.reset_index()
sa['purchaser_count'] = sa['purchaser_count'].fillna(0)
sa.columns

Index(['parcel', 'address', 'date', 'saleno', 'sold_amt', 'appraisal',
       'minbid', 'scaseno', 'withdrawn', 'plaintiff', 'sdefend', 'purchaser',
       'paddress', 'rparty', 'raddress', 'attorney', 'descript', 'location',
       'defend_att', 'with', 'status', 'ssold', 'SHF', 'LATITUDE', 'LONGITUDE',
       'purchaser_count'],
      dtype='object')

In [4]:
plaintiffs = sa.groupby('scaseno').last().groupby('plaintiff').count().sort_values('parcel',ascending=False)[['parcel']]
plaintiffs = plaintiffs.rename(columns={'parcel':'plaintiff_count'})
sa = pd.merge(sa, plaintiffs, how='left', left_on='plaintiff',right_index=True)
# sa = sa.sort_values('salesdt').groupby('parcel').last()
sa = sa.rename(columns={'salesdt':'date'})
# sa = sa[['date','plaintiff_count']]
# sa['type'] = 'sa'
# sa = sa.reset_index()
sa.columns

Index(['parcel', 'address', 'date', 'saleno', 'sold_amt', 'appraisal',
       'minbid', 'scaseno', 'withdrawn', 'plaintiff', 'sdefend', 'purchaser',
       'paddress', 'rparty', 'raddress', 'attorney', 'descript', 'location',
       'defend_att', 'with', 'status', 'ssold', 'SHF', 'LATITUDE', 'LONGITUDE',
       'plaintiff_count'],
      dtype='object')

#### Amount of sheriff's auctions
Similar to foreclosure, a property appearing in the list of sheriff's auctions (unfiltered, and since 2000), increases the likelihood that it is vacant, from 6% to ~20%. Similar to foreclosure, about 6,000 vacant properties show up in the sheriff's auction dataset, which goes back farther.

In [5]:
tem = pd.merge(sa.groupby('scaseno').last().groupby('parcel').count(), tci[['parcel','vacant']], how='right', left_index=True, right_on='parcel').fillna(0)
tem = tem[['plaintiff_count','vacant']].groupby('plaintiff_count').agg([sum,len])
tem['percent'] = tem.vacant['sum']/tem.vacant['len']
tem.T

Unnamed: 0,plaintiff_count,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
vacant,sum,4834.0,3731.0,1142.0,235.0,65.0,18.0,10.0,2.0,1.0
vacant,len,85446.0,21108.0,5112.0,1019.0,298.0,101.0,30.0,16.0,2.0
percent,,0.056574,0.176758,0.223396,0.230618,0.218121,0.178218,0.333333,0.125,0.5


#### Breakdown by SPA

About half of the vacancies in each SPA show up in the sheriff's sales dataset, but unfortunately in many neighborhoods there are plenty of sheriff's sales in these neighborhoods, so its effectiveness as an indicator ranges from 8% to 45% accurate.

Perhaps changing the criteria that we are looking at will help. One possibility is looking at sheriff's sales in the last few years, or perhaps looking at sheriff's sales to banks or other real-estate entities.

In [6]:
spas_sa = tci[tci.parcel.isin(sa.parcel)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_sa['number_of_parcels'] = tci.groupby('SPA_NAME').count().parcel
spas_sa['number_of_vacancies'] = tci.groupby('SPA_NAME').agg(sum).vacant
spas_sa['percent_true'] = spas_sa.vacant['sum']*1.0/spas_sa.vacant['len']
spas_sa['percent_sheriffs'] = 1.0*spas_sa.vacant['len']/spas_sa.number_of_parcels
spas_sa['vacancy_rate'] = spas_sa['number_of_vacancies']/spas_sa['number_of_parcels']
spas_sa['percent_vacancies_sheriffs'] = spas_sa.vacant['sum']/spas_sa['number_of_vacancies']
spas_sa

Unnamed: 0_level_0,vacant,vacant,number_of_parcels,number_of_vacancies,percent_true,percent_sheriffs,vacancy_rate,percent_vacancies_sheriffs
Unnamed: 0_level_1,sum,len,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPA_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bellaire-Puritas,89,1226,5132,151,0.072594,0.238893,0.029423,0.589404
Broadway-Slavic Village,587,2254,7248,1151,0.260426,0.310982,0.158802,0.509991
Brooklyn Centre,72,615,2342,137,0.117073,0.262596,0.058497,0.525547
Buckeye-Shaker Square,167,836,2809,278,0.199761,0.297615,0.098968,0.600719
Buckeye-Woodhill,143,415,1544,260,0.344578,0.268782,0.168394,0.55
Central,27,159,734,68,0.169811,0.216621,0.092643,0.397059
Clark-Fulton,58,586,2146,150,0.098976,0.273066,0.069897,0.386667
Collinwood-Nottingham,282,1313,3697,487,0.214775,0.355153,0.131728,0.579055
Cudell,84,647,2216,155,0.12983,0.291968,0.069946,0.541935
Detroit Shoreway,81,615,2685,191,0.131707,0.22905,0.071136,0.424084


In [23]:
spas_sa = tci[tci.parcel.isin(sa[sa.date>dt.datetime(2011,1,1)].parcel)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_sa['number_of_parcels'] = tci.groupby('SPA_NAME').count().parcel
spas_sa['number_of_vacancies'] = tci.groupby('SPA_NAME').agg(sum).vacant
spas_sa['percent_true'] = spas_sa.vacant['sum']*1.0/spas_sa.vacant['len']
spas_sa['percent_sheriffs'] = 1.0*spas_sa.vacant['len']/spas_sa.number_of_parcels
spas_sa['vacancy_rate'] = spas_sa['number_of_vacancies']/spas_sa['number_of_parcels']
spas_sa['percent_vacancies_sheriffs'] = spas_sa.vacant['sum']/spas_sa['number_of_vacancies']
spas_sa

Unnamed: 0_level_0,vacant,vacant,number_of_parcels,number_of_vacancies,percent_true,percent_sheriffs,vacancy_rate,percent_vacancies_sheriffs
Unnamed: 0_level_1,sum,len,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPA_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bellaire-Puritas,67,435,5132,151,0.154023,0.084762,0.029423,0.443709
Broadway-Slavic Village,216,616,7248,1151,0.350649,0.084989,0.158802,0.187663
Brooklyn Centre,39,204,2342,137,0.191176,0.087105,0.058497,0.284672
Buckeye-Shaker Square,65,260,2809,278,0.25,0.09256,0.098968,0.233813
Buckeye-Woodhill,45,106,1544,260,0.424528,0.068653,0.168394,0.173077
Central,16,99,734,68,0.161616,0.134877,0.092643,0.235294
Clark-Fulton,24,176,2146,150,0.136364,0.082013,0.069897,0.16
Collinwood-Nottingham,103,367,3697,487,0.280654,0.09927,0.131728,0.211499
Cudell,48,201,2216,155,0.238806,0.090704,0.069946,0.309677
Detroit Shoreway,33,181,2685,191,0.18232,0.067412,0.071136,0.172775


#### Plaintiffs
Unlike the foreclosure dataset, we have access to plaintiff's names. So let's take a look at repeated plaintiffs, as they are likely to be banks or larger entities.

In [59]:
spas = tci[tci.parcel.isin(sa[(sa.plaintiff_count>-1)&(sa.date>dt.datetime(2011,1,1))].parcel)][['vacant']]
number_of_parcels = len(tci)
number_of_vacancies = sum(tci.vacant)
percent_true = sum(spas.vacant)*1.0/len(spas)
vacancy_rate = number_of_vacancies/number_of_parcels
percent_vacancies_sheriffs = sum(spas.vacant)/number_of_vacancies

In [60]:
percent_true,percent_vacancies_sheriffs

(0.23582257158899494, 0.20920502092050208)

In [55]:
spas = tci[tci.parcel.isin(sa[(sa.plaintiff.isin(plaintiffs.iloc[0:30].index))&(sa.date>dt.datetime(2011,1,1))].parcel)][['vacant']]
number_of_parcels = len(tci)
number_of_vacancies = sum(tci.vacant)
percent_true = sum(spas.vacant)*1.0/len(spas)
vacancy_rate = number_of_vacancies/number_of_parcels
percent_vacancies_sheriffs = sum(spas.vacant)/number_of_vacancies

In [57]:
percent_true,percent_vacancies_sheriffs

(0.26143790849673204, 0.091651723450886624)

So, although it does increase the accuracy a few percentage points, it halves the number of vacancies that we are identifying. Perhaps additional filters need to be included?

In [8]:
for i in set(sa['with']):
    tem = tci[tci.parcel.isin(sa[sa['with']==i].parcel)]
    print(i, sum(tem.vacant), len(tem.vacant))
#     print(i)

nan 0 0
WITHD 1364 5720
MINBI 65 335
VACAT 0 1
ORDER 227 1142
NOBID 63 248
NOTSO 123 437
SOLD- 397 3101
BANKR 275 1587
.Â  1 3
PLAIN 1 2
SALEV 9 39
. 0 1


In [64]:
sa.columns

Index(['parcel', 'address', 'date', 'saleno', 'sold_amt', 'appraisal',
       'minbid', 'scaseno', 'withdrawn', 'plaintiff', 'sdefend', 'purchaser',
       'paddress', 'rparty', 'raddress', 'attorney', 'descript', 'location',
       'defend_att', 'with', 'status', 'ssold', 'SHF', 'LATITUDE', 'LONGITUDE',
       'plaintiff_count'],
      dtype='object')

In [63]:
spas_sa = tci[tci.parcel.isin(sa[(sa.plaintiff_count>-1)&(sa.date>dt.datetime(2011,1,1))].parcel)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_sa['number_of_parcels'] = tci.groupby('SPA_NAME').count().parcel
spas_sa['number_of_vacancies'] = tci.groupby('SPA_NAME').agg(sum).vacant
spas_sa['percent_true'] = spas_sa.vacant['sum']*1.0/spas_sa.vacant['len']
spas_sa['percent_sheriffs'] = 1.0*spas_sa.vacant['len']/spas_sa.number_of_parcels
spas_sa['vacancy_rate'] = spas_sa['number_of_vacancies']/spas_sa['number_of_parcels']
spas_sa['percent_vacancies_sheriffs'] = spas_sa.vacant['sum']/spas_sa['number_of_vacancies']
spas_sa

Unnamed: 0_level_0,vacant,vacant,number_of_parcels,number_of_vacancies,percent_true,percent_sheriffs,vacancy_rate,percent_vacancies_sheriffs
Unnamed: 0_level_1,sum,len,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPA_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bellaire-Puritas,67,435,5132,151,0.154023,0.084762,0.029423,0.443709
Broadway-Slavic Village,216,616,7248,1151,0.350649,0.084989,0.158802,0.187663
Brooklyn Centre,38,203,2342,137,0.187192,0.086678,0.058497,0.277372
Buckeye-Shaker Square,65,260,2809,278,0.25,0.09256,0.098968,0.233813
Buckeye-Woodhill,45,106,1544,260,0.424528,0.068653,0.168394,0.173077
Central,16,99,734,68,0.161616,0.134877,0.092643,0.235294
Clark-Fulton,24,176,2146,150,0.136364,0.082013,0.069897,0.16
Collinwood-Nottingham,103,367,3697,487,0.280654,0.09927,0.131728,0.211499
Cudell,48,201,2216,155,0.238806,0.090704,0.069946,0.309677
Detroit Shoreway,33,181,2685,191,0.18232,0.067412,0.071136,0.172775


In [62]:
spas_sa = tci[tci.parcel.isin(sa[(sa.plaintiff_count>50)&(sa.date>dt.datetime(2011,1,1))].parcel)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_sa['number_of_parcels'] = tci.groupby('SPA_NAME').count().parcel
spas_sa['number_of_vacancies'] = tci.groupby('SPA_NAME').agg(sum).vacant
spas_sa['percent_true'] = spas_sa.vacant['sum']*1.0/spas_sa.vacant['len']
spas_sa['percent_sheriffs'] = 1.0*spas_sa.vacant['len']/spas_sa.number_of_parcels
spas_sa['vacancy_rate'] = spas_sa['number_of_vacancies']/spas_sa['number_of_parcels']
spas_sa['percent_vacancies_sheriffs'] = spas_sa.vacant['sum']/spas_sa['number_of_vacancies']
spas_sa

Unnamed: 0_level_0,vacant,vacant,number_of_parcels,number_of_vacancies,percent_true,percent_sheriffs,vacancy_rate,percent_vacancies_sheriffs
Unnamed: 0_level_1,sum,len,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPA_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bellaire-Puritas,41,234,5132,151,0.175214,0.045596,0.029423,0.271523
Broadway-Slavic Village,149,400,7248,1151,0.3725,0.055188,0.158802,0.129453
Brooklyn Centre,22,115,2342,137,0.191304,0.049103,0.058497,0.160584
Buckeye-Shaker Square,42,161,2809,278,0.26087,0.057316,0.098968,0.151079
Buckeye-Woodhill,28,63,1544,260,0.444444,0.040803,0.168394,0.107692
Central,10,65,734,68,0.153846,0.088556,0.092643,0.147059
Clark-Fulton,17,99,2146,150,0.171717,0.046132,0.069897,0.113333
Collinwood-Nottingham,65,228,3697,487,0.285088,0.061672,0.131728,0.13347
Cudell,30,119,2216,155,0.252101,0.0537,0.069946,0.193548
Detroit Shoreway,22,101,2685,191,0.217822,0.037616,0.071136,0.115183


In [8]:
t = tci[tci.parcel.isin(sa[(sa.date > dt.datetime(2013,1,1))&(sa.status=='FORFEITED TO PLTF TAX CERT')].parcel)].vacant
sum(t), len(t)

(145, 463)

In [None]:
sa.groupby('status').count()

In [None]:
al = pd.read_csv(path+'/data/clean_data/armslength.csv',dtype=str)
al = al.drop_duplicates()

months = dict(zip(['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC'],range(1,13)))

def parse_date(x):
    day = int(x[0:2])
    month = months[x[2:5]]
    year = int(x[5:])
    return dt.datetime(year,month,day)

al['date'] = al.mdate.apply(parse_date)
# al = al[al.date < dt.datetime(2015,6,1)]

al = al.rename(columns={'PROPERTY_NUMBER':'parcel'})
al = al[['date','parcel']]
al['type'] = 'al'
al['plaintiff_count'] = 0

In [33]:
t = pd.read_csv(path+'/data/clean_data/transfers.csv', parse_dates=['mdate'], dtype=str)

# t = t[['PROPERTY_NUMBER','mdate']].rename(columns={'PROPERTY_NUMBER':'parcel','mdate':'date'})
# t['type'] = 't'
# t['REO'] = 0

In [42]:
d

(29638, 25)

In [45]:
temp = pd.merge(t, reo, how='left',left_on='GRANTEE1', right_index=True)

In [50]:
temp[(temp.DEED_TYPE.isin(['Sheriffs Deed',
 'Sheriffs Deed Ex'])) & (temp.finalgr1.isnull())]

Unnamed: 0,TRANSFER_TO_ID,WHS_ID,TRANSFER_HISTORY_ID,TRANSFER_ORDER,PARCEL_ID,PROPERTY_NUMBER,GRANTEE1,GRANTOR1,mdate,SALES_AMOUNT,...,GRANTOR_IS_MORTGAGEE,GRANTOR_IS_RELATIVE,TRADE,TRANSFER_TYPE_DESCR,SALES_SOURCE_DESCR,EXEMPTCODE_DESCR,TRANSFER_FROM_PARCEL,UPDATE_DATE,stdgr1,finalgr1
87,600392350,4282080,1750048626,7,102020,001-02-020,"RUSSELL, ROBERT",LISA BENNETT,2013-06-10,175000.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
329,966105061,4292846,1995885836,15,107031,001-07-031,"CROCK REAL ESTATE INVESTMENTS III, LLC",R & B Properties LTd,2012-09-28,16667.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
416,2141904059,4295038,1099970808,3,107081,001-07-081,Certified Housing Solutions LLC,"Montgomery, Robert E.",2004-02-17,19300.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
432,581378823,4297212,344996951,6,107086,001-07-086,"DIOUISOPOULOS, NICK","Cardona, Marco A.",2003-09-10,59000.0,...,0.0,0.0,0.0,Change Ownership,Buyer,,,07NOV2014,,
489,1577492875,4299408,683803882,5,108048,001-08-048,"MTGLQ INVESTORS, L.P","Pigott, Roger",2008-01-14,30000.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
541,349611494,4301557,848321985,7,108069,001-08-069,CUYAHOGA COUNTY LAND REUTILIZATION CORPORATION,"FRICKE, MICHAEL",2014-05-15,0.0,...,0.0,0.0,0.0,Change Ownership,,USA / State/ Political Subdiv,,07NOV2014,,
549,512444070,4301560,510552324,9,108072,001-08-072,Bank One Nat'L Association,"Bond, Charles V.",2002-09-12,44000.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
624,118140021,4303734,1480650244,7,108103,001-08-103,UBER AFFORDABLE HOMES LTD,"TURNER, MILLICENT K.",2013-05-15,15000.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
684,356651321,4305904,1075700438,10,108134,001-08-134,VERTICAL MORTGAGE FUND I. LLC,SUPREME SYNDICATE LLC,2013-01-14,25000.0,...,0.0,0.0,0.0,Change Ownership,,,,07NOV2014,,
696,1632352795,4305911,1498975322,6,108141,001-08-141,CUYAHOGA COUNTY LAND REUTILIZATION CORPORATION,"ROBINSON, LISA A DAVIDSON",2014-11-18,0.0,...,0.0,0.0,0.0,Change Ownership,,USA / State/ Political Subdiv,,21NOV2014,,


In [102]:
fc2 = fc[['date','parcel']]
fc2['type'] = 'fc'
fc2['REO'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [103]:
sa2 = sa[['date','REO','parcel','sold_amt']]
sa2['type'] = 'sa'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [9]:
max(fc.date), max(sa.date), max(t.date)

(Timestamp('2015-11-06 00:00:00'),
 Timestamp('2015-12-07 00:00:00'),
 Timestamp('2015-03-18 00:00:00'))

In [51]:
df = fc2.append(sa2.loc[sa2.sold_amt.notnull(),['date','REO','parcel','type']]).append(t)
df = df.sort_values('date')
df = df.loc[df.apply(lambda x: x.date < dates[x.parcel], axis=1)]

NameError: name 'fc2' is not defined

In [105]:
last = df.groupby('parcel').last()
sub = last[(last['type'] == 'sa')]#&(last['plaintiff_count']>-1)]

In [116]:
t = tci[tci.parcel.isin(sa[sa.REO].parcel)].vacant
sum(t),len(t)

(3617, 20438)

In [106]:
test = set(last[(last['type'] == 'sa') & (last['REO']==True)].index)
sum(tci[tci.parcel.isin(test)].vacant), len(tci[tci.parcel.isin(test)].vacant)

(350, 907)

In [61]:
test = set(sub.index)
sum(tci[tci.parcel.isin(test)].vacant), len(tci[tci.parcel.isin(test)].vacant)

(400, 1074)

In [122]:
tci[tci.parcel=='132-01-031']

Unnamed: 0,PIN,Date,USE_CLASS,USE_DESC,LUC,LUC_Description,Generalized Land Use,Ward,Label,SPA_NAME_x,...,Notes,Image,vacant,parcel,v_total_1yr,v_total_2yr,c_ovv_2yr,c_ovv_1yr,NAME10,SPA_NAME_y
91231,13201031,2015-09-21 14:37:00,R,RESIDENTIAL,5100,1-FAMILY PLATTED LOT,Residential,12,12J,Broadway-Slavic Village,...,,https://wdwot.s3.amazonaws.com/blextoid/56004e...,0,132-01-031,0,0,0,0,1152,Broadway-Slavic Village


In [121]:
fc[fc.parcel=='132-01-031']

Unnamed: 0,caseno,parcel,status,date,defendant,parcel_address,dateid,case_title,plaintiff,FORE,LATITUDE,LONGITUDE,zip_fore
22386,CV-10-720212,132-01-031,Inactive,2010-03-04,"LAURA E. STEWART, ET AL",6215 KENYON AVENUE,11/06/2015,,.,1,41.461606,-81.646362,44105
30661,CV-12-783415,132-01-031,Inactive,2012-05-24,"LAURA E. STEWART, ET AL",6215 KENYON AVENUE,11/06/2015,,.,1,41.461606,-81.646362,44105
39003,CV-15-841186,132-01-031,Inactive,2015-02-27,"LAURA E. STEWART, ET AL",6215 KENYON AVENUE,11/06/2015,,.,1,41.461606,-81.646362,44105


In [119]:
df[df.parcel=='132-01-031']

Unnamed: 0,REO,date,parcel,type
32758,1,2004-01-05,132-01-031,sa
169227,0,2004-05-04,132-01-031,t
169228,0,2006-02-28,132-01-031,t
22386,0,2010-03-04,132-01-031,fc
30661,0,2012-05-24,132-01-031,fc
39003,0,2015-02-27,132-01-031,fc
32759,1,2015-09-21,132-01-031,sa


In [97]:
import re


def find_REO(s):
    if isinstance(s,str):
        s = s.lower()
        if re.search("llc", s):
            return True
        if re.search("bank", s):
            return True
        if re.search("mortg", s):
            return True
        if re.search("mort.", s):
            return True
        if re.search("comp", s):
            return True
        if re.search("corp", s):
            return True
        if re.search("fannie", s):
            return True
        if re.search("housing", s):
            return True
        if re.search("sec.", s):
            return True
        if re.search("loan", s):
            return True
        return False
    else:
        return False

In [90]:
sa.groupby('purchaser').count().sort_values('date',ascending=False)

Unnamed: 0_level_0,parcel,address,date,saleno,sold_amt,appraisal,minbid,scaseno,withdrawn,plaintiff,...,descript,location,defend_att,with,status,ssold,SHF,LATITUDE,LONGITUDE,purchaser_count
purchaser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N/a,2419,2419,2419,2419,2,2418,2417,2419,2419,2419,...,2410,2419,2419,2419,0,2419,2419,2418,2418,2419
FEDERAL NATIONAL MORTGAGE ASSOCIATION,1601,1601,1601,1601,1600,1478,1478,1601,531,1601,...,1598,951,530,531,420,1601,1601,1601,1601,1601
FEDERAL HOME LOAN MORTGAGE CORPORATION,902,902,902,902,902,820,820,902,328,902,...,897,516,327,328,188,902,902,900,900,902
SECRETARY OF HOUSING AND URBAN DEVELOPMENT,447,446,447,447,447,196,196,447,2,447,...,443,3,0,2,0,447,447,447,447,447
THIRD FEDERAL SAVINGS AND LOAN ASSOCIATION OF CLEVELAND,400,400,400,400,400,340,340,400,92,400,...,399,191,91,92,99,400,400,400,400,400
"FANNIE MAE, AKA, FEDERAL NATIONAL MORTGAGE ASSOCIATION, ITS SUCCESSORS AND ASSIGNS",287,287,287,287,286,287,287,287,184,287,...,287,265,184,184,81,287,287,287,287,287
"CITIMORTGAGE, INC.",193,193,193,193,193,188,188,193,55,193,...,193,100,55,55,45,193,193,193,193,193
"WELLS FARGO BANK, NA",189,189,189,189,189,185,185,189,70,189,...,188,142,70,70,72,189,189,189,189,189
FANNIE MAE AKA FEDERAL NATIONAL MORTGAGE ASSOCIATION,186,186,186,186,186,138,138,186,1,186,...,185,0,0,1,0,186,186,186,186,186
"WELLS FARGO BANK, N.A.",186,186,186,186,186,152,152,186,13,186,...,185,43,13,13,30,186,186,186,186,186


In [100]:
sa['REO'] = sa.purchaser.apply(find_REO)

In [110]:
len(sa[sa.purchaser.notnull()])-sum(sa.REO)

6662

In [108]:
sum(sa.REO)

22976

In [12]:
spas_sa = tci[tci.parcel.isin(test)][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])
spas_sa['percent'] = spas_sa.vacant['sum']*1.0/spas_sa.vacant['len']
print(spas_sa.sort_values('percent'))

                        vacant        percent
                           sum  len          
SPA_NAME                                     
University                   1   11  0.090909
Edgewater                    1    9  0.111111
Tremont                      4   24  0.166667
Jefferson                   25  143  0.174825
Clark-Fulton                10   55  0.181818
Detroit Shoreway            11   60  0.183333
Kamm's                      20   95  0.210526
Ohio City                    5   23  0.217391
Stockyards                  14   61  0.229508
Old Brooklyn                44  191  0.230366
Lee-Harvard                 36  149  0.241611
North Shore Collinwood      40  157  0.254777
West Boulevard              36  136  0.264706
Central                      9   33  0.272727
Bellaire-Puritas            31  109  0.284404
Euclid-Green                23   78  0.294872
Fairfax                     18   59  0.305085
Goodrich-Kirtland Pk         4   13  0.307692
Lee-Seville                 29   9

In [11]:
for i in spas_sa.index:
    a = sa[sa.parcel.isin(test.intersection(tci[tci.SPA_NAME==i].parcel))]
    spas_sa.loc[spas_sa.index==i,'median_sa'] = a.sort_values('date')['date'].iloc[round(len(a)/2)]
#     print(i,a.sort_values('date')['date'].iloc[round(len(a)/2)])
#     print(np.median(sa[sa.parcel.isin(test.intersection(tci[tci.SPA_NAME==i].parcel))].date))

NameError: name 'spas_sa' is not defined

In [None]:
spas_sa

In [None]:
tci[tci.parcel.isin(set(fc.parcel))][['SPA_NAME','vacant']].groupby('SPA_NAME').agg([sum,len])

In [None]:
len(tci.loc[tci.vacant==1,'parcel'][tci.loc[tci.vacant==1,'parcel'].isin(set(sa.parcel))])

In [None]:
sum(tci['vacant']), len(tci)

In [None]:
sum(tci[tci.parcel.isin(sub)].vacant), len(tci[tci.parcel.isin(sub)].vacant)

In [None]:
last = df.groupby('parcel').last()
sub = set(last[last['type'] == 'sa'].index)

In [None]:
second_last = df.groupby('parcel').nth(-2)
sub2 = set(second_last[second_last['type']=='fc'].index)

In [None]:
len(sub.intersection(sub2)), len(sub), len(sub2)

In [None]:
fc_to_sa = sub.intersection(sub2)

In [None]:
sum(tci[tci.parcel.isin(fc_to_sa)].vacant), len(tci[tci.parcel.isin(fc_to_sa)].vacant)

In [None]:
df[df.parcel=='137-25-057']

In [None]:
a = second_last[second_last.index.isin(sub)]

In [None]:
pitt_vacant = set(second_last[second_last.index.isin(sub)].index)

In [None]:
sum(tci[tci.parcel.isin(set(a.index))].vacant), len(tci[tci.parcel.isin(set(a.index))].vacant)

In [None]:
sum(tci[(tci.parcel.isin(pitt_vacant))&(tci.parcel.isin(pitt))].vacant), len(tci[tci.parcel.isin(pitt_vacant)].vacant)

In [None]:
sum(tci.vacant)

In [None]:
1334.0/ 3892

In [None]:
temp = df[df.parcel.isin(pitt_vacant)].groupby('parcel').agg(lambda x: (x.date.iloc[-1]-x.date.iloc[-2]).days)

In [None]:
np.median(temp.date)

In [None]:
sns.distplot(temp.date);

In [None]:
pv = pd.read_csv(path+'/data/clean_data/postal_vacancy.csv', parse_dates=3, index_col=0)
pv.date = pv.date.apply(lambda x: dt.datetime(int(x[0:4]),int(x[5:7]),int(x[8:10])))
pv = pv.sort_values('date',ascending=False)
pv = pv.loc[pv.apply(lambda x: x.date < dates[x.PARCEL], axis=1)]

In [None]:
pv = pv[pv.vindall=='Y']
pv_copy = pv.copy().groupby('PARCEL').count().reset_index()

pv_copy['pv_count'] = pv_copy.apply(lambda x: \
        len(pv[(pv.PARCEL == x.PARCEL) & (pv.date > dates[x.PARCEL]-pd.DateOffset(years=2))]), axis=1)

In [None]:
pv2 = pv.groupby('PARCEL').last()

In [None]:
tci[tci.parcel.isin(pv2[pv2.vindall=='Y'].index)].groupby('vacant').count()

In [None]:
sns.distplot(df2[df2.vacant==1].date,bins=np.linspace(0,30,30));
sns.distplot(df2[df2.vacant==0].date,bins=np.linspace(0,30,30));

In [None]:
df2 = pd.merge(df.groupby('parcel').last(), tci[['vacant','ppn']], how='left',left_index=True, right_on='ppn')

In [None]:
df2[['type','vacant']].groupby('type').agg([sum,len])

In [None]:
d = df2[df2['type']=='fc']
d['year'] = d.date.apply(lambda x: x.year)

In [None]:
d[['year','vacant']].groupby('year').agg([sum,len])

In [None]:
plt.plot(fc.set_index('date').groupby(['type']).resample('m', 'count'))

In [None]:
t = tci[tci.ppn.isin(fc[fc.date > dt.datetime(2015,3,1)].parcel)].vacant

In [None]:
sum(t),len(t)

In [None]:
fc['year'] = fc['date']