# Filter data used in vacancy model based on TCI parcel numbers

Parcels were surveyed in summer 2015, so all data pulled should come from before that. The goal of this notebook is to filter each dataset down to the parcel numbers in the TCI survey, although we will filter again later based on the existence of a structure.

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime as dt
%matplotlib inline

# Project root: the current working directory with its last component removed.
# Every data path in this notebook is built by appending to `path`.
cwd_parts = os.getcwd().split('/')
path = '/'.join(cwd_parts[:-1])
print(path)

/Volumes/Dropbox/largetransfer/luc/carter


In [2]:
# tci = pd.read_excel(path+'/data/inspection_data/Cleveland_Final_Results_Table_FOR_DISTRIBUTION_20151111.xlsx', encoding="ISO-8859-1") 

# def get_vacant(x):
#     if x == 'Occupied Structure':
#         return 0
#     elif x == 'Vacant Structure':
#         return 1
#     else: 
#         return -1
    
# tci['vacant'] = tci['Survey Category'].apply(get_vacant)
# tci['parcel'] = tci.PIN.apply(lambda x: x[0:3]+'-'+x[3:5]+'-'+x[5:])

# tci[(tci.USE_CLASS=='R') & (tci.vacant>-1)].to_csv(path+'/data/model_data/tci_1_0.csv', index=False)

In [3]:
# Load the cleaned TCI survey table and collect the parcel numbers used to
# filter every other dataset.
tci = pd.read_csv(
    path + '/data/model_data/tci_1_0.csv',
    parse_dates=['Date'],
    dtype={'Ward': object, 'PIN': str},
)

# Keep USE_CLASS 'R' rows (presumably residential - confirm) whose vacancy
# flag is 0 or 1; -1 marks rows that are neither occupied nor vacant structures.
is_model_parcel = (tci['USE_CLASS'] == 'R') & (tci['vacant'] > -1)
ppns = set(tci.loc[is_model_parcel, 'parcel'])

### Demographic data

In [14]:
# Inspect the dimensions of `tracts`. NOTE(review): `tracts` is created in the
# next cell, so that cell has to be run before this one.
tracts.shape

(177527, 43)

In [13]:
# Join the sociodemographic table onto clv_par_census.csv by census tract and
# save a positional subset of the merged columns.
tracts = pd.read_csv(path + '/data/original_data/clv_par_census.csv')
demo = pd.read_csv(path + '/data/original_data/sociodemographic_Data.csv')

# Left join: rows of `tracts` are kept even when no demographic row matches.
tracts = tracts.merge(demo, how='left', left_on='NAME10', right_on='Census Tract')

# Integer positions (not names) of the merged columns to export.
cols = [
    0, 1, 5, 7, 9, 11, 13, 14, 15, 16, 17, 18,
    20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
]
demographic = tracts.iloc[:, cols]
demographic.to_csv(path + '/data/clean_data/demographic.csv', index=False)

## Property characteristics

In [48]:
# infile = path+'/data/original_data/main_prop.csv'
# outfile = path+'/data/clean_data/main_prop13.csv'

# with open(infile, 'r', encoding="ISO-8859-1") as fin, open(outfile, 'w') as fout:
#     write_to = csv.writer(fout, lineterminator='\n')
#     header = next(csv.reader(fin))
#     write_to.writerow(header)
#     for row in csv.reader(fin):
#         if row[0] in ppns:
#             write_to.writerow(row)

In [5]:
# The commented-out cell above (which writes main_prop13.csv) only needs to be run once.

# Everything is read as strings (dtype=object), so taxyr is compared to '2013'/'2014' below.
main = pd.read_csv(path + '/data/clean_data/main_prop13.csv', dtype=object)

# Remove exact duplicate rows, then restrict to parcels in the TCI set.
main = main.drop_duplicates()
main = main.loc[main['parcel'].isin(ppns)]

# Collapse each tax year to one row per parcel.
is_2014 = main['taxyr'] == '2014'
is_2013 = main['taxyr'] == '2013'
main14 = main.loc[is_2014].groupby('parcel').first().reset_index()
main13 = main.loc[is_2013].groupby('parcel').first().reset_index()

print(main13.shape, main14.shape)

(113092, 65) (113117, 65)


In [129]:
# Give `condition` a year-specific name in each frame, then attach the 2013
# value to the 2014 records (matched on parcel) and save the result.
main13 = main13.set_index('parcel').rename(columns={'condition': 'condition13'})
main14 = main14.rename(columns={'condition': 'condition14'})

condition13 = main13[['condition13']]
main_filtered = main14.merge(condition13, how='left', left_on='parcel', right_index=True)
# NOTE(review): written with the default index column, unlike most other outputs in this notebook.
main_filtered.to_csv(path + '/data/clean_data/main_prop_filtered.csv')

## Residential characteristics 
Filename: ```res2013.csv```

In [130]:
# This first part only needs to be run once: it reduces res2013.csv to the
# parcels in the TCI set and saves the result as res.csv.
res = pd.read_csv(path + '/data/original_data/res/res2013.csv')
in_tci = res['parcel'].isin(ppns)
res = res.loc[in_tci]
res.to_csv(path + '/data/clean_data/res.csv', index=False)

# res = pd.read_csv(path+'/data/clean_data/res.csv')

## Tax bill
Filenames: ```may15.csv, sep14.csv```

In [9]:
## only needs to be run once to clean taxbill data
# Streams may15.csv row by row and copies the header plus every row whose
# field at index 5 (the value matched against `ppns`) is in the TCI parcel set.

infile = path+'/data/original_data/taxbill/may15.csv'
outfile = path+'/data/clean_data/taxbill_may15.csv'
parcel_col = 5

# newline='' hands line-ending handling to the csv module, as its docs require;
# without it, quoted fields containing embedded newlines are not read correctly.
with open(infile, 'r', newline='') as fin, open(outfile, 'w', newline='') as fout:
    # A single reader is used for both the header and the data rows.
    reader = csv.reader(fin)
    write_to = csv.writer(fout, lineterminator='\n')
    header = next(reader, None)
    if header is not None:  # an empty input produces an empty output, not StopIteration
        write_to.writerow(header)
        for row in reader:
            # Rows too short to hold the parcel field (e.g. blank lines) are
            # skipped instead of raising IndexError.
            if len(row) > parcel_col and row[parcel_col] in ppns:
                write_to.writerow(row)

In [11]:
## only needs to be run once to clean taxbill data
# Streams sep14.csv row by row and copies the header plus every row whose
# field at index 5 (the value matched against `ppns`) is in the TCI parcel set.

infile = path+'/data/original_data/taxbill/sep14.csv'
outfile = path+'/data/clean_data/taxbill_sep14.csv'
parcel_col = 5

# newline='' hands line-ending handling to the csv module, as its docs require;
# without it, quoted fields containing embedded newlines are not read correctly.
with open(infile, 'r', newline='') as fin, open(outfile, 'w', newline='') as fout:
    # A single reader is used for both the header and the data rows.
    reader = csv.reader(fin)
    write_to = csv.writer(fout, lineterminator='\n')
    header = next(reader, None)
    if header is not None:  # an empty input produces an empty output, not StopIteration
        write_to.writerow(header)
        for row in reader:
            # Rows too short to hold the parcel field (e.g. blank lines) are
            # skipped instead of raising IndexError.
            if len(row) > parcel_col and row[parcel_col] in ppns:
                write_to.writerow(row)

## County land bank
Filename: ```count_land_bank.csv```

In [99]:
# County land bank records: columns 3 and 4 are parsed as dates, and only
# parcels in the TCI set are kept.
lb = (
    pd.read_csv(path + '/data/original_data/count_land_bank.csv', parse_dates=[3, 4])
    .loc[lambda df: df['parcel'].isin(ppns)]
)
# lb = lb[lb['acq_dt']<np.datetime64('2015-06-01')]

lb.to_csv(path + '/data/clean_data/county_lb.csv', index=False)

## Foreclosure filings

In [98]:
# Foreclosure filings: column 2 is parsed as a date, and only parcels in the
# TCI set are kept.
fc = (
    pd.read_csv(path + '/data/original_data/foreclosure_filings2006_beyond.csv', parse_dates=[2])
    .loc[lambda df: df['parcel'].isin(ppns)]
)
# fc = fc[fc['filedate']<np.datetime64('2015-06-01')]

fc.to_csv(path + '/data/clean_data/foreclosure_filings2.csv', index=False)

In [184]:
# fc2 = pd.read_csv(path+'/data/original_data/foreclosure_filings2006_dec2014.csv', parse_dates = [2])

## Sheriff auction
Filename: ```shf_aution_mar2000_dec2014.csv```

In [97]:
# Sheriff auction records: the file is read as ISO-8859-1 with column 2 parsed
# as a date, and only parcels in the TCI set are kept.
sa = (
    pd.read_csv(
        path + '/data/original_data/shf_aution_mar2000_dec2014.csv',
        parse_dates=[2],
        encoding="ISO-8859-1",
    )
    .loc[lambda df: df['parcel'].isin(ppns)]
)
# sa = sa[sa.salesdt<np.datetime64('2015-06-01')]

sa.to_csv(path + '/data/clean_data/sheriff_auction.csv', index=False)

## Transfers
Filename: ```transfers2000_2014.csv```

In [90]:
# Streams transfers2000_2014.csv row by row and copies the header plus every
# row whose field at index 5 (the value matched against `ppns`) is in the TCI
# parcel set.
infile = path+'/data/original_data/transfers2000_2014.csv'
outfile = path+'/data/clean_data/transfers.csv'
parcel_col = 5

# newline='' hands line-ending handling to the csv module, as its docs require;
# without it, quoted fields containing embedded newlines are not read correctly.
with open(infile, 'r', newline='') as fin, open(outfile, 'w', newline='') as fout:
    # A single reader is used for both the header and the data rows.
    reader = csv.reader(fin)
    write_to = csv.writer(fout, lineterminator='\n')
    header = next(reader, None)
    if header is not None:  # an empty input produces an empty output, not StopIteration
        write_to.writerow(header)
        for row in reader:
            # Rows too short to hold the parcel field (e.g. blank lines) are
            # skipped instead of raising IndexError.
            if len(row) > parcel_col and row[parcel_col] in ppns:
                write_to.writerow(row)

In [96]:
# Latest `mdate` value in `tf`. NOTE(review): `tf` is only assigned in the
# commented-out cell below, so that cell must be uncommented and run first.
max(tf.mdate)

Timestamp('2015-03-18 00:00:00')

In [92]:
# tf = pd.read_csv(path+'/data/clean_data/transfers.csv',parse_dates=[8], dtype='str')
# tf = tf[tf.mdate<np.datetime64('2015-06-01')]

# tf.to_csv(path+'/data/clean_data/transfers.csv',index=False)

## Armslength sales
Filename: ```armslengthsales2006_beyond.csv```

In [100]:
# Arms-length sales: every column is read as a string, and only rows whose
# PROPERTY_NUMBER is in the TCI parcel set are kept.
al = (
    pd.read_csv(path + '/data/original_data/armslengthsales2006_beyond.csv', dtype=str)
    .loc[lambda df: df['PROPERTY_NUMBER'].isin(ppns)]
)
al.to_csv(path + '/data/clean_data/armslength.csv', index=False)

# al = pd.read_csv(path+'/data/clean_data/armslength.csv', parse_dates=['mdate'])

## Violations
Filename: ```violate_cle.csv```


In [101]:
# Violations: every column is read as a string, only parcels in the TCI set
# are kept, and the result is written to `outfile`.
infile = path+'/data/original_data/violate_cle.csv'
outfile = path+'/data/clean_data/violations.csv'

v = pd.read_csv(infile, dtype=str)
v = v[v.parcel.isin(ppns)]
# v = v[(v.v_file_date < np.datetime64('2015-06-01')) & (v.v_file_date > np.datetime64('2006-06-01'))]

# Write through `outfile` rather than repeating the path literal, so the two cannot drift apart.
v.to_csv(outfile, index=False)

## Complaints
Filename: ```complaint_cle.csv```

In [102]:
# Complaints: every column is read as a string, and only parcels in the TCI
# set are kept.
c = (
    pd.read_csv(path + '/data/original_data/complaint_cle.csv', dtype=str)
    .loc[lambda df: df['parcel'].isin(ppns)]
)
# c = c[(c.c_file_date < np.datetime64('2015-06-01')) & (c.c_file_date > np.datetime64('2006-06-01'))]

# NOTE(review): written with the default index column, unlike most other outputs in this notebook.
c.to_csv(path + '/data/clean_data/complaints.csv')

## Postal data
Filenames: ```pv201302.csv, pv201304.csv, pv201308.csv, pv201312.csv, pv201402.csv```

In [26]:
# Count the distinct PARCEL values in `p`. NOTE(review): `p` is built in the
# next cell, so that cell has to be run before this one.
len(set(p.PARCEL))

14459

In [23]:
# Stack the postal vacancy extracts (pvYYYYMM.csv), tagging each row with the
# first day of the year/month encoded in its filename, then keep only parcels
# in the TCI set.
postal_dir = path+'/data/original_data/postal/'
frames = []
# sorted() makes the row order reproducible (os.listdir order is arbitrary);
# entries that are not CSV files (e.g. .DS_Store) are ignored.
for pv in sorted(os.listdir(postal_dir)):
    if not pv.lower().endswith('.csv'):
        continue
    pos = pd.read_csv(postal_dir + pv)
    pos['date'] = dt.datetime(int(pv[2:6]), int(pv[6:8]), 1)
    frames.append(pos)
if not frames:
    raise FileNotFoundError('no postal vacancy CSV files found in ' + postal_dir)
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; one concat builds the same stacked frame.
p = pd.concat(frames)
p = p[p.PARCEL.isin(ppns)]
# NOTE(review): the (repeating, per-file) index is still written as a column, as before.
p.to_csv(path+'/data/clean_data/postal_vacancy.csv')