# Generate data on TCI Parcels

Parcels surveyed in March, 2014 so all data pulled should come from before that.

In [9]:
import pandas as pd
import csv
import numpy as np

## TCI surveyed parcels
* PPN
* Vacant or not

In [10]:
tci = pd.read_csv('original_data/tci.csv')
ppns = set(tci['Parcel Number'])

def vacant(x):
    if x in ['Vacant Structure Open', 'Vacant Structure Secure']:
        return 1
    else:
        return 0
    
tci['vacant'] = tci.Category.apply(vacant)
# tci.to_csv('clean_data/tci.csv', index=False)

In [3]:
sum(tci.vacant), len(tci)

(1555, 13500)

## Residential characteristics 
Filename: ```res2013.csv```

In [4]:
res = pd.read_csv('original_data/res2013.csv')

In [5]:
res = res[res.parcel.isin(ppns)]

res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10807 entries, 83077 to 117630
Data columns (total 48 columns):
PROPERTY_NUMBER     10807 non-null object
bldgrecnum          10807 non-null int64
linkid              10807 non-null int64
WHS_ID              10807 non-null int64
occup               10807 non-null object
STYLE               10807 non-null object
rnumstor            10807 non-null float64
cqual               10807 non-null object
ryrbuilt            10807 non-null float64
eyrbuilt            10806 non-null float64
condtion            10807 non-null object
rextwall            10807 non-null object
rrooftyp            10785 non-null object
roofmat             10799 non-null object
rbasetyp            10807 non-null object
basesqft            10769 non-null float64
BASEMENT_FINISHD    9098 non-null float64
heat                10807 non-null object
air                 10807 non-null object
attic               10807 non-null object
ROOMS               10805 non-null float64
BE

We'll take these attributes into our dataset.

In [14]:
cols = ['STYLE','parcel','ryrbuilt','eyrbuilt','occup','condtion','rextwall','heat','air','plumbfix','livatot','cqual']

In [15]:
res[cols].head()

Unnamed: 0,STYLE,parcel,ryrbuilt,eyrbuilt,occup,condtion,rextwall,heat,air,plumbfix,livatot,cqual
83077,Colonial,121-31-015,1897,1956,1 family,Average,Aluminum/Vinyl (r),Forced air,,5,1344,Average
83079,Colonial,121-31-019,1913,1966,2 family,Good,Frame (r),Forced air,,10,1826,Average
83080,Colonial,121-31-020,1910,1956,2 family,Average,Frame (r),Forced air,,10,2112,Average
83081,Bungalow,121-32-005,1900,1956,1 family,Average,Frame (r),Forced air,,5,1256,Average+
83082,Ranch,121-32-008,1954,1926,1 family,Very poor,Concrete block,Forced air,,5,832,Average+


In [16]:
res[cols].to_csv('clean_data/res_tci.csv',index=False)

##Tax bill
Filename: ```feb14_tci.csv```

In [17]:
infile = 'original_data/feb14.csv'
outfile = 'clean_data/feb14_tci2.csv'

with open(infile, 'r') as fin, open(outfile, 'w') as fout:
    write_to = csv.writer(fout, lineterminator='\n')
    header = next(csv.reader(fin))
    write_to.writerow(header)
    for row in csv.reader(fin):
        if len(row) > 91:
            if row[91] in ppns:
                write_to.writerow(row)

In [26]:
temp = pd.read_csv('clean_data/aug13_tci.csv')

  data = self._reader.read(nrows)


In [58]:
df2 = pd.merge(tci, temp, how="left", left_on='Parcel Number', right_on="PROPERTY_NUMBER")

In [59]:
sum(df2.vacant)*1.0/len(df2.vacant)

0.11518518518518518

In [63]:
sum(df[df2.CLASSIFICATION_ID.isnull()].vacant)*1.0/len(df[df2.CLASSIFICATION_ID.isnull()].vacant)

0.053892215568862277

In [60]:
df2[['CLASSIFICATION_ID','vacant']].groupby('CLASSIFICATION_ID').agg([sum,len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
CLASSIFICATION_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
3700,0,2
4000,0,10
4090,1,7
4200,0,2
4650,0,1
4970,3,8
5000,8,618
5100,677,5720
5200,791,4854
5300,29,181


In [48]:
df2[df2['TOTAL_NET_DELQ_BALANCE']==0].groupby('vacant').count()

Unnamed: 0_level_0,Parcel Number,House Number,Street Name,Category,Survey Date,PPN,AUDITOR_REFUND,BANK_ID,BILL_NUMBER,BILL_TYPE,...,TOTAL_NET_PAID_1ST,TOTAL_NET_PAID_2ND,TRASURER_REFUND,TREASURER_CODE,TWOANDONEHALF_CREDIT_1ST,TWOANDONEHALF_CREDIT_2ND,UPDATE_DATE,VALUE_CHANGE_TIMESTAMP,WHS_ID,DOWNLOAD_DATE
vacant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9663,9234,8958,9663,9663,9663,14,5672,0,0,...,9663,9663,0,196,9663,9663,9663,0,9663,9663
1,589,586,579,589,589,589,0,406,0,0,...,589,589,0,35,589,589,589,0,589,589


In [51]:
df2[df2['TOTAL_NET_DELQ_BALANCE']>4000].groupby('vacant').count()

Unnamed: 0_level_0,Parcel Number,House Number,Street Name,Category,Survey Date,PPN,AUDITOR_REFUND,BANK_ID,BILL_NUMBER,BILL_TYPE,...,TOTAL_NET_PAID_1ST,TOTAL_NET_PAID_2ND,TRASURER_REFUND,TREASURER_CODE,TWOANDONEHALF_CREDIT_1ST,TWOANDONEHALF_CREDIT_2ND,UPDATE_DATE,VALUE_CHANGE_TIMESTAMP,WHS_ID,DOWNLOAD_DATE
vacant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,692,685,681,692,692,692,0,469,0,0,...,692,692,0,43,692,692,692,0,692,692
1,531,531,530,531,531,531,0,393,0,0,...,531,531,0,52,531,531,531,0,531,531


In [27]:
temp.shape

(13333, 149)

In [3]:
tb = pd.read_csv('clean_data/feb14_tci.csv')
tb.shape

(6075, 147)

In [64]:
tb

Unnamed: 0,TAX_BILL_ID,TAX_YEAR,WHS_ID,PROPERTY_ID,PPN,PROPERTY_NUMBER,MACHINE_ID,EFFECTIVE_STATUS,VALUE_CHANGE_TIMESTAMP,PROPERTY_TYPE,...,LOCATION_UNIT_NUMBER,HOMESTEAD_ASSESSED_VALUE,SURPLUS_PAYMENT,OMITTED_TAX_PENALTY_1ST,OMITTED_TAX_PENALTY_2ND,AUDITOR_REFUND,TRASURER_REFUND,TIF_AMOUNT_1ST,TIF_AMOUNT_2ND,UPDATE_DATE
0,265820950,2013,4395608,103296466,12923013,129-23-013,,,,800,...,,,,,,,,0,0,02/02/2014
1,265840406,2013,4411518,503229825,12616034,126-16-034,,,,800,...,,,,,,,,0,0,02/02/2014
2,265859720,2013,4399292,803706873,12131003,121-31-003,,,,800,...,,,,,,,,0,0,02/02/2014
3,265859722,2013,4401622,503706897,12131008,121-31-008,,,,800,...,,,,,,,,0,0,02/02/2014
4,265860587,2013,4408717,503708076,12132124,121-32-124,,,,800,...,,,,,,,,0,0,02/02/2014
5,265860588,2013,4408718,903708080,12132125,121-32-125,,,,800,...,,,,,,,,0,0,02/02/2014
6,265860589,2013,4411018,403708084,12132126,121-32-126,,,,800,...,,,,,,,,0,0,02/02/2014
7,265860590,2013,4411020,303708092,12132128,121-32-128,,,,800,...,,,,,,,,0,0,02/02/2014
8,265860591,2013,4411021,603708095,12132129,121-32-129,,,,800,...,,,,,,,,0,0,02/02/2014
9,265860592,2013,4411022,903708098,12132130,121-32-130,,,,800,...,,,,,,,,0,0,02/02/2014


Notes:
* Property type is 800 for all entries
* Homestead/check mailing addresses for owner occupancy
* TOTAL_NET_DELQ_BALANCE := certified deliquent taxes
* Grand total balance?

In [5]:
df = pd.merge(tci, tb, how="left", left_on='Parcel Number', right_on="PROPERTY_NUMBER")

In [6]:
sub = df.loc[df.TOTAL_NET_DELQ_BALANCE.notnull(),['TOTAL_NET_DELQ_BALANCE','vacant']]

In [7]:
df.loc[df.TOTAL_NET_DELQ_BALANCE.notnull(),['TOTAL_NET_DELQ_BALANCE']].values

array([[    0.  ],
       [ 5169.85],
       [    0.  ],
       ..., 
       [    0.  ],
       [    0.  ],
       [    0.  ]])

In [16]:
sub[sub.TOTAL_NET_DELQ_BALANCE==0].groupby('vacant').count()

Unnamed: 0_level_0,TOTAL_NET_DELQ_BALANCE
vacant,Unnamed: 1_level_1
0,4789
1,157


In [19]:
sub[sub.TOTAL_NET_DELQ_BALANCE>5000].groupby('vacant').count()

Unnamed: 0_level_0,TOTAL_NET_DELQ_BALANCE
vacant,Unnamed: 1_level_1
0,181
1,153


In [22]:
tb[['TOTAL_NET_DELQ_BALANCE','TOTAL_NET_BALANCE_OWED_1ST','TOTAL_NET_BALANCE_OWED_2ND','GRAND_TOTAL_BALANCE']]

Unnamed: 0,TOTAL_NET_DELQ_BALANCE,TOTAL_NET_BALANCE_OWED_1ST,TOTAL_NET_BALANCE_OWED_2ND,GRAND_TOTAL_BALANCE
0,0.00,0.00,113.04,113.04
1,0.00,0.00,655.43,655.43
2,0.00,0.00,28.99,28.99
3,5169.85,577.85,550.32,6298.02
4,0.00,0.00,308.39,308.39
5,0.00,0.00,0.00,0.00
6,0.00,0.00,0.00,0.00
7,0.00,0.00,610.44,610.44
8,484.90,466.91,444.67,1396.48
9,0.00,0.00,526.58,526.58


In [23]:
tb[['CERT_PEND_FLAG','PPN']].groupby('CERT_PEND_FLAG').count()

Unnamed: 0_level_0,PPN
CERT_PEND_FLAG,Unnamed: 1_level_1


In [21]:
tb[tb['TOTAL_NET_DELQ_BALANCE']>0][['TOTAL_NET_DELQ_BALANCE','PROPERTY_NUMBER']].shape

(1128, 2)

In [22]:
cols = ['PROPERTY_NUMBER','LENDER_PROCESS_TYPE','GRAND_TOTAL_BALANCE','TOTAL_NET_DELQ_BALANCE','TOTAL_NET_BALANCE_OWED_1ST',\
        'TOTAL_NET_BALANCE_OWED_2ND','GRAND_TOTAL_BALANCE','FORECLOSURE_FLAG','MAIL_STREET_NUMBER',\
        'MAIL_STREET_NAME','LOCATION_STREET_NO',\
        'LOCATION_STREET_NAME','HOMESTEAD_FLAG']

In [23]:
tb[cols].head()

Unnamed: 0,PROPERTY_NUMBER,LENDER_PROCESS_TYPE,GRAND_TOTAL_BALANCE,TOTAL_NET_DELQ_BALANCE,TOTAL_NET_BALANCE_OWED_1ST,TOTAL_NET_BALANCE_OWED_2ND,GRAND_TOTAL_BALANCE.1,FORECLOSURE_FLAG,MAIL_STREET_NUMBER,MAIL_STREET_NAME,LOCATION_STREET_NO,LOCATION_STREET_NAME,HOMESTEAD_FLAG
0,129-23-013,0.0,113.04,0.0,0.0,113.04,113.04,,2999.0,PAYNE,12008.0,Forest,
1,126-16-034,2500.0,655.43,0.0,0.0,655.43,655.43,,2999.0,PAYNE,10005.0,Cumberland,
2,121-31-003,,28.99,0.0,0.0,28.99,28.99,,2417.0,Woodhill,2427.0,Woodhill,
3,121-31-008,0.0,6298.02,5169.85,577.85,550.32,6298.02,1.0,,Ingersoll,,Ingersoll,
4,121-32-124,1501.0,308.39,0.0,0.0,308.39,308.39,,2399.0,Mapleside,2399.0,Mapleside,


In [24]:
tb[cols].to_csv('clean_data/taxbill_tci.csv', index=False)

## County land bank
Filename: ```count_land_bank.csv```

In [25]:
lb = pd.read_csv('original_data/count_land_bank.csv', parse_dates=[3,4])
lb.columns

Index([u'parcel', u'disp_status', u'p_source', u'acq_dt', u'disp_dt', u'assessment', u'demo_status', u'rehab_status', u'public_status', u's_date', u'cclrc_dba_date', u'cclrc_dc_date', u'ab_proceed_dt', u'proceed_date', u'cclrc_dcp_date', u'out_type'], dtype='object')

In [26]:
lb = lb[lb['parcel'].isin(ppns)]
lb[lb['acq_dt']<np.datetime64('2014-03-01')].to_csv('clean_data/count_land_bank_tci.csv', index=False)

## Foreclosure filings

In [2]:
fc = pd.read_csv('original_data/foreclosure_filings2006_dec2014.csv', parse_dates = [2])

  data = self._reader.read(nrows)


In [54]:
fc = fc[fc['parcel'].isin(ppns)]
fc = fc[fc['filedate']<np.datetime64('2014-03-01')]

In [55]:
len(set(fc.parcel))

4426

In [56]:
fc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5843 entries, 21 to 115643
Data columns (total 26 columns):
parcel            5843 non-null object
status            5843 non-null object
filedate          5843 non-null datetime64[ns]
defendant         5843 non-null object
caseno            5843 non-null object
parcel_address    5823 non-null object
dateid            5843 non-null object
FORE              5843 non-null int64
LATITUDE          5821 non-null float64
LONGITUDE         5821 non-null float64
zip_fore          5676 non-null float64
case_number       5843 non-null object
ctitle            5843 non-null object
cdesig            5843 non-null object
judge             5843 non-null object
magistrate        5143 non-null object
room              17 non-null object
n_action          17 non-null object
f_location        5843 non-null object
lstatus           5843 non-null object
lstatusdt         5843 non-null object
ldispos           5843 non-null object
ldisposdt         5843 non

In [31]:
fc.to_csv('clean_data/foreclosure_filings_tci.csv',index=False)

## Sheriff auction
Filename: ```shf_aution_mar2000_dec2014.csv```

In [3]:
sa = pd.read_csv('original_data/shf_aution_mar2000_dec2014.csv', parse_dates=[2])
sa.shape

(113807, 25)

In [33]:
sa = sa[sa.parcel.isin(ppns)]
sa = sa[sa.salesdt<np.datetime64('2014-03-01')]
sa.shape

(5994, 25)

In [34]:
sa.columns.values

array(['parcel', 'address', 'salesdt', 'saleno', 'sold_amt', 'appraisal',
       'minbid', 'scaseno', 'withdrawn', 'plaintiff', 'sdefend',
       'purchaser', 'paddress', 'rparty', 'raddress', 'attorney',
       'descript', 'location', 'defend_att', 'with', 'status', 'ssold',
       'SHF', 'LATITUDE', 'LONGITUDE'], dtype=object)

In [35]:
sa.loc[sa['ssold'].isnull(),'ssold'] = 'No'

In [36]:
def convert(x):
    if x == 'No':
        return 0
    else:
        return 1
sa['ssold'] = sa['ssold'].apply(convert)

In [37]:
sa.to_csv('clean_data/shr_auction_tci.csv', index=False)

## Transfers
Filename: ```transfers2000_2014.csv```

In [39]:
# infile = 'original_data/transfers2000_2014.csv'
# outfile = 'clean_data/transfers_tci.csv'

# with open(infile, 'r') as fin, open(outfile, 'w') as fout:
#     write_to = csv.writer(fout, lineterminator='\n')
#     header = next(csv.reader(fin))
#     write_to.writerow(header)
#     for row in csv.reader(fin):
#         if row[5] in ppns:
#             write_to.writerow(row)

In [4]:
tf = pd.read_csv('clean_data/transfers_tci.csv',parse_dates=[8])

In [41]:
tf = tf[tf.mdate<np.datetime64('2014-03-01')]

In [42]:
tf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25092 entries, 0 to 26570
Data columns (total 60 columns):
TRANSFER_TO_ID                  25092 non-null int64
WHS_ID                          25092 non-null int64
TRANSFER_HISTORY_ID             25092 non-null int64
TRANSFER_ORDER                  25092 non-null int64
PARCEL_ID                       25092 non-null int64
PROPERTY_NUMBER                 25092 non-null object
GRANTEE1                        25092 non-null object
GRANTOR1                        25059 non-null object
mdate                           25092 non-null datetime64[ns]
SALES_AMOUNT                    25092 non-null int64
DEED_TYPE                       25092 non-null object
instrument_number               20174 non-null float64
RECORDED_BOOK                   4939 non-null float64
RECORDED_PAGE                   4939 non-null object
INSTRUMENT_TYPE                 25092 non-null int64
SALE_VALID                      25092 non-null int64
multiproperty_sale         

In [149]:
tf.to_csv('clean_data/transfers_tci.csv',index=False)

## Armslength sales
Filename: ```armslengthsales2006_2014.csv```

In [2]:
from sas7bdat import SAS7BDAT
with SAS7BDAT('original_data/armslengthprocessed.sas7bdat') as f:
    al = f.to_data_frame()

In [20]:
al[['DEED_TYPE','neighbor','parcel','mdate','TRANSFER_ORDER','convamt']].to_csv('original_data/armslength_tci2.csv', index=False)

In [19]:
al.shape

(309317, 95)

In [14]:
al = pd.read_csv('original_data/armslengthsales2006_2014.csv')

  data = self._reader.read(nrows)


In [4]:
al.columns

Index([u'AMOUNT_PAID', u'ASSUMED_LOAN_AMOUNT', u'AUTO_FILE_NUMBER', u'BUILDINGS_ON_LAND', u'CAMA_INV_FILE', u'CAMA_SALE_FILE', u'CHANGE_TIMESTAMP', u'CONVEYANCE_FEE', u'DEED_TYPE', u'EXEMPTCODE', u'EXEMPTCODE_DESCR', u'FEE_CHECK_NUM', u'FEE_PAID_BY', u'GIFT', u'GRANTEE1', u'GRANTOR1', u'GRANTOR_IS_MORTGAGEE', u'GRANTOR_IS_RELATIVE', u'INSTRUMENT_DATE', u'INSTRUMENT_TYPE', u'LAND_CONTRACT', u'LEASED_FEE', u'LEASE_HOLD', u'LIFE_ESTATE', u'MINERAL_RIGHTS_RESERVED', u'MONTH_OF_SALE', u'MOTHER', u'NUMBER_OF_PROPERTIES_IN_SALE', u'PARTINTER_EST_TRANSFERRED', u'PERMISSIVE_FEE', u'PERSONAL_PROPERTY_AMOUNT', u'PROPERTY_NUMBER', u'RECEIPT_NUMBER', u'RECORDED_BOOK', u'RECORDED_DATE', u'RECORDED_PAGE', u'SALES_AMOUNT', u'SALES_SOURCE', u'SALES_SOURCE_DESCR', u'SALE_VALID', u'TRADE', u'mdate', u'TRANSFER_FEE', u'TRANSFER_FROM_PARCEL', u'TRANSFER_HISTORY_ID', u'TRANSFER_NUMBER', u'TRANSFER_ORDER', u'TRANSFER_STATUS', u'TRANSFER_TO_ID', u'TRANSFER_TYPE', u'TRANSFER_TYPE_DESCR', u'UPDATE_DATE', u'USER

In [5]:
al = al[al.PROPERTY_NUMBER.isin(ppns)]
al.shape

(5497, 86)

In [6]:
cols = ['AMOUNT_PAID','ASSUMED_LOAN_AMOUNT','BUILDINGS_ON_LAND','DEED_TYPE','GIFT','MONTH_OF_SALE','NUMBER_OF_PROPERTIES_IN_SALE', \
        'PROPERTY_NUMBER','SALES_AMOUNT','SALE_VALID','TRANSFER_FEE','YEAR_OF_SALE']
al[cols].head()

Unnamed: 0,AMOUNT_PAID,ASSUMED_LOAN_AMOUNT,BUILDINGS_ON_LAND,DEED_TYPE,GIFT,MONTH_OF_SALE,NUMBER_OF_PROPERTIES_IN_SALE,PROPERTY_NUMBER,SALES_AMOUNT,SALE_VALID,TRANSFER_FEE,YEAR_OF_SALE
302,4.5,0,1,QUIT CLAIM DEED,0,11,1,129-26-066,1000,2,0.5,2008
441,4.5,0,1,WARRANTY DEED,0,8,1,129-16-081,600,2,0.5,2012
554,58.5,0,1,WARRANTY DEED,0,10,1,129-23-164,14500,2,0.5,2012
696,40.5,0,1,WARRANTY DEED,0,10,1,128-13-046,10000,0,0.5,2004
697,100.5,0,1,WARRANTY DEED,0,9,1,129-26-039,25000,0,0.5,2005


In [8]:
al.to_csv('clean_data/armslength_tci.csv',index=False)

## Violations
Filename: ```violate_cle.csv```


In [43]:
v = pd.read_csv('original_data/violate_cle.csv',parse_dates=[1,4])

In [15]:
v.head()

Unnamed: 0,VIN,v_file_date,v_wf_task,v_wf_task_status,v_wf_task_date,v_type_of_violation,parcel1,parcel,cond_gar,cond,other,maint,fire,int_ext,vin_id,violation_issue_date,vn_created,vn_source
0,$$10BCE-0,10/19/2010,Application Acceptance,Violation Notice Rvw Approved,10/19/2010,,,,,,,,,,,,,
1,$$10BCE-0,10/19/2010,Inspection,Violation Resolved,10/19/2010,,,,,,,,,,,,,
2,$$10BCE-0,10/19/2010,Closure,Case Closed,10/19/2010,,,,,,,,,,,,,
3,$$12BCE-0,02/03/2012,Inspection,Awaiting Reinspection,02/02/2012,HVAC,,112-19-032,,,1.0,,,,$$12BCE-0,02/02/2012,1.0,2.0
4,$$12BCE-0,02/03/2012,Application Acceptance,Violation Notice Rvw Approved,02/02/2012,HVAC,,112-19-032,,,1.0,,,,$$12BCE-0,02/02/2012,1.0,2.0


In [44]:
v = v[v.parcel.isin(ppns)]
v = v[(v.v_file_date < np.datetime64('2014-03-01')) & (v.v_file_date > np.datetime64('2006-03-01'))]

In [45]:
print v.shape
print v.columns

(17801, 18)
Index([u'VIN', u'v_file_date', u'v_wf_task', u'v_wf_task_status', u'v_wf_task_date', u'v_type_of_violation', u'parcel1', u'parcel', u'cond_gar', u'cond', u'other', u'maint', u'fire', u'int_ext', u'vin_id', u'violation_issue_date', u'vn_created', u'vn_source'], dtype='object')


In [46]:
# v['total'] = v.groupby('parcel').count()
v.to_csv('clean_data/violations_tci.csv',index=False)

## Complaints
Filename: ```complaint_cle.csv```

In [48]:
c = pd.read_csv('original_data/complaint_cle.csv', parse_dates=[1])

  data = self._reader.read(nrows)


In [49]:
c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456231 entries, 0 to 456230
Data columns (total 19 columns):
complaint_id           456231 non-null object
c_file_date            456231 non-null datetime64[ns]
c_wf_task              456231 non-null object
c_wf_task_status       423294 non-null object
c_wf_task_date         456230 non-null object
c_type_of_complaint    415198 non-null object
cdc_ri                 98 non-null float64
cdc_corrected          205 non-null float64
cdc_referred           599 non-null float64
othercomp              316292 non-null float64
parcel                 408386 non-null object
parcel1                108389 non-null object
cdc_comp               0 non-null float64
cdccia_comp            0 non-null float64
council_comp           0 non-null float64
mac_comp               0 non-null float64
public_comp            0 non-null float64
other_comp             0 non-null float64
comp_source            368599 non-null object
dtypes: datetime64[ns](1), float64(10

In [50]:
c = c[c.parcel.isin(ppns)]
c = c[(c.c_file_date < np.datetime64('2014-03-01')) & (c.c_file_date > np.datetime64('2006-03-01'))]

In [51]:
c.to_csv('clean_data/complaints_tci.csv')

## Postal data
Filenames: ```pv201302.csv, pv201304.csv, pv201308.csv, pv201312.csv, pv201402.csv```

In [69]:
pos1 = pd.read_csv('original_data/pv201302.csv')
pos1['date'] = np.datetime64('2013-02-01')
pos2 = pd.read_csv('original_data/pv201304.csv')
pos2['date'] = np.datetime64('2013-04-01')
pos3 = pd.read_csv('original_data/pv201308.csv')
pos3['date'] = np.datetime64('2013-08-01')
pos4 = pd.read_csv('original_data/pv201312.csv')
pos4['date'] = np.datetime64('2013-12-01')
pos5 = pd.read_csv('original_data/pv201402.csv')
pos5['date'] = np.datetime64('2014-02-01')

In [70]:
p = pos1.append(pos2).append(pos3).append(pos4).append(pos5)
p = p[p.PARCEL.isin(ppns)]
p.to_csv('clean_data/postal_vacancy_tci.csv')

In [15]:
print fc.shape
print sa.shape
print tf.shape
print al.shape


(123135, 26)
(113807, 25)
(26571, 60)
(184772, 86)


In [20]:
print len(set(fc[fc.parcel.notnull()].parcel))
print len(set(sa[sa.parcel.notnull()].parcel))
print len(set(tf[tf.PROPERTY_NUMBER.notnull()].PROPERTY_NUMBER))
print len(set(al.PROPERTY_NUMBER))

90786
72475
8870
144944


In [25]:
print len(set(fc[fc.parcel.notnull()].parcel).intersection(al.PROPERTY_NUMBER))

42544


In [19]:
print 

144944


In [23]:
sa.parcel

{1}