In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Parcel-level survey table that the rest of the notebook enriches with features.
tci = pd.read_csv('model_data/tci_2_1.csv')
# Parcel numbers exactly as stored in the file.
ppns = set(tci['parcel'].values)
# The same parcel numbers with the dashes stripped out.
ppns_num = set(ppn.replace('-', '') for ppn in ppns)

## Violations

In [3]:
# Violations table. The two date columns are selected by name rather than by
# position, so the parse keeps working if a column (for example a saved index
# column) is ever added in front of them.
vs = pd.read_csv('clean_data/violations_tci.csv',
                 parse_dates=['v_file_date', 'v_wf_task_date'])

In [4]:
# Column names, non-null counts and dtypes of the violations table.
vs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17801 entries, 0 to 17800
Data columns (total 18 columns):
VIN                     17801 non-null object
v_file_date             17801 non-null datetime64[ns]
v_wf_task               17801 non-null object
v_wf_task_status        17773 non-null object
v_wf_task_date          17801 non-null datetime64[ns]
v_type_of_violation     16642 non-null object
parcel1                 6211 non-null float64
parcel                  17801 non-null object
cond_gar                2524 non-null float64
cond                    5750 non-null float64
other                   2769 non-null float64
maint                   5112 non-null float64
fire                    286 non-null float64
int_ext                 201 non-null float64
vin_id                  13682 non-null object
violation_issue_date    13682 non-null object
vn_created              13682 non-null float64
vn_source               13682 non-null float64
dtypes: datetime64[ns](2), float64(9), object(7

In [5]:
# Peek at the first rows; the same VIN can appear on several rows, one per
# workflow task.
vs.head()

Unnamed: 0,VIN,v_file_date,v_wf_task,v_wf_task_status,v_wf_task_date,v_type_of_violation,parcel1,parcel,cond_gar,cond,other,maint,fire,int_ext,vin_id,violation_issue_date,vn_created,vn_source
0,V06037579,2006-03-02,Closure,Case Closed,2009-10-21,Exterior Maintenance,,128-25-049,,,,1,,,V06037579,03/01/2006,1,2
1,V06037579,2006-03-02,Inspection,Violation Resolved,2009-10-21,Exterior Maintenance,,128-25-049,,,,1,,,V06037579,03/01/2006,1,2
2,V06037586,2006-03-02,Closure,Case Closed,2009-10-21,Exterior Maintenance,,128-25-089,,,,1,,,V06037586,03/01/2006,1,2
3,V06037586,2006-03-02,Inspection,Violation Resolved,2009-10-21,Exterior Maintenance,,128-25-089,,,,1,,,V06037586,03/01/2006,1,2
4,V06037588,2006-03-02,Inspection,Violation Resolved,2009-10-21,Exterior Maintenance,,128-25-054,,,,1,,,V06037588,03/01/2006,1,2


### Violation Task

In [6]:
# Number of rows (with a non-null VIN) recorded for each workflow task.
vs_task = pd.DataFrame({'violations_tasks': vs.groupby('v_wf_task')['VIN'].count()})
# Display only the tasks that occur more than once.
vs_task.loc[vs_task['violations_tasks'] > 1]

Unnamed: 0_level_0,violations_tasks
v_wf_task,Unnamed: 1_level_1
Application Acceptance,5074
Closure,3870
Condemnation,194
Inspection,8268
Non-Condemnation,214
Prosecution,42
Search Warrant,138


In [7]:
# For every workflow task, print: task name, number of tci rows whose parcel
# has at least one violation row with that task, and the share of those rows
# that are vacant.
# Tasks are visited in sorted order so the report is reproducible (iterating a
# set gives an arbitrary order); missing task labels are dropped up front.
for task in sorted(vs['v_wf_task'].dropna().unique()):
    task_parcels = set(vs.loc[vs['v_wf_task'] == task, 'parcel'])
    # A membership filter selects the same tci rows as an inner merge on the
    # unique parcels, without building a merged copy of the table.
    task_rows = tci[tci['parcel'].isin(task_parcels)]
    n_rows = len(task_rows)
    # Guard the ratio: a task whose parcels are all absent from tci would
    # otherwise raise ZeroDivisionError.
    vacancy_rate = task_rows['vacant'].sum() * 1.0 / n_rows if n_rows else float('nan')
    # Single-argument print(...) behaves the same as a statement or a function.
    print('%s %s %s' % (task, n_rows, vacancy_rate))

Closure 2227 0.197126178716
Demolition Approval 1 1.0
Application Acceptance 2805 0.28734402852
Condemnation 148 0.540540540541
Prosecution 28 0.5
Search Warrant 120 0.416666666667
Inspection 2745 0.289981785064
Non-Condemnation 158 0.196202531646


In [8]:
def check_task(x):
    """Return 1 when ``x`` is a member of the module-level ``parcel_set``, else 0.

    ``parcel_set`` is not a parameter: it is looked up as a global each time
    the function is called, so it must be assigned before the call.
    """
    return 1 if x in parcel_set else 0
    
# Add one 0/1 indicator column per workflow task: 1 when the parcel has at
# least one violation row with that task.
task_column_sources = {}
for task in vs_task.index:
    task_column = 'task_' + task[0:3].lower()
    # The column name keeps only the first three letters of the task, so two
    # tasks sharing that prefix would silently overwrite each other's column.
    # Fail loudly instead of producing a corrupted feature.
    if task_column in task_column_sources:
        raise ValueError('tasks %r and %r both map to column %r'
                         % (task_column_sources[task_column], task, task_column))
    task_column_sources[task_column] = task
    # parcel_set stays a module-level name because check_task reads it.
    parcel_set = set(vs.loc[vs.v_wf_task == task, 'parcel'])
    # Vectorised membership test; gives the same 0/1 values as applying
    # check_task to every parcel.
    tci[task_column] = tci['parcel'].isin(parcel_set).astype('int64')

### Violations count 

In [9]:
# List the columns of tci, including the task_* indicators added so far.
tci.columns

Index([u'parcel', u'House Number', u'Street Name', u'Category', u'Survey Date', u'vacant', u'zip', u'propsize', u'pclass', u'totusabl', u'tmktval', u'condition', u'condition_value', u'style_filtered', u'ownerocc_value', u'totbldgs', u'yrbuilt_filtered', u'rextwall', u'num_of_sales', u'quit_claim_deed', u'warranty_deed', u'limited_warranty', u'survivorship_deed', u'fiduciary_deed', u'mult_name_flag', u'SALE_VALID', u'county_land_bank', u'vindall_Y', u'vindall_P', u'task_app', u'task_clo', u'task_con', u'task_dem', u'task_ins', u'task_non', u'task_pro', u'task_sea'], dtype='object')

In [10]:
# Rows of `vs` per parcel.
# NOTE(review): `vs` holds several rows for the same VIN (one per workflow
# task row), so this counts rows rather than distinct violations -- confirm
# that this is the intended definition of the feature.
vs_count = pd.DataFrame(vs.groupby('parcel').count()['VIN'])
vs_count.columns = ['violations_number']
# Look the count up by parcel instead of merging: the result is the same on a
# first run, but re-running the cell overwrites the column rather than
# producing violations_number_x / violations_number_y duplicates.
# Parcels without any violation row get 0.
tci['violations_number'] = tci['parcel'].map(vs_count['violations_number']).fillna(0)

In [11]:
# For each value of violations_number: sum of the vacant flag and number of
# tci rows in the group.
tci[['violations_number','vacant']].groupby('violations_number').agg([sum, len])

Unnamed: 0_level_0,vacant,vacant
Unnamed: 0_level_1,sum,len
violations_number,Unnamed: 1_level_2,Unnamed: 2_level_2
0,720,10459
1,19,147
2,117,520
3,122,696
4,70,295
5,74,215
6,73,269
7,71,136
8,50,124
9,35,105


In [12]:
# Share of vacant rows among parcels with more than 10 violation rows.
vacant_over_10_violations = tci.loc[tci['violations_number'] > 10, 'vacant']
sum(vacant_over_10_violations) * 1.0 / len(vacant_over_10_violations)

0.36091954022988504

### Violation types

In [13]:
# Number of rows (with a non-null VIN) recorded for each violation type.
vs_type = pd.DataFrame(
    {'violations_type': vs.groupby('v_type_of_violation')['VIN'].count()}
)

In [14]:
# Violation types ordered from most to least frequent.
# NOTE(review): DataFrame.sort(columns=...) is the legacy spelling; newer
# pandas releases only offer sort_values(by=...). Confirm the target pandas
# version before upgrading this call.
vs_type.sort(columns='violations_type', ascending=False)

Unnamed: 0_level_0,violations_type
v_type_of_violation,Unnamed: 1_level_1
Condemnation - Main Structure,4858
Exterior Maintenance,4490
Condemnation - Garage,2143
Survey,1793
Interior/Exterior Maintenance,488
30 Day Condemnation,293
Fire Damage,271
30 Day Condemnation MS,267
30 Day Condemnation Garage,225
No Permit,166


In [15]:
# Keep only the violation types that occur on more than 200 rows.
vs_types = vs_type.loc[vs_type['violations_type'] > 200].index

In [16]:
# Show the violation types that passed the frequency threshold.
vs_types

Index([u'30 Day Condemnation', u'30 Day Condemnation Garage', u'30 Day Condemnation MS', u'Condemnation - Garage', u'Condemnation - Main Structure', u'Exterior Maintenance', u'Fire Damage', u'Interior/Exterior Maintenance', u'Survey'], dtype='object')

In [17]:
def check_vio(x):
    """Return 1 when ``x`` is a member of the module-level ``parcel_set``, else 0.

    ``parcel_set`` is read as a global at call time; it has to be assigned
    before this function is used.
    """
    return 1 if x in parcel_set else 0

In [18]:
# Add one 0/1 indicator column per frequent violation type and print, for
# each: column name, number of flagged tci rows, share of them that are vacant.
for vio in vs_types:
    vio_column = 'violation_' + vio.lower()
    # parcel_set stays a module-level name because check_vio reads it.
    parcel_set = set(vs.loc[vs.v_type_of_violation == vio, 'parcel'])
    # Vectorised membership test; same 0/1 values as applying check_vio per row.
    tci[vio_column] = tci['parcel'].isin(parcel_set).astype('int64')
    a = tci.loc[tci[vio_column] == 1, 'vacant']
    # Guard the ratio: a type with no matching parcel in tci would otherwise
    # raise ZeroDivisionError.
    vacancy_rate = a.sum() * 1.0 / len(a) if len(a) else float('nan')
    # Single-argument print(...) behaves the same as a statement or a function.
    print('%s %s %s' % (vio_column, len(a), vacancy_rate))

violation_30 day condemnation 112 0.0714285714286
violation_30 day condemnation garage 87 0.103448275862
violation_30 day condemnation ms 98 0.0408163265306
violation_condemnation - garage 556 0.435251798561
violation_condemnation - main structure 1256 0.438694267516
violation_exterior maintenance 1087 0.199632014719
violation_fire damage 75 0.16
violation_interior/exterior maintenance 135 0.177777777778
violation_survey 472 0.39406779661


## Complaints

In [19]:
# Complaints table. The filing date is selected by name rather than by
# position: the file carries a leading "Unnamed: 0" column, so a positional
# index silently depends on that extra column staying in place.
cp = pd.read_csv('clean_data/complaints_tci.csv', parse_dates=['c_file_date'])

In [20]:
# Column names, non-null counts and dtypes of the complaints table.
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35114 entries, 0 to 35113
Data columns (total 20 columns):
Unnamed: 0             35114 non-null int64
complaint_id           35114 non-null object
c_file_date            35114 non-null datetime64[ns]
c_wf_task              35114 non-null object
c_wf_task_status       34177 non-null object
c_wf_task_date         35114 non-null object
c_type_of_complaint    34488 non-null object
cdc_ri                 0 non-null float64
cdc_corrected          0 non-null float64
cdc_referred           34 non-null float64
othercomp              28782 non-null float64
parcel                 35114 non-null object
parcel1                2268 non-null float64
cdc_comp               0 non-null float64
cdccia_comp            0 non-null float64
council_comp           0 non-null float64
mac_comp               0 non-null float64
public_comp            0 non-null float64
other_comp             0 non-null float64
comp_source            34177 non-null object
dtypes: d

In [21]:
# First two rows; the same complaint_id can appear on several rows, one per
# workflow task.
cp.head(2)

Unnamed: 0.1,Unnamed: 0,complaint_id,c_file_date,c_wf_task,c_wf_task_status,c_wf_task_date,c_type_of_complaint,cdc_ri,cdc_corrected,cdc_referred,othercomp,parcel,parcel1,cdc_comp,cdccia_comp,council_comp,mac_comp,public_comp,other_comp,comp_source
0,30411,CMP06037851,2006-03-06,Complaint Acceptance,Complaint Application Accepted,03/29/2007,,,,,1,137-01-026,,,,,,,,Public
1,30412,CMP06037851,2006-03-06,Inspection,Inspection Approved,03/29/2007,,,,,1,137-01-026,,,,,,,,Public


In [22]:
# Number of rows (with a non-null complaint_id) for each complaint type.
cp.groupby('c_type_of_complaint')['complaint_id'].count()

c_type_of_complaint
Black Mold                                  45
Collapsing Structure                       678
Complete interior/exterior                4208
Court Ordered Inspection                    12
Daycare                                     12
Debris/Garbage/Junk                         39
Debris/Garbage/Junk (Occupied)             216
Debris/Garbage/Junk (Vacant)               113
Doors/Gate/Locks                           107
Driveway                                    28
Dumping                                      2
Dumping Private (Vacant)                    48
Dumping Public                               4
Electrical                                 891
Elevator                                    19
Fence                                      123
Fire Damage                               1952
Foundation issues (animals entering)        11
Garage/Shed/Outbuildings                   359
General Exterior Maintenance              2279
Graffiti                                

### Complaint type

In [23]:
# Complaint-type frequencies as a one-column frame indexed by type.
cp_type = pd.DataFrame(
    {'complaint_type': cp.groupby('c_type_of_complaint')['complaint_id'].count()}
)

In [25]:
# Complaint types that occur on more than 500 rows.
cp_type.loc[cp_type['complaint_type'] > 500]

Unnamed: 0_level_0,complaint_type
c_type_of_complaint,Unnamed: 1_level_1
Collapsing Structure,678
Complete interior/exterior,4208
Electrical,891
Fire Damage,1952
General Exterior Maintenance,2279
O.V.V.,18819
Plumbing,827


In [27]:
def check_complaint(x):
    """Return 1 when ``x`` is a member of the module-level ``parcel_set``, else 0.

    ``parcel_set`` is read as a global at call time; it has to be assigned
    before this function is used.
    """
    return 1 if x in parcel_set else 0
    
# Add one 0/1 indicator column per complaint type with more than 500 rows and
# print, for each: column name, number of flagged tci rows, share of them that
# are vacant.
for complaint in cp_type[cp_type['complaint_type'] > 500].index:
    complaint_column = 'complaint_' + complaint.lower()
    # parcel_set stays a module-level name because check_complaint reads it.
    parcel_set = set(cp.loc[cp.c_type_of_complaint == complaint, 'parcel'])
    # Vectorised membership test; same 0/1 values as applying check_complaint
    # to every parcel.
    tci[complaint_column] = tci['parcel'].isin(parcel_set).astype('int64')
    a = tci.loc[tci[complaint_column] == 1, 'vacant']
    # Guard the ratio: a type with no matching parcel in tci would otherwise
    # raise ZeroDivisionError.
    vacancy_rate = a.sum() * 1.0 / len(a) if len(a) else float('nan')
    # Single-argument print(...) behaves the same as a statement or a function.
    print('%s %s %s' % (complaint_column, len(a), vacancy_rate))

complaint_collapsing structure 142 0.267605633803
complaint_complete interior/exterior 862 0.359628770302
complaint_electrical 182 0.214285714286
complaint_fire damage 464 0.185344827586
complaint_general exterior maintenance 487 0.221765913758
complaint_o.v.v. 2393 0.374843292938
complaint_plumbing 173 0.196531791908


### Complaint count (all)

In [28]:
# Rows of `cp` per parcel, over the whole file.
# NOTE(review): `cp` holds several rows for the same complaint_id (one per
# workflow task row), so this counts rows rather than distinct complaints --
# confirm that this is the intended definition of the feature.
cp_count = pd.DataFrame(cp.groupby('parcel').count()['complaint_id'])
cp_count.columns = ['complaint_count']
# Look the count up by parcel instead of merging: same result on a first run,
# but re-running the cell overwrites the column rather than producing
# complaint_count_x / complaint_count_y duplicates.
# Parcels without any complaint row get 0.
tci['complaint_count'] = tci['parcel'].map(cp_count['complaint_count']).fillna(0)

In [30]:
# Share of vacant rows among parcels with more than 10 complaint rows.
vacant_over_10_complaints = tci.loc[tci['complaint_count'] > 10, 'vacant']
sum(vacant_over_10_complaints) * 1.0 / len(vacant_over_10_complaints)

0.44293478260869568

### Complaint count (6 months)

In [31]:
# Rows of `cp` per parcel, restricted to complaints filed after 2013-09-01.
# NOTE(review): the window has no upper bound; the "6 months" label presumably
# assumes the data ends around March 2014 -- TODO confirm.
cp_count = pd.DataFrame(
    cp[cp.c_file_date > dt.datetime(2013, 9, 1)].groupby('parcel').count()['complaint_id']
)
cp_count.columns = ['complaint_count_6_mon']
# Look the count up by parcel instead of merging, so that re-running the cell
# overwrites the column rather than producing _x / _y duplicates.
# Parcels without a complaint row in the window get 0.
tci['complaint_count_6_mon'] = (
    tci['parcel'].map(cp_count['complaint_count_6_mon']).fillna(0)
)

In [32]:
# Share of vacant rows among parcels with more than 2 complaint rows in the
# 6-month window.
vacant_over_2_in_6_mon = tci.loc[tci['complaint_count_6_mon'] > 2, 'vacant']
sum(vacant_over_2_in_6_mon) * 1.0 / len(vacant_over_2_in_6_mon)

0.53640776699029125

### Complaint count (3 months)

In [33]:
# Rows of `cp` per parcel, restricted to complaints filed after 2013-12-01.
# NOTE(review): the window has no upper bound; the "3 months" label presumably
# assumes the data ends around March 2014 -- TODO confirm.
cp_count = pd.DataFrame(
    cp[cp.c_file_date > dt.datetime(2013, 12, 1)].groupby('parcel').count()['complaint_id']
)
cp_count.columns = ['complaint_count_3_mon']
# Look the count up by parcel instead of merging, so that re-running the cell
# overwrites the column rather than producing _x / _y duplicates.
# Parcels without a complaint row in the window get 0.
tci['complaint_count_3_mon'] = (
    tci['parcel'].map(cp_count['complaint_count_3_mon']).fillna(0)
)

In [34]:
# (share of vacant rows, number of rows) among parcels with at least one
# complaint row in the 3-month window.
vacant_any_in_3_mon = tci.loc[tci['complaint_count_3_mon'] > 0, 'vacant']
(sum(vacant_any_in_3_mon) * 1.0 / len(vacant_any_in_3_mon), len(vacant_any_in_3_mon))

(0.58585858585858586, 198)

In [98]:
# Write the enriched tci table to disk without the index column.
tci.to_csv('model_data/tci_2_2.csv', index=False)

In [99]:
# Total of the vacant flag over all tci rows.
sum(tci.vacant)

1555