In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from scipy import stats
from scipy import sparse

import random
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

np.random.seed(20170228)

In [2]:
# load data
#data = pd.read_csv('./processed_data/pluto_fdny_dob_census_to_zipcode_2013-14.csv', index_col = False)
data = pd.read_csv('./processed_data/pluto_dob_census_to_tract_2013-14.csv', index_col=False)

In [3]:
tract_inc = pd.read_csv('./processed_data/tract_incidents_2013.csv', index_col=False)
tract_inc = tract_inc.merge(pd.read_csv('./processed_data/tract_incidents_2014.csv', index_col=False), on="GEOID")
tract_inc['tgi']=tract_inc.total_gas_incidents_yr_x + tract_inc.total_gas_incidents_yr_y

In [4]:
tract_inc = tract_inc.drop(['total_gas_incidents_yr_'+a for a in ['x','y']], axis=1)

In [5]:
data = data.merge(tract_inc, left_on='TRACT', right_on='GEOID').drop('GEOID_y', axis=1)

In [6]:
pd.options.display.max_seq_items=999
data.columns

Index([u'TRACT', u'age', u'bldg_class_A0', u'bldg_class_A1', u'bldg_class_A2',
       u'bldg_class_A3', u'bldg_class_A4', u'bldg_class_A5', u'bldg_class_A6',
       u'bldg_class_A7', u'bldg_class_A8', u'bldg_class_A9', u'bldg_class_B1',
       u'bldg_class_B2', u'bldg_class_B3', u'bldg_class_B9', u'bldg_class_C0',
       u'bldg_class_C1', u'bldg_class_C2', u'bldg_class_C3', u'bldg_class_C4',
       u'bldg_class_C5', u'bldg_class_C6', u'bldg_class_C7', u'bldg_class_C8',
       u'bldg_class_C9', u'bldg_class_D0', u'bldg_class_D1', u'bldg_class_D2',
       u'bldg_class_D3', u'bldg_class_D4', u'bldg_class_D5', u'bldg_class_D6',
       u'bldg_class_D7', u'bldg_class_D8', u'bldg_class_D9', u'bldg_class_E1',
       u'bldg_class_E3', u'bldg_class_E4', u'bldg_class_E7', u'bldg_class_E9',
       u'bldg_class_F1', u'bldg_class_F2', u'bldg_class_F4', u'bldg_class_F5',
       u'bldg_class_F8', u'bldg_class_F9', u'bldg_class_G0', u'bldg_class_G1',
       u'bldg_class_G2', u'bldg_class_G3', u'bldg_cl

In [7]:
data.shape

(3174, 768)

In [8]:
ind = stats.bernoulli.rvs(p = 0.75, size = len(data.index))
train=data[ind==1]
test=data[ind==0]

In [9]:
print train.shape, test.shape

(2393, 768) (781, 768)


In [10]:
excludefromfeatures=[
    'TRACT',
    'NTACode',
    'NTAName',
    'geometry',
    'ZipCode',
    'tgi'
]

In [11]:
train

Unnamed: 0,TRACT,age,bldg_class_A0,bldg_class_A1,bldg_class_A2,bldg_class_A3,bldg_class_A4,bldg_class_A5,bldg_class_A6,bldg_class_A7,...,BLACK_AFRICAN_AMERICAN_ratio,AMERICAN_INDIAN_AND_ALASKA_NATIVE_ratio,ASIAN_ratio,NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER_ratio,SOME_OTHER_RACE_ratio,NTACode,NTAName,geometry,ZipCode,tgi
0,36005000100,80.000000,,,,,,,,,...,0.629106,0.001688,0.015838,0.000000,0.240815,BX98,Rikers Island,"POLYGON ((1019454.697021484 225654.3287963867,...",,
1,36005000200,61.691124,,0.142544,0.149123,,,0.017544,,,...,0.208588,0.001666,0.041829,0.000000,0.276883,BX09,Soundview-Castle Hill-Clason Point-Harding Park,"POLYGON ((1023972.528015137 232680.6583862305,...",10473.0,26.166667
2,36005000400,53.657450,,0.076537,0.095358,,,0.022585,0.006274,,...,0.357058,0.002367,0.006424,0.000000,0.264920,BX09,Soundview-Castle Hill-Clason Point-Harding Park,"POLYGON ((1026849.274230957 235548.7739868164,...",10473.0,21.233333
3,36005001600,53.886628,,0.104712,0.031414,,,0.083770,,,...,0.374043,0.000000,0.000000,0.000000,0.222317,BX09,Soundview-Castle Hill-Clason Point-Harding Park,"POLYGON ((1024344.111633301 238946.0208129883,...",10473.0,32.597835
5,36005001900,81.806630,,,,,,0.041943,,,...,0.343497,0.000000,0.021227,0.000000,0.395214,BX39,Mott Haven-Port Morris,MULTIPOLYGON (((1012821.805786133 229228.26458...,10455.0,55.738564
8,36005002400,,,,,,,,,,...,0.480000,0.000000,0.000000,0.000000,0.000000,BX99,park-cemetery-etc-Bronx,"POLYGON ((1020092.676635742 238184.5195922852,...",10473.0,43.124026
9,36005002500,79.629371,,,,,,0.084848,,,...,0.220728,0.000000,0.039963,0.000000,0.482913,BX39,Mott Haven-Port Morris,"POLYGON ((1007094.576599121 233234.8546142578,...",10454.0,27.909127
10,36005002701,84.800000,,,,,,,,,...,0.375332,0.000000,0.000000,0.000000,0.435013,BX39,Mott Haven-Port Morris,"POLYGON ((1008189.859802246 232299.325012207, ...",10454.0,22.734127
11,36005002702,50.185430,,,,,,0.130435,,,...,0.321264,0.010674,0.000000,0.000000,0.477815,BX39,Mott Haven-Port Morris,"POLYGON ((1009410.756225586 232993.1588134766,...",10454.0,28.150794
12,36005002800,65.204819,,,,,,0.071429,,,...,0.763918,0.000000,0.017362,0.000000,0.096056,BX09,Soundview-Castle Hill-Clason Point-Harding Park,"POLYGON ((1017193.342407227 237404.8302001953,...",10473.0,48.300216


In [12]:
# split training data into features and outcome (numpy arrays, to feed to sklearn algorithms)
label_train = np.ravel(train.tgi.fillna(0).values)
pred_train = train.drop(excludefromfeatures, axis=1).fillna(0).replace(np.inf, 0)

# print pred_train.head()
pred_train = pred_train.values 

min_max_scaler = preprocessing.MinMaxScaler()
pred_train = min_max_scaler.fit_transform(pred_train)
pred_train


array([[ 0.65303011,  0.        ,  0.        , ...,  0.01764281,
         0.        ,  0.34720283],
       [ 0.50357702,  0.        ,  0.17308897, ...,  0.0465952 ,
         0.        ,  0.39920491],
       [ 0.43799913,  0.        ,  0.0929378 , ...,  0.00715643,
         0.        ,  0.38195614],
       ..., 
       [ 0.26347274,  0.        ,  0.0262706 , ...,  0.15949128,
         0.        ,  0.11595785],
       [ 0.33866102,  0.        ,  0.14366197, ...,  0.07033207,
         0.        ,  0.12495434],
       [ 0.44545011,  0.        ,  0.20103175, ...,  0.01014663,
         0.        ,  0.05565756]])

In [13]:
print label_train.shape, pred_train.shape

(2393,) (2393, 762)


In [14]:
results = test.copy()
# format test data
label_test = np.ravel(test.tgi.fillna(0).values)
pred_test = train.drop(excludefromfeatures, axis=1).fillna(0).replace(np.inf, 0)

# print pred_train.head()
pred_test = pred_test.values 

min_max_scaler = preprocessing.MinMaxScaler()
pred_test = min_max_scaler.fit_transform(pred_test)
pred_test

#label_test = np.ravel('gas_incidents_per_bldg_unit'.values)
#pred_test = test.drop(excludefromfeatures, axis=1)
#feature_names = list(pred_test.columns.values)
#pred_test = pred_test.values 

#min_max_scaler = preprocessing.MinMaxScaler()
#pred_test = min_max_scaler.fit_transform(pred_test)
#pred_test

array([[ 0.65303011,  0.        ,  0.        , ...,  0.01764281,
         0.        ,  0.34720283],
       [ 0.50357702,  0.        ,  0.17308897, ...,  0.0465952 ,
         0.        ,  0.39920491],
       [ 0.43799913,  0.        ,  0.0929378 , ...,  0.00715643,
         0.        ,  0.38195614],
       ..., 
       [ 0.26347274,  0.        ,  0.0262706 , ...,  0.15949128,
         0.        ,  0.11595785],
       [ 0.33866102,  0.        ,  0.14366197, ...,  0.07033207,
         0.        ,  0.12495434],
       [ 0.44545011,  0.        ,  0.20103175, ...,  0.01014663,
         0.        ,  0.05565756]])

In [15]:
# train the model with 1000 trees, 4 parallel processes, and 10 min samples to split a node 
num_trees = 5000
rf = RandomForestRegressor(n_estimators=num_trees, n_jobs=8, min_samples_split=10, verbose=1, oob_score = True)
rf.fit(X=pred_train, y=label_train)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    7.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   17.8s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   32.1s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   50.2s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 5000 out of 5000 | elapsed:  3.4min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=5000, n_jobs=8, oob_score=True, random_state=None,
           verbose=1, warm_start=False)

In [16]:
# generate predictions and add them to 'results'
rf_predictions = rf.predict(pred_test)
rf_predictions_tr = rf.predict(pred_train)
#print rf_predictions
#results['preds'] = rf_predictions

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    3.5s
[Parallel(n_jobs=8)]: Done 5000 out of 5000 | elapsed:    3.5s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Paral

In [17]:
print sorted(zip(rf.feature_importances_, train.drop(excludefromfeatures, axis=1).columns), reverse=True)[:20]

[(0.090863624860127334, 'landuse_01'), (0.083386584342683048, 'ECB_violation_Cranes and Derricks'), (0.071934835565003977, 'DOB_complaint_67'), (0.048859804484697185, 'ECB_infraction_141'), (0.045576705912893527, 'BLACK_AFRICAN_AMERICAN_ratio'), (0.04278281162841184, 'DOB_dispos_L1'), (0.040581475698908215, 'DOB_dispos_L2'), (0.034302088279468325, 'total_units'), (0.02801214495537091, 'bldg_class_C4'), (0.027217993260471429, 'DOB_complaint_10'), (0.027039345599339435, 'bldg_class_V1'), (0.024733948581634125, 'TOTAL_POPULATION'), (0.024333413718119493, 'TOTAL_HOUSEHOLDS'), (0.020060111560997438, 'landuse_11'), (0.019608624770552904, 'ECB_infraction_1C4'), (0.017164764329343309, 'GEOID_x'), (0.0099926969776130761, 'age'), (0.0095884546680601434, 'DOB_permit_AL'), (0.0068182815621472556, 'DOB_dispos_B2'), (0.0065927682783064653, 'MEAN_INCOME')]


In [18]:
rf_predictions = rf.predict(pred_test)
rf_predictions_tr = rf.predict(pred_train)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    3.5s
[Parallel(n_jobs=8)]: Done 5000 out of 5000 | elapsed:    3.5s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Paral