In [19]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler # Look at RF for package
from sklearn.decomposition import PCA
import numpy as np
# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-based randomness further down is repeatable from run to run.
np.random.seed(20170301)

# Zipcode train, test, predict data

In [20]:
# Load the zipcode-level CSV extracts for 2013, 2014 and 2015 into one
# dataframe per year (unpacked in that order).
init_zip_2013, init_zip_2014, init_zip_2015 = map(
    pd.read_csv,
    (
        'outputs/pluto_fdny_dob_census_to_zipcode_2013.csv',
        'outputs/pluto_fdny_dob_census_to_zipcode_2014.csv',
        'outputs/pluto_fdny_dob_census_to_zipcode_2015.csv',
    ),
)

### ^^^ DJC NOTE: I've renamed the initially loaded dataframes here and created copies below to keep from overwriting them when you drop columns such as "geometry" (and later "zipcode") from the features and target matrices. Now you can easily merge your predicted gas leaks per zip back to this initial df, which has the zip code number and geometry for mapping, etc. (the index is unchanged, so rows should be in the exact same order).

In [21]:
# Build cleaned working copies of the initial zipcode dataframes. The init_*
# frames are left untouched so later results can be joined back onto them.

def _clean_zip_frame(raw_frame):
    """Return a cleaned copy of one year's zipcode dataframe.

    * NaN, +inf and -inf values are all replaced with 0.
    * Rows whose 'ZipCode' is 0 or '0' are removed.
    * The 'geometry', 'AREA' and 'total_gas_incidents' columns are dropped.

    The input frame is not modified, and the surviving rows keep their
    original index labels (the index is not reset).
    """
    # fillna() without inplace returns a new frame, so raw_frame is never
    # mutated. Both signs of infinity are zeroed, not only +inf.
    cleaned = raw_frame.fillna(0).replace([np.inf, -np.inf], 0)
    # After fillna a missing zip code reads as 0, so this filter also removes
    # rows that had no zip code at all.
    cleaned = cleaned[~cleaned['ZipCode'].isin(['0', 0])]
    return cleaned.drop(['geometry', 'AREA', 'total_gas_incidents'], axis=1)


zip_2013 = _clean_zip_frame(init_zip_2013)
zip_2014 = _clean_zip_frame(init_zip_2014)
zip_2015 = _clean_zip_frame(init_zip_2015)
   

In [22]:
# Give the 2014 zipcode frame exactly the 2013 column layout (same columns,
# same order). A 2013 column that 2014 lacks is created and filled with 0.0;
# a 2014 column that never appeared in 2013 is discarded.
zip_cols_2013 = list(zip_2013.columns)
zip_2014 = zip_2014.reindex(columns=zip_cols_2013, fill_value=0.0)

In [23]:
# Validate that the first column (the zip code) lists the same values in the
# same row order for consecutive years: 2013 vs 2014, then 2014 vs 2015.
# Each earlier-year value whose row holds a different value in the later
# year is printed; no output means the orders agree.

# Extract each id column once. Re-slicing the dataframe inside the loop
# repeats the column extraction for every single row.
zip_ids_2013 = zip_2013.iloc[:, 0].values
zip_ids_2014 = zip_2014.iloc[:, 0].values
zip_ids_2015 = zip_2015.iloc[:, 0].values

for earlier_ids, later_ids in ((zip_ids_2013, zip_ids_2014),
                               (zip_ids_2014, zip_ids_2015)):
    if len(earlier_ids) != len(later_ids):
        # Frames of unequal length cannot be row-aligned. Report it instead
        # of failing with an IndexError or silently ignoring extra rows.
        print('row count mismatch: %d vs %d' % (len(earlier_ids), len(later_ids)))
    for earlier_id, later_id in zip(earlier_ids, later_ids):
        if later_id != earlier_id:
            print(earlier_id)

In [24]:
# Training data: the 2013 feature columns (everything except the first and
# the last column) paired with the 2013 target, gas_leaks_per_bldg_unit,
# which is read from the last column.
y_train_zip = zip_2013.iloc[:, -1].values
X_train_zip = zip_2013.iloc[:, 1:-1].values

# Min/max scaling: learn each feature column's range from the 2013 features,
# then rescale those same features with it.
min_max_scaler = MinMaxScaler().fit(X_train_zip)
X_train_zip = min_max_scaler.transform(X_train_zip)

# Test data: the very same scaled 2013 feature matrix (same array object,
# not a copy), scored against the 2014 gas_leaks_per_bldg_unit values.
y_test_zip = zip_2014.iloc[:, -1].values
X_test_zip = X_train_zip

### DJC NOTE: Someone should actually try some Bayesian inference here: instead of fitting a fresh MinMaxScaler to the X_pred_zip (i.e. 2014) features, use the parameters learned from the 2013 training set (i.e. the prior distribution) and model the change in distribution learned from the 2014 data when cross-validating.

In [25]:
# Prediction set for zipcodes: 2014 features as input, last column of the
# 2015 frame as the dependent variable.
y_pred_zip = zip_2015.iloc[:, -1].values

# NOTE(review): this scaler is fit on the 2014 features themselves, so its
# per-column min/max are not the ones learned from the 2013 training
# features -- confirm that rescaling each year independently is intended.
min_max_scaler = MinMaxScaler()
X_pred_zip = min_max_scaler.fit_transform(zip_2014.iloc[:, 1:-1].values)


In [26]:
# Show the shapes of the three zipcode feature matrices on one line,
# separated by single spaces.
print('%s %s %s' % (X_train_zip.shape, X_test_zip.shape, X_pred_zip.shape))

(194, 720) (194, 720) (194, 720)


In [27]:
# Show the shapes of the three zipcode target vectors on one line,
# separated by single spaces.
print('%s %s %s' % (y_train_zip.shape, y_test_zip.shape, y_pred_zip.shape))

(194,) (194,) (194,)


In [28]:
# naive model 2013 to predict 2014

# y_test_zip - y_train_zip # calculate error

# Tract train, test, predict data

In [29]:
# Load the tract-level CSV extracts for 2013, 2014 and 2015 into one
# dataframe per year (unpacked in that order).
init_tract_2013, init_tract_2014, init_tract_2015 = map(
    pd.read_csv,
    (
        'outputs/pluto_fdny_dob_census_to_tract_2013.csv',
        'outputs/pluto_fdny_dob_census_to_tract_2014.csv',
        'outputs/pluto_fdny_dob_census_to_tract_2015.csv',
    ),
)

### ^^^ DJC NOTE #1: As before, I've changed the name of the initialized dataframes here and created copies below.

### NOTE #2: In the zipcode-level data above, zip codes were really the index for the observations (and Nate appropriately dropped them from the features). At the tract level, however, they act as categorical features that roughly approximate geographic proximity and likely have some influence on predicting gas leaks. They are probably not better than spatial autocorrelation, but it's food for thought in case someone tries including zips as features (you'd have to convert them to one-hot vectors)...

In [30]:
# Build cleaned working copies of the initial tract dataframes. The init_*
# frames are left untouched.

def _clean_tract_frame(raw_frame):
    """Return a cleaned copy of one year's tract dataframe.

    * NaN, +inf and -inf values are all replaced with 0.
    * The 'NTACode', 'NTAName', 'geometry', 'ZipCode',
      'total_gas_incidents' and 'GEOID' columns are dropped.

    The input frame is not modified and no rows are removed.
    """
    # fillna() without inplace returns a new frame, so raw_frame is never
    # mutated. Both signs of infinity are zeroed, not only +inf.
    cleaned = raw_frame.fillna(0).replace([np.inf, -np.inf], 0)
    return cleaned.drop(
        ['NTACode', 'NTAName', 'geometry', 'ZipCode', 'total_gas_incidents', 'GEOID'],
        axis=1,
    )


tract_2013 = _clean_tract_frame(init_tract_2013)
tract_2014 = _clean_tract_frame(init_tract_2014)
tract_2015 = _clean_tract_frame(init_tract_2015)

In [31]:
# Give the 2014 tract frame exactly the 2013 column layout (same columns,
# same order). A 2013 column that 2014 lacks is created and filled with 0.0;
# a 2014 column that never appeared in 2013 is discarded.
tract_cols_2013 = list(tract_2013.columns)
tract_2014 = tract_2014.reindex(columns=tract_cols_2013, fill_value=0.0)

In [32]:
# Validate that the first column of the tract frames lists the same values
# in the same row order for consecutive years: 2013 vs 2014, then 2014 vs
# 2015. Each earlier-year value whose row holds a different value in the
# later year is printed; no output means the orders agree.

# Extract each id column once. Re-slicing the dataframe inside the loop
# repeats the column extraction for every single row.
tract_ids_2013 = tract_2013.iloc[:, 0].values
tract_ids_2014 = tract_2014.iloc[:, 0].values
tract_ids_2015 = tract_2015.iloc[:, 0].values

for earlier_ids, later_ids in ((tract_ids_2013, tract_ids_2014),
                               (tract_ids_2014, tract_ids_2015)):
    if len(earlier_ids) != len(later_ids):
        # Frames of unequal length cannot be row-aligned. Report it instead
        # of failing with an IndexError or silently ignoring extra rows.
        print('row count mismatch: %d vs %d' % (len(earlier_ids), len(later_ids)))
    for earlier_id, later_id in zip(earlier_ids, later_ids):
        if later_id != earlier_id:
            print(earlier_id)

In [33]:
# Training data: the 2013 feature columns (everything except the first and
# the last column) paired with the 2013 target, gas_leaks_per_bldg_unit,
# which is read from the last column.
y_train_tract = tract_2013.iloc[:, -1].values
X_train_tract = tract_2013.iloc[:, 1:-1].values

# Min/max scaling: learn each feature column's range from the 2013 features,
# then rescale those same features with it.
min_max_scaler = MinMaxScaler().fit(X_train_tract)
X_train_tract = min_max_scaler.transform(X_train_tract)

# Test data: the very same scaled 2013 feature matrix (same array object,
# not a copy), scored against the 2014 gas_leaks_per_bldg_unit values.
y_test_tract = tract_2014.iloc[:, -1].values
X_test_tract = X_train_tract

In [34]:
# Prediction set for tracts: 2014 features as input, last column of the
# 2015 frame as the dependent variable.
y_pred_tract = tract_2015.iloc[:, -1].values

# NOTE(review): this scaler is fit on the 2014 features themselves, so its
# per-column min/max are not the ones learned from the 2013 training
# features -- confirm that rescaling each year independently is intended.
min_max_scaler = MinMaxScaler()
X_pred_tract = min_max_scaler.fit_transform(tract_2014.iloc[:, 1:-1].values)

In [35]:
# Show the shapes of the three tract feature matrices on one line,
# separated by single spaces.
print('%s %s %s' % (X_pred_tract.shape, X_train_tract.shape, X_test_tract.shape))

(3180, 717) (3180, 717) (3180, 717)


In [36]:
# Show the shapes of the three tract target vectors on one line,
# separated by single spaces.
print('%s %s %s' % (y_train_tract.shape, y_test_tract.shape, y_pred_tract.shape))

(3180,) (3180,) (3180,)
