In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, RandomizedLasso
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler # Look at RF for package
from sklearn.decomposition import PCA
from scipy.stats import uniform as sp_rand
from minepy import MINE
import numpy as np


np.random.seed(20170301)

In [22]:
def make_model(reg):
    LIN_MODEL = 'gas_incidents_per_bldg_unit = '
    if reg.intercept_ > 0:
        LIN_MODEL += str(round(reg.intercept_, 5)) + ' '
    else:
        LIN_MODEL += str(round(reg.intercept_, 5)) + ' '
    for idx, i in enumerate(list(reg.coef_)):
        if i > 0:
            LIN_MODEL = LIN_MODEL + '+ ' + str(round(i, 5)) + "*" + select_cols[idx] + ' '
        else:
            LIN_MODEL = LIN_MODEL + '- ' + str(round(i, 5))[1:] + "*" + select_cols[idx] + ' '
    
    return LIN_MODEL

# Zipcode train, test, predict data

In [2]:
# get 2013, 2014, and 2015 data for zipcode

zip_2013 = pd.read_csv('outputs/pluto_fdny_dob_census_to_zipcode_2013.csv')
zip_2014 = pd.read_csv('outputs/pluto_fdny_dob_census_to_zipcode_2014.csv')
zip_2015 = pd.read_csv('outputs/pluto_fdny_dob_census_to_zipcode_2015.csv')

In [3]:
# Cleaning non-numeric columns
# remove nan's and inf's (turn to 0)

zip_2013.fillna(0, inplace=True)
zip_2013 = zip_2013.replace(np.inf, 0)
zip_2013 = zip_2013[~zip_2013['ZipCode'].isin(['0', 0])]    
zip_2014.fillna(0, inplace=True)
zip_2014 = zip_2014.replace(np.inf, 0)
zip_2014 = zip_2014[~zip_2014['ZipCode'].isin(['0', 0])]
zip_2015.fillna(0, inplace=True)
zip_2015 = zip_2015.replace(np.inf, 0)
zip_2015 = zip_2015[~zip_2015['ZipCode'].isin(['0', 0])]
for i in ['geometry', 'AREA', 'total_gas_incidents']:
    del zip_2013[i]
    del zip_2014[i]
    del zip_2015[i]
   

In [4]:
# processing columns to be in the same order. 
# if 2014 does not have a column from 2013, 
# 0's will be filled for the entire column 

zip_cols_2013 = zip_2013.columns.tolist()
for i in zip_cols_2013:
    if i not in zip_2014.columns:
        zip_2014[i] = 0.0
        
# place 2014 columns in the same order - droppping cols that did not appear in 2013.
zip_2014 = zip_2014[zip_cols_2013]

In [5]:
# validation that zip code orders are the same for our train and test set

for idx, i in enumerate(zip_2013.iloc[:,0].values):
    if zip_2014.iloc[:,0].values[idx] != i:
        print i

for idx, i in enumerate(zip_2014.iloc[:,0].values):
    if zip_2015.iloc[:,0].values[idx] != i:
        print i

In [6]:
# X_train will be 2013 features, y_train will be 2013 gas_leaks_per_bldg_unit
X_train_zip = zip_2013.iloc[:,1:-1].values
y_train_zip = zip_2013.iloc[:,-1].values


# min/max scalling of feature data
min_max_scaler = MinMaxScaler()
X_train_zip = min_max_scaler.fit_transform(X_train_zip)

# X_test will be 2013 features, y_test will be 2014 gas_leaks_per_bldg_unit
X_test_zip = X_train_zip
y_test_zip = zip_2014.iloc[:,-1].values

In [7]:
# create prediction features and dependent variable - zip

X_pred_zip = zip_2014.iloc[:,1:-1].values

min_max_scaler = MinMaxScaler()
X_pred_zip = min_max_scaler.fit_transform(X_pred_zip)

y_pred_zip = zip_2015.iloc[:,-1].values


In [8]:
print X_train_zip.shape, X_test_zip.shape, X_pred_zip.shape

(194, 720) (194, 720) (194, 720)


In [9]:
print y_train_zip.shape, y_test_zip.shape, y_pred_zip.shape

(194,) (194,) (194,)


In [27]:
names = zip_cols_2013[1:-1]

X_train_tract = X_train_zip
y_train_tract = y_train_zip
 
ranks = {}
 
def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))

lr = LinearRegression(normalize=True)
lr.fit(X_train_tract, y_train_tract)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

param_grid = {'alpha': sp_rand()}
ridge = Ridge()
ridge = RandomizedSearchCV(estimator=ridge, param_distributions=param_grid, n_iter=100)
ridge.fit(X_train_tract, y_train_tract)
# summarize the results of the random parameter search
ridge_alpha = ridge.best_estimator_.alpha
 
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_tract, y_train_tract)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

# prepare a uniform distribution to sample for the alpha parameter

# create and fit a ridge regression model, testing random alpha values
lasso = Lasso()
lasso = RandomizedSearchCV(estimator=lasso, param_distributions=param_grid, n_iter=100)
lasso.fit(X_train_tract, y_train_tract)
# summarize the results of the random parameter search
lasso_alpha = lasso.best_estimator_.alpha

# Both lasso and random lasso returned 0's for each feature

#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X_train_tract, y_train_tract)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X_train_tract, y_train_tract)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval  = f_regression(X_train_tract, y_train_tract, center=True)
ranks["Corr."] = rank_to_dict(f, names)
 

mine = MINE()
mic_scores = []
for i in range(X_train_tract.shape[1]):
    mine.compute_score(X_train_tract[:,i], y_train_tract)
    m = mine.mic()
    mic_scores.append(m)
    
ranks["MIC"] = rank_to_dict(mic_scores, names) 

r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)
    
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

zip_fs = pd.DataFrame.from_dict(ranks)

all_zip = {}

# loop through top variables from each technique above (top 3 through top 10)       
for num in range(1, 11):
    print num

    # run rige lasso and linear regression on tome 
    for i in zip_fs.columns:
        
        select_cols = zip_fs.sort_values(by=i, ascending=False).index[:num].values
        X = pd.DataFrame(X_test_zip, columns=zip_cols_2013[1:-1])
        X_pred = pd.DataFrame(X_pred_zip, columns=zip_cols_2013[1:-1])
        
        X_train = X[select_cols].values
        X_test = X_train
        X_predict = X_pred[select_cols].values
        
        y_train = y_train_zip
        y_test = y_test_zip
        y_predict = y_pred_zip
        
        # prepare a uniform distribution to sample for the alpha parameter
        param_grid = {'alpha': sp_rand()}
        # create and fit a ridge regression model, testing random alpha values
        model = Lasso()
        laregr_temp = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
        laregr_temp.fit(X_train, y_train)
        # summarize the results of the random parameter search
        LASSO_ALPHA = laregr_temp.best_estimator_.alpha
        model2 = Ridge()
        riregr_temp = RandomizedSearchCV(estimator=model2, param_distributions=param_grid, n_iter=100)
        riregr_temp.fit(X_train, y_train)
        # summarize the results of the random parameter search
        RIDGE_ALPHA = riregr_temp.best_estimator_.alpha

        liregr = LinearRegression()
        liregr.fit(X_train, y_train)        

        LIN_MODEL = make_model(liregr)
        LIN_RMSE_IS = np.sqrt(np.mean((liregr.predict(X_train) - y_train) ** 2))
        LIN_RMSE_OS = np.sqrt(np.mean((liregr.predict(X_test) - y_test) ** 2))
        LIN_RMSE_PS = np.sqrt(np.mean((liregr.predict(X_predict) - y_predict) ** 2))

        laregr = Lasso(alpha=LASSO_ALPHA)
        laregr.fit(X_train, y_train)
        
        LAS_RMSE_IS = np.sqrt(np.mean((laregr.predict(X_train) - y_train) ** 2))
        LAS_RMSE_OS = np.sqrt(np.mean((laregr.predict(X_test) - y_test) ** 2))
        LAS_RMSE_PS = np.sqrt(np.mean((laregr.predict(X_predict) - y_predict) ** 2))
        
        riregr = Ridge(alpha=RIDGE_ALPHA)
        riregr.fit(X_train, y_train)

        RID_RMSE_IS = np.sqrt(np.mean((riregr.predict(X_train) - y_train) ** 2))
        RID_RMSE_OS = np.sqrt(np.mean((riregr.predict(X_test) - y_test) ** 2))
        RID_RMSE_PS = np.sqrt(np.mean((riregr.predict(X_predict) - y_predict) ** 2))

        all_zip[str(num)+'_'+i] = [list(select_cols), LIN_MODEL, LIN_RMSE_IS, LIN_RMSE_OS, LIN_RMSE_PS, 
                                   LASSO_ALPHA, LAS_RMSE_IS, LAS_RMSE_OS, LAS_RMSE_PS,
                                   RIDGE_ALPHA, RID_RMSE_IS, RID_RMSE_OS, RID_RMSE_PS]


In [30]:
# values in non alpha columns represent root mean square error


rmse_zip = pd.DataFrame.from_dict(all_zip).T.reset_index()

rmse_zip.columns = ['#_FS_Technique', 'features', 
                    'LinModel', 'LinReg2013', 'LinReg2014', 'LinReg2015',
                    'LasAlpha', 'LasReg2013', 'LasReg2014', 'LasReg2015',
                    'RidAlpha', 'RidReg2013', 'RidReg2014', 'RidReg2015']

rmse_zip.sort_values(by='RidReg2015')

# Tract train, test, predict data

In [11]:
# get 2013, 2014, and 2015 data for zipcode

tract_2013 = pd.read_csv('outputs/pluto_fdny_dob_census_to_tract_2013.csv')
tract_2014 = pd.read_csv('outputs/pluto_fdny_dob_census_to_tract_2014.csv')
tract_2015 = pd.read_csv('outputs/pluto_fdny_dob_census_to_tract_2015.csv')

In [12]:
# Cleaning non-numeric columns
# remove nan's and inf's (turn to 0)

tract_2013.fillna(0, inplace=True)
tract_2013 = tract_2013.replace(np.inf, 0)
tract_2014.fillna(0, inplace=True)
tract_2014 = tract_2014.replace(np.inf, 0)
tract_2015.fillna(0, inplace=True)
tract_2015 = tract_2015.replace(np.inf, 0)
for i in ['NTACode', 'NTAName', 'geometry', 'ZipCode', 'total_gas_incidents', 'GEOID']:
    del tract_2013[i]
    del tract_2014[i]
    del tract_2015[i]

In [13]:
# processing columns to be in the same order. 
# if 2014 does not have a column from 2013, 
# 0's will be filled for the entire column 

tract_cols_2013 = tract_2013.columns.tolist()
for i in tract_cols_2013:
    if i not in tract_2014.columns:
        tract_2014[i] = 0.0
        
# place 2014 columns in the same order - droppping cols that did not appear in 2013.
tract_2014 = tract_2014[tract_cols_2013]

In [14]:
# validation that tract orders are the same for our train, test, and predict sets

for idx, i in enumerate(tract_2013.iloc[:,0].values):
    if tract_2014.iloc[:,0].values[idx] != i:
        print i

for idx, i in enumerate(tract_2014.iloc[:,0].values):
    if tract_2015.iloc[:,0].values[idx] != i:
        print i

In [15]:
# X_train will be 2013 features, y_train will be 2013 gas_leaks_per_bldg_unit
X_train_tract = tract_2013.iloc[:,1:-1].values
y_train_tract = tract_2013.iloc[:,-1].values


# min/max scalling of feature data
min_max_scaler = MinMaxScaler()
X_train_tract = min_max_scaler.fit_transform(X_train_tract)

# X_test will be 2013 features, y_test will be 2014 gas_leaks_per_bldg_unit
X_test_tract = X_train_tract
y_test_tract = tract_2014.iloc[:,-1].values

In [16]:
# create prediction features and dependent variable - tract

X_pred_tract = tract_2014.iloc[:,1:-1].values

# scaling of features 
min_max_scaler = MinMaxScaler()
X_pred_tract = min_max_scaler.fit_transform(X_pred_tract)

y_pred_tract = tract_2015.iloc[:,-1].values

In [17]:
print X_pred_tract.shape, X_train_tract.shape, X_test_tract.shape

(3180, 717) (3180, 717) (3180, 717)


In [18]:
print y_train_tract.shape, y_test_tract.shape, y_pred_tract.shape

(3180,) (3180,) (3180,)


In [20]:


names = tract_cols_2013[1:-1]
 
ranks = {}
 
def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))

lr = LinearRegression(normalize=True)
lr.fit(X_train_tract, y_train_tract)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

param_grid = {'alpha': sp_rand()}
ridge = Ridge()
ridge = RandomizedSearchCV(estimator=ridge, param_distributions=param_grid, n_iter=100)
ridge.fit(X_train_tract, y_train_tract)
# summarize the results of the random parameter search
ridge_alpha = ridge.best_estimator_.alpha
 
ridge = Ridge(alpha=ridge_alpha)
ridge.fit(X_train_tract, y_train_tract)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

# prepare a uniform distribution to sample for the alpha parameter

# create and fit a ridge regression model, testing random alpha values
lasso = Lasso()
lasso = RandomizedSearchCV(estimator=lasso, param_distributions=param_grid, n_iter=100)
lasso.fit(X_train_tract, y_train_tract)
# summarize the results of the random parameter search
lasso_alpha = lasso.best_estimator_.alpha

# Both lasso and random lasso returned 0's for each feature

#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X_train_tract, y_train_tract)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X_train_tract, y_train_tract)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

f, pval  = f_regression(X_train_tract, y_train_tract, center=True)
ranks["Corr."] = rank_to_dict(f, names)
 

mine = MINE()
mic_scores = []
for i in range(X_train_tract.shape[1]):
    mine.compute_score(X_train_tract[:,i], y_train_tract)
    m = mine.mic()
    mic_scores.append(m)
    
ranks["MIC"] = rank_to_dict(mic_scores, names) 

r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)
    
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")


In [21]:
tract_fs = pd.DataFrame.from_dict(ranks)

In [23]:
all_tract = {}

# loop through top variables from each technique above (top 3 through top 10)       
for num in range(1, 11):
    print num

    # run rige lasso and linear regression on tome 
    for i in tract_fs.columns:
        
        select_cols = tract_fs.sort_values(by=i, ascending=False).index[:num].values
        X = pd.DataFrame(X_test_tract, columns=tract_cols_2013[1:-1])
        X_pred = pd.DataFrame(X_pred_tract, columns=tract_cols_2013[1:-1])
        
        X_train = X[select_cols].values
        X_test = X_train
        X_predict = X_pred[select_cols].values
        
        y_train = y_train_tract
        y_test = y_test_tract
        y_predict = y_pred_tract
        
        # prepare a uniform distribution to sample for the alpha parameter
        param_grid = {'alpha': sp_rand()}
        # create and fit a ridge regression model, testing random alpha values
        model = Lasso()
        laregr_temp = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
        laregr_temp.fit(X_train, y_train)
        # summarize the results of the random parameter search
        LASSO_ALPHA = laregr_temp.best_estimator_.alpha
        model2 = Ridge()
        riregr_temp = RandomizedSearchCV(estimator=model2, param_distributions=param_grid, n_iter=100)
        riregr_temp.fit(X_train, y_train)
        # summarize the results of the random parameter search
        RIDGE_ALPHA = riregr_temp.best_estimator_.alpha

        liregr = LinearRegression()
        liregr.fit(X_train, y_train)        

        LIN_MODEL = make_model(liregr)
        LIN_RMSE_IS = np.sqrt(np.mean((liregr.predict(X_train) - y_train) ** 2))
        LIN_RMSE_OS = np.sqrt(np.mean((liregr.predict(X_test) - y_test) ** 2))
        LIN_RMSE_PS = np.sqrt(np.mean((liregr.predict(X_predict) - y_predict) ** 2))

        laregr = Lasso(alpha=LASSO_ALPHA)
        laregr.fit(X_train, y_train)
        
        LAS_RMSE_IS = np.sqrt(np.mean((laregr.predict(X_train) - y_train) ** 2))
        LAS_RMSE_OS = np.sqrt(np.mean((laregr.predict(X_test) - y_test) ** 2))
        LAS_RMSE_PS = np.sqrt(np.mean((laregr.predict(X_predict) - y_predict) ** 2))
        
        riregr = Ridge(alpha=RIDGE_ALPHA)
        riregr.fit(X_train, y_train)

        RID_RMSE_IS = np.sqrt(np.mean((riregr.predict(X_train) - y_train) ** 2))
        RID_RMSE_OS = np.sqrt(np.mean((riregr.predict(X_test) - y_test) ** 2))
        RID_RMSE_PS = np.sqrt(np.mean((riregr.predict(X_predict) - y_predict) ** 2))

        all_tract[str(num)+'_'+i] = [list(select_cols), LIN_MODEL, LIN_RMSE_IS, LIN_RMSE_OS, LIN_RMSE_PS, 
                                     LASSO_ALPHA, LAS_RMSE_IS, LAS_RMSE_OS, LAS_RMSE_PS,
                                     RIDGE_ALPHA, RID_RMSE_IS, RID_RMSE_OS, RID_RMSE_PS]


1
2
3
4
5
6
7
8
9
10


In [34]:
# values in non alpha columns represent root mean square error

rmse_tract = pd.DataFrame.from_dict(all_tract).T.reset_index()

rmse_tract.columns = ['#_FS_Technique', 'features', 
                       'LinModel', 'LinReg2013', 'LinReg2014', 'LinReg2015',
                       'LasAlpha', 'LasReg2013', 'LasReg2014', 'LasReg2015',
                       'RidAlpha', 'RidReg2013', 'RidReg2014', 'RidReg2015']

rmse_tract.sort_values(by='RidReg2015')


Unnamed: 0,#_FS_Technique,features,LinModel,LinReg2013,LinReg2014,LinReg2015,LasAlpha,LasReg2013,LasReg2014,LasReg2015,RidAlpha,RidReg2013,RidReg2014,RidReg2015
0,10_Corr.,"[landuse_09, bldg_class_Q1, res_unit_ratio, bl...",gas_incidents_per_bldg_unit = - 0.2*landuse_09...,0.423103,0.580291,0.734314,0.0437672,0.534936,0.734145,0.896919,0.996587,0.42609,0.588316,0.743419
6,10_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.06*bldg_clas...,0.420826,0.580419,0.739346,0.0318017,0.534936,0.734145,0.896919,0.973975,0.422448,0.585981,0.74459
69,9_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.03*bldg_clas...,0.42382,0.584249,0.742772,0.0217756,0.500455,0.695077,0.857199,0.984821,0.42586,0.59053,0.747889
63,9_Corr.,"[landuse_09, bldg_class_Q1, res_unit_ratio, bl...",gas_incidents_per_bldg_unit = + 0.83*landuse_0...,0.425648,0.585563,0.743642,0.0302395,0.522133,0.719889,0.882535,0.983056,0.42753,0.591119,0.748188
62,8_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.31*bldg_clas...,0.433866,0.596751,0.743137,0.0258924,0.515018,0.712122,0.874697,0.999207,0.434957,0.600868,0.74979
55,7_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.33*bldg_clas...,0.434643,0.597722,0.743434,0.0238005,0.507356,0.703285,0.865666,0.991385,0.435601,0.601573,0.750075
35,5_Corr.,"[landuse_09, bldg_class_Q1, res_unit_ratio, bl...",gas_incidents_per_bldg_unit = + 0.91*landuse_0...,0.437129,0.601848,0.748782,0.0436147,0.534936,0.734145,0.896919,0.948582,0.438005,0.604563,0.75363
48,6_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.59*bldg_clas...,0.437451,0.602841,0.749645,0.0335422,0.534936,0.734145,0.896919,0.987193,0.438473,0.60622,0.755443
56,8_Corr.,"[landuse_09, bldg_class_Q1, res_unit_ratio, bl...",gas_incidents_per_bldg_unit = + 0.75*landuse_0...,0.434734,0.598005,0.753474,0.0331413,0.526949,0.725371,0.888107,0.997672,0.435763,0.601028,0.756363
41,5_Ridge,"[bldg_class_Q0, bldg_class_U5, DOB_permit_EQ, ...",gas_incidents_per_bldg_unit = + 2.54*bldg_clas...,0.438592,0.604261,0.751148,0.0355549,0.534936,0.734145,0.896919,0.991576,0.439578,0.6075,0.756756
