# LASSO, Elastic Nets & stepwise regression

The purpose of this notebook is to compare feature-selection methods for regression models: forward stepwise selection, ridge, LASSO, and elastic net.

In [4]:
import pandas as pd
import numpy as np
import scipy
import pylab as pl
import sklearn as sk
import seaborn as sns
import statsmodels.api as sm
from IPython.display import display
%matplotlib inline

import random
random.seed(1)



In [5]:

# Import the project's util helpers.
import os
import sys
# The project root differs per machine. Set the HEALTHVIZ_ROOT environment
# variable to point at your checkout; the original author's location is kept
# as the fallback so existing setups keep working.
HEALTHVIZ_ROOT = os.environ.get(
    'HEALTHVIZ_ROOT',
    '/Users/arianisfeld/Documents/Presence Health/Healthviz/',
)
# Guard the insert so re-running this cell does not keep growing sys.path.
if HEALTHVIZ_ROOT not in sys.path:
    sys.path.insert(0, HEALTHVIZ_ROOT)
from analytics.util import drop_nan, impute, regression_split, impute_df

In [19]:

def re_df(dfs):
    '''
    Concatenates pandas objects that share an index into a single DataFrame.

    Input:
        dfs (List-like of pd.DataFrames / pd.Series) with matching indices
    Returns:
        DataFrame indexed like the first element, holding the columns of every
        element in order. If an element's index differs from the first
        element's, a message is printed and that element and all later ones
        are left out of the result. An empty input gives an empty DataFrame.
    '''
    frames = list(dfs)
    if not frames:
        return pd.DataFrame()

    base_index = frames[0].index
    # Seed with an empty frame on the shared index so the result is always a
    # DataFrame, even when every element is a Series.
    pieces = [pd.DataFrame(index=base_index)]
    for position, df in enumerate(frames):
        # pd.concat(axis=1) does not fail on differing indices (it outer-joins
        # and fills with NaN), so the index has to be checked explicitly.
        if not base_index.equals(df.index):
            # Only Series carry a .name; identify other objects by position.
            label = df.name if isinstance(df, pd.Series) else None
            if label is None:
                label = "item {}".format(position)
            print("pd.concat failed for {}".format(label))
            print("DataFrames must have same index values")
            break
        pieces.append(df)
    # A single concat call instead of one per element.
    return pd.concat(pieces, axis=1)

In [72]:
# Load the county dataset: the first line is the header, the file's second
# line is skipped (skiprows=[1]) and the first column becomes the index.
county_raw = pd.read_csv(
    'HealthViz County Dataset 6 21 17.csv',
    header=0,
    skiprows=[1],
    index_col=0,
    encoding='latin_1',
)
county_raw.index.name = None
county_features = county_raw.drop(["County"], axis=1)
# Convoluted way to drop NaNs (per the original author's note): split the frame
# with regression_split, then stitch the pieces back together with re_df.
split_parts = regression_split(
    df=county_features,
    y_name=['MYH_2008-2014'],
    w_name='POP_2011-2015',
)
df = re_df(split_parts)
df.head()

Unnamed: 0,POP-B_2011-2015,POP-H_2011-2015,POP-A_2011-2015,AGE_2011-2015,DEP_2011-2015,INC_2011-2015,POV_2011-2015,INQ_2011-2015,DUA_2010-2014,EDE_2011-2015,EDF_2011-2015,SNP_2011-2015,TNF_2011-2015,EEC_2016,EEG_2016,MYH_2008-2014,POP_2011-2015
1001,10315,1440,534,37.7,63.865397,51219.97561,12.879382,18.51,2.164335,23.230265,9.746964,13.777211,2.603452,24661.0,49.478934,21.5,55221
1003,18735,8776,1307,42.2,68.057087,50194.19774,13.411661,20.9,1.676687,28.98519,9.724017,9.003563,2.238746,94090.0,57.786162,17.5,195121
1005,12595,1241,244,38.8,59.653803,32924.77284,26.727439,18.7,4.737949,12.515951,5.008507,25.970505,1.984385,10390.0,5.611165,19.4,26932
1007,4846,502,21,38.9,56.645877,38631.97318,16.795878,21.11,4.4832,10.646874,4.594024,15.781984,7.59926,8748.0,55.544124,24.5,22604
1009,884,4980,83,40.7,67.20751,45758.48253,16.720933,16.23,3.031681,12.890784,4.273244,13.489623,1.104919,25384.0,81.381973,13.8,57710


In [None]:
# NOTE(review): these are long-form column labels, whereas the frame displayed
# by the data-loading cell uses short codes (e.g. 'MYH_2008-2014') — confirm
# that df actually carries these names before running this cell.
X, y, w = regression_split(
    df=df,
    y_name=['Diabetes mortality (deaths per 100,000), 2008-2014'],
    w_name='Population (residents), 2011-2015',
)
# Work on a copy so the noise columns added below do not also appear in X.
Xt = X.copy()
# Python's random.seed() does not seed NumPy, so use a dedicated seeded
# generator to make the noise columns reproducible across runs.
rng = np.random.RandomState(1)
n_rows = Xt.shape[0]
# Pure-noise columns on different scales; none of them is related to y.
Xt["rand1"] = rng.choice(range(1, 12), n_rows) / 15
Xt["rand2"] = rng.choice(range(1, 3), n_rows) / 5
Xt["rand3"] = rng.choice(range(1, 3), n_rows) / 2
Xt["rand4"] = rng.choice(range(1, 3), n_rows)

In [146]:
import statsmodels.formula.api as sm
from operator import itemgetter

def forward_selected(X, y, w, delta):
    """Adapted from http://trevor-smith.github.io/stepwise-post/
    Weighted linear model built by greedy forward selection.

    Starting from no predictors, each round fits one WLS model per remaining
    column (added to the columns already selected) and keeps the column with
    the highest adjusted R-squared, provided it beats the current score by
    more than ``delta``. Selection stops at the first round in which no
    column does, or when no columns remain.

    Parameters:
    -----------
    X : pandas DataFrame of candidate predictors, one per column. No constant
        column is added here; include one in X if an intercept is wanted.

    y : response values, passed to WLS as the dependent variable

    w : per-row values whose reciprocal (1/w) is used as the WLS weights

    delta : float, minimum gain in adjusted R-squared a column must bring
            to be added

    Returns:
    --------
    model: fitted statsmodels WLS results for the selected columns
    """
    # NOTE(review): WLS weights are meant to be proportional to the inverse
    # error variance; if w is a population count, 1/w down-weights the large
    # counties — confirm that this is the intended weighting.
    weights = 1 / w

    # Keep the candidates in column order (de-duplicated) so that ties are
    # resolved the same way on every run; a set's order is not stable.
    remaining = list(X.columns.unique())
    selected = []
    current_score = 0.0
    while remaining:
        scores_with_candidates = [
            (
                sm.WLS(y, X[selected + [candidate]], weights=weights).fit().rsquared_adj,
                candidate,
            )
            for candidate in remaining
        ]
        # max() returns the first maximum, i.e. the earliest column on a tie.
        best_new_score, best_candidate = max(scores_with_candidates, key=itemgetter(0))
        if current_score + delta < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
        else:
            # Stop explicitly. Comparing the two scores for float equality in
            # the loop condition never terminates when the best candidate
            # scores exactly the current score.
            break
    model = sm.WLS(y, X[selected], weights=weights).fit()
    return model

# Stepwise

In [149]:
# Forward selection on the augmented feature matrix; a column is only added
# when it raises adjusted R-squared by more than 0.001.
r = forward_selected(X=Xt, y=y, w=w, delta=0.001)
r.summary()

0,1,2,3
Dep. Variable:,"Diabetes mortality (deaths per 100,000), 2008-2014",R-squared:,0.889
Model:,WLS,Adj. R-squared:,0.888
Method:,Least Squares,F-statistic:,2470.0
Date:,"Thu, 20 Jul 2017",Prob (F-statistic):,0.0
Time:,14:31:24,Log-Likelihood:,-10108.0
No. Observations:,2482,AIC:,20230.0
Df Residuals:,2474,BIC:,20280.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
"Share of income, top 5% (% of total income), 2011-2015",0.4890,0.074,6.601,0.000,0.344,0.634
"Poverty rate (% of residents), 2011-2015",0.8603,0.040,21.617,0.000,0.782,0.938
"Graduate education rate (% of residents), 2011-2015",-1.0799,0.093,-11.662,0.000,-1.262,-0.898
"Median household income, 2011-2015",0.0003,2.81e-05,11.102,0.000,0.000,0.000
"Public assistance income (cash welfare) (% of households), 2011-2015",1.0393,0.131,7.923,0.000,0.782,1.297
"Total election votes (Presidential) (votes), 2016",-5.395e-05,8.02e-06,-6.728,0.000,-6.97e-05,-3.82e-05
"Median age, 2011-2015",-0.3601,0.041,-8.739,0.000,-0.441,-0.279
"Age dependency ratio (Dependents per 100 working-age adults), 2011-2015",0.1406,0.026,5.372,0.000,0.089,0.192

0,1,2,3
Omnibus:,1272.877,Durbin-Watson:,1.932
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15988.891
Skew:,2.124,Prob(JB):,0.0
Kurtosis:,14.686,Cond. No.,29700.0


In [173]:
# Ordinary least squares on every column of Xt (no weights, no selection).
full_ols = sm.OLS(endog=y, exog=Xt)
all_covs = full_ols.fit()
all_covs.summary()

0,1,2,3
Dep. Variable:,"Diabetes mortality (deaths per 100,000), 2008-2014",R-squared:,0.895
Model:,OLS,Adj. R-squared:,0.894
Method:,Least Squares,F-statistic:,1044.0
Date:,"Thu, 20 Jul 2017",Prob (F-statistic):,0.0
Time:,14:42:23,Log-Likelihood:,-8991.1
No. Observations:,2482,AIC:,18020.0
Df Residuals:,2462,BIC:,18140.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
"Population, Non-Hispanic Black (residents), 2011-2015",7.161e-06,5.21e-06,1.375,0.169,-3.05e-06,1.74e-05
"Population, Hispanic or Latino (residents), 2011-2015",-3.086e-06,2.66e-06,-1.162,0.245,-8.29e-06,2.12e-06
"Population, Asian or Pacific Islander (residents), 2011-2015",1.754e-05,7.83e-06,2.239,0.025,2.18e-06,3.29e-05
"Median age, 2011-2015",-0.2655,0.044,-5.982,0.000,-0.353,-0.178
"Age dependency ratio (Dependents per 100 working-age adults), 2011-2015",0.1082,0.027,4.068,0.000,0.056,0.160
"Median household income, 2011-2015",0.0002,2.58e-05,8.155,0.000,0.000,0.000
"Poverty rate (% of residents), 2011-2015",0.5592,0.061,9.172,0.000,0.440,0.679
"Share of income, top 5% (% of total income), 2011-2015",0.3798,0.078,4.888,0.000,0.227,0.532
"Dual eligible coverage (% of residents), 2010-2014",-0.2034,0.187,-1.087,0.277,-0.570,0.163

0,1,2,3
Omnibus:,899.473,Durbin-Watson:,1.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5658.617
Skew:,1.574,Prob(JB):,0.0
Kurtosis:,9.694,Cond. No.,1920000.0


# Ridge

In [201]:
# Display threshold: only coefficients larger than this in magnitude are shown.
n = 0.1
# L1_wt is nearly zero, so the penalty is almost entirely the L2 (ridge) term.
ridge_spec = sm.OLS(endog=y, exog=Xt)
ridge = ridge_spec.fit_regularized(L1_wt=0.000001, alpha=1)
ridge.params[ridge.params.abs() > n]

Age dependency ratio (Dependents per 100 working-age adults), 2011-2015    0.156118
Poverty rate (% of residents), 2011-2015                                   0.672637
Share of income, top 5% (% of total income), 2011-2015                    -0.223679
Dual eligible coverage (% of residents), 2010-2014                        -0.489183
College graduation rate (% of residents), 2011-2015                       -0.155990
Graduate education rate (% of residents), 2011-2015                       -0.307567
Food stamps (SNAP) (% of households), 2011-2015                            0.316653
Public assistance income (cash welfare) (% of households), 2011-2015       0.184637
dtype: float64

# Lasso

In [202]:
# L1_wt=1 puts the whole penalty on the L1 term (LASSO).
lasso_spec = sm.OLS(endog=y, exog=Xt)
lasso = lasso_spec.fit_regularized(L1_wt=1, alpha=1)
# Show only the coefficients whose magnitude exceeds the threshold n.
lasso_is_large = lasso.params.abs() > n
lasso.params[lasso_is_large]

Age dependency ratio (Dependents per 100 working-age adults), 2011-2015    0.163500
Poverty rate (% of residents), 2011-2015                                   0.663310
Share of income, top 5% (% of total income), 2011-2015                    -0.153346
College graduation rate (% of residents), 2011-2015                       -0.208228
Graduate education rate (% of residents), 2011-2015                       -0.161666
Food stamps (SNAP) (% of households), 2011-2015                            0.232227
dtype: float64

# Elastic Net

In [200]:
# Elastic net: the penalty is split evenly between the L1 and L2 terms.
enet_spec = sm.OLS(endog=y, exog=Xt)
enet = enet_spec.fit_regularized(L1_wt=0.5, alpha=.5)
# Show only the coefficients whose magnitude exceeds the threshold n.
enet_is_large = enet.params.abs() > n
enet.params[enet_is_large]

Age dependency ratio (Dependents per 100 working-age adults), 2011-2015    0.158998
Poverty rate (% of residents), 2011-2015                                   0.680639
Share of income, top 5% (% of total income), 2011-2015                    -0.246035
College graduation rate (% of residents), 2011-2015                       -0.141355
Graduate education rate (% of residents), 2011-2015                       -0.324209
Food stamps (SNAP) (% of households), 2011-2015                            0.267074
dtype: float64

In [None]:
# Same settings as the LASSO fit (alpha=1, L1_wt=1), with the display
# threshold written out as a literal instead of n.
e_spec = sm.OLS(endog=y, exog=Xt)
e = e_spec.fit_regularized(L1_wt=1, alpha=1)
e.params[e.params.abs() > .1]

In [176]:
# Unpenalized OLS on all columns; list the coefficients whose magnitude
# exceeds 0.0001.
f_spec = sm.OLS(endog=y, exog=Xt)
f = f_spec.fit()
f.params[f.params.abs() > .0001]

Median age, 2011-2015                                                     -0.265478
Age dependency ratio (Dependents per 100 working-age adults), 2011-2015    0.108165
Median household income, 2011-2015                                         0.000211
Poverty rate (% of residents), 2011-2015                                   0.559227
Share of income, top 5% (% of total income), 2011-2015                     0.379845
Dual eligible coverage (% of residents), 2010-2014                        -0.203399
College graduation rate (% of residents), 2011-2015                        0.007331
Graduate education rate (% of residents), 2011-2015                       -0.775133
Food stamps (SNAP) (% of households), 2011-2015                            0.410656
Public assistance income (cash welfare) (% of households), 2011-2015       0.412568
Election margin, winner (Presidential) (% margin), 2016                    0.017472
rand                                                                       0