<a href="https://colab.research.google.com/github/Shivakumar-DS/NPN/blob/master/Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Feature Selection Techniques**
1. Recursive Feature Elimation
2.SelectKBest
3.LassoRegression

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from xgboost import XGBClassifier

In [0]:
boston_data = load_boston()

In [0]:
features = boston_data.data
target = boston_data.target

In [0]:
boston_df = pd.DataFrame(features,columns=boston_data.feature_names)

In [11]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [0]:
#Converting array to Series, as it is not pandas array
target_series = pd.Series(target)

In [0]:
# Concating features and target to Dataframe
boston_new_df = pd.concat([boston_df,pd.DataFrame(target_series)],axis=1)

In [18]:
boston_new_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,0
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [0]:
#Check the correlation
df_corr = abs(boston_new_df.corr())

In [0]:
#Checking which features are highly correlated
#Returns True if they are correlated else False
df_corr =[abs(df_corr)>0.40]

In [22]:
df_corr

[          CRIM     ZN  INDUS   CHAS    NOX  ...    TAX  PTRATIO      B  LSTAT      0
 CRIM      True  False   True  False   True  ...   True    False  False   True  False
 ZN       False   True   True  False   True  ...  False    False  False   True  False
 INDUS     True   True   True  False   True  ...   True    False  False   True   True
 CHAS     False  False  False   True  False  ...  False    False  False  False  False
 NOX       True   True   True  False   True  ...   True    False  False   True   True
 RM       False  False  False  False  False  ...  False    False  False   True   True
 AGE      False   True   True  False   True  ...   True    False  False   True  False
 DIS      False   True   True  False   True  ...   True    False  False   True  False
 RAD       True  False   True  False   True  ...   True     True   True   True  False
 TAX       True  False   True  False   True  ...   True     True   True   True   True
 PTRATIO  False  False  False  False  False  ...   Tru

In [0]:
#Creating Linear Regression object
linear = LinearRegression()

In [0]:
#Using RFE model to select top 6 features, we can increase depending on your use case
rfe_model = RFE(linear,n_features_to_select=6)

In [25]:
#Fitting the above model
rfe_model.fit(boston_df, target_series)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=6, step=1, verbose=0)

In [26]:
#We can see that 3rd, 4th, 5th, 7th, 10th, 12th features are highly ranked (1)
rfe_model.ranking_

array([3, 5, 4, 1, 1, 1, 8, 1, 2, 6, 1, 7, 1])

In [27]:
#We can check Ranking or Support to select best features
rfe_model.support_

array([False, False, False,  True,  True,  True, False,  True, False,
       False,  True, False,  True])

In [28]:
boston_df.columns[rfe_model.support_]

Index(['CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT'], dtype='object')

In [0]:
# Let us use RandomForestRegressor to see which best features it will give
rfm = RandomForestRegressor()

In [30]:
rfm.fit(boston_df, target_series)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [40]:
#Feature Importance in descending order we can select top 6 or 8 depends
feature_importance = pd.DataFrame(rfm.feature_importances_,columns=['Importances_RFM'],index=boston_df.columns)
feature_importance.sort_values(['Importances_RFM'],ascending=False)

Unnamed: 0,Importances_RFM
RM,0.439611
LSTAT,0.365183
DIS,0.065366
CRIM,0.036794
NOX,0.023579
PTRATIO,0.02017
AGE,0.012833
TAX,0.01261
B,0.011423
INDUS,0.005914


In [0]:
# Check with XGBOOST
xgb = XGBClassifier()

In [34]:
xgb.fit(boston_df, target_series)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [41]:
#Feature Importance in descending order we can select top 6 or 8 depends
feature_importance_xgb = pd.DataFrame(xgb.feature_importances_,columns=['Importances_XGB'],index=boston_df.columns)
feature_importance_xgb.sort_values(['Importances_XGB'],ascending=False)

Unnamed: 0,Importances_XGB
CHAS,0.089555
LSTAT,0.084717
NOX,0.084124
ZN,0.0836
INDUS,0.083398
TAX,0.076351
PTRATIO,0.075782
CRIM,0.075745
RM,0.07482
RAD,0.072269


In [0]:
#Comparing RFM and XGB
compare_feature = pd.concat([feature_importance,feature_importance_xgb],axis=1)

In [44]:
compare_feature.sort_values(['Importances_XGB'],ascending=False)

Unnamed: 0,Importances_RFM,Importances_XGB
CHAS,0.001336,0.089555
LSTAT,0.365183,0.084717
NOX,0.023579,0.084124
ZN,0.001011,0.0836
INDUS,0.005914,0.083398
TAX,0.01261,0.076351
PTRATIO,0.02017,0.075782
CRIM,0.036794,0.075745
RM,0.439611,0.07482
RAD,0.004168,0.072269
