In [1]:
import numpy as np
import pandas as pd

In [4]:
startupData = pd.read_csv('50_Startups.csv')

In [5]:
#Seperate the data as features and label
#Regression -- Sklearn expects your data (feature and label to be two dimensional)
features = startupData.iloc[:,[0,1,2,3]].values
label = startupData.iloc[:,[4]].values

In [6]:
#Handle Categorical Data
#Sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
countryLabel = LabelEncoder()
features[:,3] = countryLabel.fit_transform(features[:,3])
countryOHE = OneHotEncoder(categorical_features=[3])
features = countryOHE.fit_transform(features).toarray()
features

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [7]:
#Feature Selection -----= Feature Engineering
#Selecting the best feature that impact the quality of the model

# Method1: Using RFE (Recursive Feature Elimination)

In [8]:
#Recursive Feature Elimination can be applied to the following algorithms
#1. Based on Co-eff -----> Regression (LinearRegression, Support Vector Regression, 
#                        DecisionTreeRegression, RandomForestRegression)
#2. Based on feature importance ---> Classification (DecisionTreeClassifier, RandomForestClassifier)

In [9]:
#RFE expects your data to be NUMERIC
#[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,1.1698380e+05, 4.5173060e+04]
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

In [10]:
#Steps to apply RFE
# 1. Initalize the Algorithm 
# 2. Apply RFE on model
# 3. Interpret feature with higher ranking

In [11]:
# 1. Initalize the Algorithm 
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [13]:
# 2. Apply RFE on model -- 
from sklearn.feature_selection import RFE
selectFeatures = RFE(estimator=model,
                    step=1) #Eliminating one feature per iteration

#Testing must be done with entire data and not train test split
selectFeatures.fit(features,label)

  y = column_or_1d(y, warn=True)


RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
  n_features_to_select=None, step=1, verbose=0)

In [15]:
# 3. Interpret feature with higher ranking
print(selectFeatures.ranking_)
print(selectFeatures.support_)
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

[1 1 1 2 3 4]
[ True  True  True False False False]


# Method 2 - Univariate Analysis using ANOVA

In [16]:
# 1. Initalize the Algorithm 
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [21]:
# 2. Checking Each Feature's Variance 
# Check ANOVA for any Supervised Learning algorithms
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression # -------> For Regression
#from sklearn.feature_selection import f_classif ------->classification Algo

selectFeatures = SelectPercentile(percentile=50, score_func=f_regression) #for classification: score_func=f_classif

selectFeatures.fit(features,label)

finalFeaturesANOVA = selectFeatures.transform(features)
print("Total features {}, After Anova {}".format(features.shape,finalFeaturesANOVA.shape))
print(selectFeatures.get_support())
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

Total features (50, 6), After Anova (50, 3)
[False False False  True  True  True]


  y = column_or_1d(y, warn=True)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


# Method 3 - Select Features by Model

In [18]:
# 1. Initalize the Algorithm 
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [20]:
#2. Apply Selectby Model

from sklearn.feature_selection import SelectFromModel

selectFeatures = SelectFromModel(model)

selectFeatures.fit(features,label)

selectFeatures.get_support()
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

array([ True,  True,  True, False, False, False])