In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFromModel
import pickle
%matplotlib inline

In [2]:
# Load the startup data set; the path is resolved relative to the working directory.
startupData = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [3]:
# Preview the first five rows to check how the columns were parsed.
startupData.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
# Separate the data into features and label.
# scikit-learn expects the feature matrix to be two-dimensional (n_samples, n_features).
# The label is kept as a single-column 2-D array so the cells below see the same shape as before.
features = startupData.iloc[:, [0, 1, 2, 3]].values
label = startupData.iloc[:, [4]].values
# Build the feature frame directly from the DataFrame slice instead of from the
# `.values` array: mixing numbers with the State strings makes that array object-dtype,
# which would silently turn the three numeric spend columns into object columns.
featureDF = startupData.iloc[:, [0, 1, 2, 3]].reset_index(drop=True)
featureDF.columns = ["R&D Spend", "Administration", "Marketing Spend", "State"]


In [12]:
# One-hot encode the categorical "State" column and pass the remaining columns
# through untouched. Per the column legend used later in this notebook, the
# encoded State columns come first, followed by the passthrough columns.
state_encoder = OneHotEncoder()
ct = make_column_transformer(
    (state_encoder, ["State"]),
    remainder='passthrough',
)
features = ct.fit_transform(featureDF)

In [13]:
#Feature Selection (a Feature Engineering step)
#Selecting the features that most impact the quality of the model

# Method1: Using RFE (Recursive Feature Elimination)

In [14]:
#Recursive Feature Elimination can be applied to any estimator that exposes feature weights after fitting
#1. Based on coefficients (coef_) -----> linear models (LinearRegression, Support Vector Regression
#                        with a linear kernel)
#2. Based on feature importance (feature_importances_) ---> tree-based models (DecisionTreeRegressor,
#                        RandomForestRegressor, DecisionTreeClassifier, RandomForestClassifier)

#RFE expects your data to be NUMERIC
#[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,1.1698380e+05, 4.5173060e+04]
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

#Steps to apply RFE
# 1. Initialize the algorithm (the estimator)
# 2. Apply RFE on the model
# 3. Interpret the ranking (rank 1 = selected feature; larger ranks were eliminated earlier)

In [17]:
# Estimator whose fitted coefficients RFE uses to rank the features.
model = LinearRegression()
# step=1 removes a single feature per elimination round; n_features_to_select=None
# is the default and keeps half of the features.
selectFeatures = RFE(estimator=model, n_features_to_select=None, step=1)
# The selector is fitted on the complete data set here, not on a train/test split.
selectFeatures.fit(X=features, y=label)

RFE(estimator=LinearRegression())

In [19]:
# 3. Interpret the ranking
# ranking_: 1 marks a selected feature; larger numbers were eliminated earlier.
# support_: True for the selected features, False for the eliminated ones.
# Column order: California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark
# In the recorded output below, RFE keeps the three State dummies and drops
# Marketing Spend first (rank 4), i.e. it ranks Marketing Spend lowest, not highest.
# NOTE(review): LinearRegression coefficients depend on feature scale, and the 0/1
# dummies are on a very different scale from the raw spend amounts, so this ranking
# may reflect scale rather than predictive value -- consider standardizing first.
print(selectFeatures.ranking_, selectFeatures.support_, sep="\n")

[1 1 1 2 3 4]
[ True  True  True False False False]


# Method 2 - Univariate Analysis using ANOVA

In [21]:
# 2. Scoring each feature individually against the label (univariate F-test)
# The ANOVA F-test can be used with any supervised learning algorithm
#from sklearn.feature_selection import SelectPercentile
#from sklearn.feature_selection import f_regression # -------> For Regression
#from sklearn.feature_selection import f_classif ------->classification Algo

In [24]:
# Univariate selection: keep the top 50% of features by F-score against the label.
# For classification use score_func=f_classif instead.
selectFeatures = SelectPercentile(score_func=f_regression, percentile=50)
# f_regression expects a 1-D target. Passing the (n_samples, 1) column vector
# triggers a DataConversionWarning, so flatten it first; np.ravel is a no-op on
# a target that is already 1-D.
finalFeaturesANOVA = selectFeatures.fit_transform(features, np.ravel(label))

  return f(*args, **kwargs)


In [26]:
# Compare the feature-matrix shape before and after the ANOVA-based selection.
shape_report = "Total features {}, After Anova {}"
print(shape_report.format(features.shape, finalFeaturesANOVA.shape))
# Boolean mask of the columns that SelectPercentile kept.
# Column order: California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark
anova_mask = selectFeatures.get_support()
print(anova_mask)

Total features (50, 6), After Anova (50, 3)
[False False False  True  True  True]


# Method 3 - Select Features by Model

In [29]:
#The model itself tells us which features it considers important

In [28]:
# Build the selector around the LinearRegression estimator defined earlier and fit
# it on the full feature matrix; fit() returns the selector itself, so it can be chained.
selectFeatures = SelectFromModel(estimator=model).fit(features, label)

# Boolean mask of the columns the selector keeps (shown as the cell output).
# Column order: California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark
selectFeatures.get_support()

array([ True,  True,  True, False, False, False])