In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import pickle
%matplotlib inline

In [2]:
startupData=pd.read_csv('50_Startups.csv')

FileNotFoundError: [Errno 2] File 50_Startups.csv does not exist: '50_Startups.csv'

In [3]:
startupData.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
#Seperate the data as features and label
#Regression -- Sklearn expects your data (feature and label to be two dimensional)
features = startupData.iloc[:,[0,1,2,3]].values
label = startupData.iloc[:,[4]].values
featureDF=pd.DataFrame(data=features, index=np.arange(len(features)),columns=["R&D Spend","Administration","Marketing Spend","State"])


In [12]:
ct=make_column_transformer((OneHotEncoder(),["State"]),remainder = 'passthrough')
features = ct.fit_transform(featureDF)

In [13]:
#Feature Selection -----= Feature Engineering
#Selecting the best feature that impact the quality of the model

# Method1: Using RFE (Recursive Feature Elimination)

In [14]:
#Recursive Feature Elimination can be applied to the following algorithms
#1. Based on Co-eff -----> Regression (LinearRegression, Support Vector Regression, 
#                        DecisionTreeRegression, RandomForestRegression)
#2. Based on feature importance ---> Classification (DecisionTreeClassifier, RandomForestClassifier)

#RFE expects your data to be NUMERIC
#[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,1.1698380e+05, 4.5173060e+04]
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

#Steps to apply RFE
# 1. Initalize the Algorithm 
# 2. Apply RFE on model
# 3. Interpret feature with higher ranking

In [17]:
model= LinearRegression()
#Eliminating one feature per iteration
selectFeatures=RFE(estimator=model,step=1)
#Testing must be done with entire data and not train test split
selectFeatures.fit(features,label)

RFE(estimator=LinearRegression())

In [19]:
# 3. Interpret feature with higher ranking
print(selectFeatures.ranking_)
# Mark Spending has more effect according to RFE output
print(selectFeatures.support_)
# False means eleminate output i.e according to RFE if we select States as feture profit will be more i.e produce good model 
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

[1 1 1 2 3 4]
[ True  True  True False False False]


# Method 2 - Univariate Analysis using ANOVA

In [21]:
# 2. Checking Each Feature's Variance 
# Check ANOVA for any Supervised Learning algorithms
#from sklearn.feature_selection import SelectPercentile
#from sklearn.feature_selection import f_regression # -------> For Regression
#from sklearn.feature_selection import f_classif ------->classification Algo

In [24]:
#for classification: score_func=f_classif
selectFeatures = SelectPercentile(percentile=50,score_func=f_regression)
selectFeatures.fit(features,label)
finalFeaturesANOVA = selectFeatures.transform(features)

  return f(*args, **kwargs)


In [26]:
print("Total features {}, After Anova {}".format(features.shape,finalFeaturesANOVA.shape))
print(selectFeatures.get_support())
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

Total features (50, 6), After Anova (50, 3)
[False False False  True  True  True]


# Method 3 - Select Features by Model

In [29]:
#Model Tells which feture they are comfortable

In [28]:
selectFeatures = SelectFromModel(model)

selectFeatures.fit(features,label)

selectFeatures.get_support()
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark

array([ True,  True,  True, False, False, False])

# Feature Elemination using OLS Backward Elimination Technique

In [34]:
X_train,X_test,y_train,y_test=train_test_split(features,
                                              label,
                                              test_size=0.2,
                                              random_state=1)

In [35]:
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [88]:
print("Training Score : {}".format(model.score(X_train,y_train)))
print("Test Score : {}".format(model.score(X_test,y_test)))


Training Score : 0.942446542689397
Test Score : 0.9649618042060633


In [89]:
#Step1 - Prepare Feature ---- Intercept coeff is missing . So add the same manually
#[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,1.1698380e+05, 4.5173060e+04]
# California   ,  Florida     ,  NY          , RDSpend      , Admin       , Mark



In [90]:
#50 of 1 1D Array
np.ones(50)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [103]:
#50 of ones 2d Array
np.ones((50,1))

In [102]:
finalFeatures = np.append(np.ones((50,1)),features, axis= 1)
# InteceptCoeff_, California,  Florida,  NewYork,    RDSpend,   Administration,   Marketing Spend
#[ 1.0,               0.0,      0.0,      1.0,        165349.2,      136897.8,      471784.1]
#[ const,              x1,       x2,       x3,              x4,            x5,            x6]

In [101]:
#Step2: Apply OLS

#Iteration 1 

# ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to 
#any supported types according to the casting rule ''safe''
#If you are getting the above mentioned error, you can solve it by specifying dtype for the np.array
finalFeatures = np.array((finalFeatures), dtype=float)

#Creating OLS model ---- LinearRegressionModel ----  to check the statistical summary
#OLS(endog means label column, exog means feature column with intercept coeff)
model1=sm.OLS(endog=label, exog=finalFeatures).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Fri, 06 Aug 2021",Prob (F-statistic):,1.34e-27
Time:,00:47:30,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.763e+04,5073.636,7.417,0.000,2.74e+04,4.79e+04
x1,1.249e+04,2449.797,5.099,0.000,7554.868,1.74e+04
x2,1.269e+04,2726.700,4.654,0.000,7195.596,1.82e+04
x3,1.245e+04,2486.364,5.007,0.000,7439.285,1.75e+04
x4,0.8060,0.046,17.369,0.000,0.712,0.900
x5,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x6,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,2.69e+17


In [None]:
# InteceptCoeff_, California,  Florida,  NewYork,    RDSpend,   Administration,   Marketing Spend
#[ 1.0,               0.0,      0.0,      1.0,        165349.2,      136897.8,      471784.1]
#[ const,              x1,       x2,       x3,              x4,            x5,            x6]

# We can see that P value x5(Administration)	0.608	 is highest so we can eleminate this feture
#Conclusion is to eliminate x5 --- Eliminating Adminstration

In [105]:
#Iteration 2 after  eliminate x5 --- Eliminating Adminstration we run OLS again and Generate Model
newFeatures = np.array((finalFeatures[:,[0,1,2,3,4,6]]), dtype=float) #no 5 feture Removed from Array
model1 = sm.OLS(endog=label, exog=newFeatures).fit() #to create the equation
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Fri, 06 Aug 2021",Prob (F-statistic):,9.720000000000001e-29
Time:,00:57:32,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.525e+04,2100.376,16.782,0.000,3.1e+04,3.95e+04
x1,1.171e+04,1910.312,6.130,0.000,7861.854,1.56e+04
x2,1.185e+04,2170.903,5.459,0.000,7477.785,1.62e+04
x3,1.169e+04,1988.428,5.879,0.000,7684.996,1.57e+04
x4,0.7967,0.042,18.771,0.000,0.711,0.882
x5,0.0298,0.016,1.842,0.072,-0.003,0.062

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,6.69e+17


In [None]:
# InteceptCoeff_, California,  Florida,  NewYork,    RDSpend,      Marketing Spend
#[ 1.0,               0.0,      0.0,      1.0,        165349.2,        471784.1]
#[ const,              x1,       x2,       x3,              x4,              x5]

# We can see that P value x5(Marketing Spend)	0.072	 is highest so we can eleminate this feture
#Conclusion is to eliminate x5 --- Marketing Spend

In [108]:
#Conclusion is eliminate x5 ---> Marketing Spend
#Iteration 3
newFeatures = np.array((newFeatures[:,[0,1,2,3,4]]), dtype=float) 
model1 = sm.OLS(endog=label, exog=newFeatures).fit() #to create the equation
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Fri, 06 Aug 2021",Prob (F-statistic):,2.76e-29
Time:,01:18:13,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,1959.786,18.806,0.000,3.29e+04,4.08e+04
x1,1.189e+04,1956.677,6.079,0.000,7955.697,1.58e+04
x2,1.306e+04,2122.665,6.152,0.000,8785.448,1.73e+04
x3,1.19e+04,2036.022,5.847,0.000,7805.580,1.6e+04
x4,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,3.7e+17


In [None]:
#Since all p valus are 0 so feature elemination is requied