In [61]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [62]:
# Read the startups CSV into a DataFrame and preview its first three rows
csv_path = '50_Startups.csv'
dataset = pd.read_csv(csv_path)
dataset.head(n=3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39


In [63]:
# Split the frame into the matrix of features x (independent variables) and the
# vector y (dependent variable).
# The dependent variable is the last column of the dataset, so both slices use
# the same relative index: x takes everything except the last column and y takes
# only the last column. Addressing y by a fixed position (4) would silently pick
# the wrong column if the column layout ever changed.
x = dataset.iloc[:, :-1].values   # all columns except the last
y = dataset.iloc[:, -1].values    # last column only

In [64]:
# Report the dimensions of the feature matrix and of the target vector
for label, array in (('x shape (matrix of feature): ', x),
                     ('y shape (vector): ', y)):
    print(label, array.shape)

x shape (matrix of feature):  (50, 4)
y shape (vector):  (50,)


In [65]:
# Encoding the categorical independent variable 'State' (column 3 of x).
# One-hot encoding turns each state into its own 0/1 column, so no artificial
# ordering between states is introduced.
# OneHotEncoder's `categorical_features` argument no longer exists in current
# scikit-learn releases; the column to encode is selected with a
# ColumnTransformer instead. OneHotEncoder accepts the string labels directly,
# so the separate LabelEncoder pass is not needed any more.
from sklearn.compose import ColumnTransformer
# LabelEncoder is no longer used here; the import line is kept unchanged so the
# name stays available to any other cell that expects it.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

STATE_COLUMN = 3
state_transformer = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [STATE_COLUMN])],
    remainder='passthrough',   # keep the other columns, placed after the dummies
    sparse_threshold=0,        # always return a dense array
)
# Same layout as before: the state dummy columns come first, followed by the
# remaining feature columns; the result is converted to a float ndarray.
x = np.asarray(state_transformer.fit_transform(x), dtype=float)
# Fitted encoder, kept under the original variable name
onehotencoder_state = state_transformer.named_transformers_['state']

In [66]:
# Avoiding the dummy variable trap
# The state dummy columns always sum to 1 in every row, which would duplicate
# the column of 1s that is added for the intercept further down. Dropping the
# first dummy (column 0) removes that redundancy; every column from index 1
# onwards is kept.
x = np.delete(x, 0, axis=1)

In [67]:
# Building optimal Model using Backward Elimination
# The array-based OLS class is exposed by statsmodels.api; current releases of
# statsmodels.formula.api only provide the lower-case formula interface, so
# `sm.OLS` would raise AttributeError with the formula module.
import statsmodels.api as sm

# statsmodels' OLS does not add the intercept b0 on its own, so a column of 1s
# (x0 = 1, the multiplier of b0) has to be part of the matrix of features.
# The ones are passed as `arr` and x as `values`, so the column of 1s ends up
# at index 0, ahead of the features; axis=1 means "add a column".
# The number of rows is taken from x itself instead of being hard-coded to 50.
x = np.append(arr=np.ones((x.shape[0], 1)), values=x, axis=1)


In [68]:
# Backward elimination, step 1: start from the full model.
# x_opt is the optimal matrix of features: it will end up holding only the
# independent variables with a high impact on the dependent variable 'profit'.
# Every step refits the model and removes the variable with the highest
# p-value, i.e. the one that is least statistically significant.

# Begin with the intercept column (index 0) and all five independent variables
all_columns = list(range(6))
x_opt = x[:, all_columns]

# Ordinary least squares: y is endog (dependent), x_opt is exog (independent)
ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 23 Dec 2018",Prob (F-statistic):,1.34e-27
Time:,14:57:55,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [69]:
# Backward elimination, step 2.
# In the previous summary x2 has the highest p-value (0.990). It is column 2
# of x, so that column is left out and the regression is fitted again.

remaining_columns = [0, 1, 3, 4, 5]
x_opt = x[:, remaining_columns]

ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sun, 23 Dec 2018",Prob (F-statistic):,8.49e-29
Time:,14:58:12,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [70]:
# Backward elimination, step 3.
# In the previous summary x1 has the highest p-value (0.940). It is column 1
# of x, so that column is left out and the regression is fitted again.

remaining_columns = [0, 3, 4, 5]
x_opt = x[:, remaining_columns]

ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 23 Dec 2018",Prob (F-statistic):,4.53e-30
Time:,14:58:40,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [71]:
# Backward elimination, step 4.
# In the previous summary x2 has the highest p-value (0.602). The summary
# numbers the columns of x_opt, not of x: x1, x2, x3 there are columns 3, 4, 5
# of x. So x2 is column 4 of x; it is left out and the regression is refitted.

remaining_columns = [0, 3, 5]
x_opt = x[:, remaining_columns]

ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sun, 23 Dec 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,14:59:14,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [72]:
# Backward elimination, step 5.
# In the previous summary x2 has the highest p-value (0.060). The summary
# numbers the columns of x_opt, not of x: x1, x2 there are columns 3 and 5
# of x. So x2 is column 5 of x; it is left out and the regression is refitted
# with only the intercept (column 0) and column 3.

remaining_columns = [0, 3]
x_opt = x[:, remaining_columns]

ols_model = sm.OLS(y, x_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

# R&D Spend is a very significant factor for profit generation.

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 23 Dec 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,14:59:56,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
