In [1]:
import numpy as np
import pandas as pd

In [2]:
startupData = pd.read_csv('50_Startups.csv')

In [3]:
#Seperate the data as features and label
#Regression -- Sklearn expects your data (feature and label to be two dimensional)
features = startupData.iloc[:,[0,1,2,3]].values
label = startupData.iloc[:,[4]].values

In [4]:
#Handle Categorical Data
#Sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
countryLabel = LabelEncoder()
features[:,3] = countryLabel.fit_transform(features[:,3])
countryOHE = OneHotEncoder(categorical_features=[3])
features = countryOHE.fit_transform(features).toarray()
features

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(features,
                                              label,
                                              test_size=0.2,
                                              random_state=1)

In [6]:
#Create the model(Training the model)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train) #Create the equation

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
model.score(X_train,y_train)

0.9424465426893971

In [8]:
model.score(X_test,y_test)

0.9649618042060633

# OLS Backward Elimination Technique

In [14]:
#Step1 - Prepare Feature ---- Intercept coeff is missing . So add the same manually
finalFeatures = np.append(np.ones((50,1)),features, axis= 1)

In [15]:
#Step2: Apply OLS

#Iteration 1 
import statsmodels.formula.api as sm
#Creating OLS model ---- LinearRegressionModel ----  to check the statistical summary
#OLS(endog means label column, exog means feature column with intercept coeff)

model1 = sm.OLS(endog=label, exog=finalFeatures).fit() #to create the equation
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 04 Aug 2019",Prob (F-statistic):,1.34e-27
Time:,21:58:42,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.763e+04,5073.636,7.417,0.000,2.74e+04,4.79e+04
x1,1.249e+04,2449.797,5.099,0.000,7554.868,1.74e+04
x2,1.269e+04,2726.700,4.654,0.000,7195.596,1.82e+04
x3,1.245e+04,2486.364,5.007,0.000,7439.285,1.75e+04
x4,0.8060,0.046,17.369,0.000,0.712,0.900
x5,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x6,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1.81e+17


In [16]:
#Conclusion is to eliminate x5 --- Eliminating Adminstration
#Iteration 2
newFeatures = finalFeatures[:,[0,1,2,3,4,6]]
model1 = sm.OLS(endog=label, exog=newFeatures).fit() #to create the equation
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Sun, 04 Aug 2019",Prob (F-statistic):,9.720000000000001e-29
Time:,22:01:06,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.525e+04,2100.376,16.782,0.000,3.1e+04,3.95e+04
x1,1.171e+04,1910.312,6.130,0.000,7861.854,1.56e+04
x2,1.185e+04,2170.903,5.459,0.000,7477.785,1.62e+04
x3,1.169e+04,1988.428,5.879,0.000,7684.996,1.57e+04
x4,0.7967,0.042,18.771,0.000,0.711,0.882
x5,0.0298,0.016,1.842,0.072,-0.003,0.062

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,1.74e+17


In [17]:
#Conclusion is eliminate x5 ---> Marketing
#Iteration 3
newFeatures = newFeatures[:,[0,1,2,3,4]]
model1 = sm.OLS(endog=label, exog=newFeatures).fit() #to create the equation
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sun, 04 Aug 2019",Prob (F-statistic):,2.76e-29
Time:,22:02:24,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,1959.786,18.806,0.000,3.29e+04,4.08e+04
x1,1.189e+04,1956.677,6.079,0.000,7955.697,1.58e+04
x2,1.306e+04,2122.665,6.152,0.000,8785.448,1.73e+04
x3,1.19e+04,2036.022,5.847,0.000,7805.580,1.6e+04
x4,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,3.2e+17
