In [1]:
# Implementing multiple linear regression

In [2]:
# Data Preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Importing the dataset: every column except the last one is a feature and
# the last column is the target (selected with -1 so it always matches the
# `:-1` feature slice instead of relying on a hard-coded position).
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values


# Column 3 (the state) is categorical and has to be one-hot encoded before
# the dataset is split.
# `OneHotEncoder(categorical_features=...)` was removed in scikit-learn 0.22;
# ColumnTransformer is the supported way to encode a subset of columns. It
# places the encoded columns first and the passthrough columns after them,
# which is the same layout the old argument produced.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
# The input array has object dtype (strings mixed with numbers), so cast the
# encoded result to float for the numeric libraries used afterwards.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Avoiding the dummy variable trap: drop the first dummy column.
X = X[:, 1:]


# Splitting the dataset into the Training set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

# Feature Scaling (disabled). Kept as comments rather than a bare string so
# the cell does not echo it. The class name is StandardScaler, and the
# training data must go through fit_transform (fit alone returns the scaler).
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

In [3]:
# Train a multiple linear regression model on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line displays the fitted estimator.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [4]:
# Predict the target for the held-out test rows with the fitted regressor.
y_pred = regressor.predict(X_test)
# Bare expression so the notebook displays the predicted values.
y_pred

array([103015.20159796, 132582.27760816, 132447.73845175,  71976.09851259,
       178537.48221054, 116161.24230163,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

In [5]:
# Building the optimal model using Backward Elimination
'''We have learned about multiple linear regression and predicted the values of the dependent variable from
multiple independent variables. However, how can we identify the impact of a specific independent variable
on the dependent variable? We can apply backward elimination to the multiple linear regression model
to identify the independent variables that have the most impact on the dependent variable.'''

'''
We need to add a column of ones before going further, because backward elimination will be done with the statsmodels library,
which does not add the intercept term automatically the way the sklearn linear model does.
The general regression equation looks like y = b0 + b1x1 + b2x2 + ... + bnxn.
statsmodels would leave out b0, so to keep it we add a column of ones and our
equation becomes
 y = b0x0 + b1x1 + b2x2 + ... + bnxn, where x0 is the column of ones.
'''

'\nWe need to add a column of ones before going further, because backward elimination will be done with the statsmodels library,\nwhich does not add the intercept term automatically the way the sklearn linear model does.\nThe general regression equation looks like y = b0 + b1x1 + b2x2 + ... + bnxn.\nstatsmodels would leave out b0, so to keep it we add a column of ones and our\nequation becomes\n y = b0x0 + b1x1 + b2x2 + ... + bnxn, where x0 is the column of ones.\n'

In [None]:
# The array-based `OLS` class used in the following cells is exposed by
# `statsmodels.api`; recent statsmodels releases no longer export it from
# `statsmodels.formula.api`.
import statsmodels.api as sm

# Prepend a column of ones (the x0 term for the intercept) as column 0 of X.
# `add_constant` sizes the column from X itself instead of a hard-coded 50
# rows, and has_constant='skip' returns X unchanged when it already holds a
# constant column, so re-running this cell does not stack a second one.
X = sm.add_constant(X, prepend=True, has_constant='skip')


In [8]:
# Backward elimination, step 1: start from the full design matrix, i.e. the
# constant column plus every predictor (columns 0 through 5 of X).
X_opt = X[:, list(range(6))]
# Fit with the statsmodels OLS class (not the sklearn regressor) because its
# summary reports a p-value for each column.
regressor_ols = sm.OLS(y, X_opt).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 19 Jun 2018",Prob (F-statistic):,1.34e-27
Time:,16:49:36,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [10]:
# Backward elimination, step 2: refit without column 2 of X, which had the
# highest p-value (0.990) in the summary of the full model.
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_ols = sm.OLS(y, X_opt).fit()
regressor_ols.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 19 Jun 2018",Prob (F-statistic):,8.49e-29
Time:,17:20:18,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [12]:
# Backward elimination, step 3: refit without column 1 of X, which had the
# highest p-value (0.940) in the summary of the previous fit.
X_opt = X[:, [0, 3, 4, 5]]
regressor_ols = sm.OLS(y, X_opt).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 19 Jun 2018",Prob (F-statistic):,4.53e-30
Time:,17:21:30,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [16]:
# Backward elimination, step 4: refit without column 4 of X, which had the
# highest p-value (0.602) in the summary of the previous fit.
X_opt = X[:, [0, 3, 5]]
regressor_ols = sm.OLS(y, X_opt).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 19 Jun 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,17:23:42,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0
