# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
'''splitting the dataset for the training and testing split'''

dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

## Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [5]:
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [7]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

## Predicting the Test set results

In [8]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)) 

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [13]:
#!pip install statsmodels
'''Here importing the library to know which of our independent variables
are having the lowest significance to profit column which is our dependent variable
and applying backward elimination method to remove least significance column'''
import statsmodels.api as sm
X = np.append(arr = np.ones((50,1)).astype(int), values = X, axis=1)  #adding the column for ones at 1st column in out dataset to satisfy multiple linear regression eqation
X_opt = X[:, [0, 1, 2, 3, 4, 5]].astype(int)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()  #endog ->dependent variable ,exog = independent variable
regressor_OLS.summary()



0,1,2,3
Dep. Variable:,y,R-squared:,0.024
Model:,OLS,Adj. R-squared:,-0.018
Method:,Least Squares,F-statistic:,0.5748
Date:,"Thu, 25 Feb 2021",Prob (F-statistic):,0.567
Time:,00:26:26,Log-Likelihood:,-600.05
No. Observations:,50,AIC:,1206.0
Df Residuals:,47,BIC:,1212.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x1,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x2,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x3,2974.4815,8105.911,0.367,0.715,-1.33e+04,1.93e+04
x4,1.784e+04,8254.519,2.162,0.036,1237.380,3.44e+04
x5,1.283e+04,8105.911,1.582,0.120,-3481.236,2.91e+04

0,1,2,3
Omnibus:,0.111,Durbin-Watson:,0.081
Prob(Omnibus):,0.946,Jarque-Bera (JB):,0.207
Skew:,0.104,Prob(JB):,0.902
Kurtosis:,2.762,Cond. No.,6.51e+17


In [15]:
X_opt = X[:, [0, 1, 2, 4, 5]].astype(int)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()  #endog ->dependent variable ,exog = independent variable
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.024
Model:,OLS,Adj. R-squared:,-0.018
Method:,Least Squares,F-statistic:,0.5748
Date:,"Thu, 25 Feb 2021",Prob (F-statistic):,0.567
Time:,00:26:58,Log-Likelihood:,-600.05
No. Observations:,50,AIC:,1206.0
Df Residuals:,47,BIC:,1212.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.464e+04,3287.212,10.536,0.000,2.8e+04,4.12e+04
x1,3.464e+04,3287.212,10.536,0.000,2.8e+04,4.12e+04
x2,3.464e+04,3287.212,10.536,0.000,2.8e+04,4.12e+04
x3,1.487e+04,1.42e+04,1.050,0.299,-1.36e+04,4.34e+04
x4,9851.2712,1.39e+04,0.706,0.483,-1.82e+04,3.79e+04

0,1,2,3
Omnibus:,0.111,Durbin-Watson:,0.081
Prob(Omnibus):,0.946,Jarque-Bera (JB):,0.207
Skew:,0.104,Prob(JB):,0.902
Kurtosis:,2.762,Cond. No.,3.68e+16


In [16]:
X_opt = X[:, [0, 1, 2, 4]].astype(int)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()  #endog ->dependent variable ,exog = independent variable
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.014
Model:,OLS,Adj. R-squared:,-0.007
Method:,Least Squares,F-statistic:,0.6575
Date:,"Thu, 25 Feb 2021",Prob (F-statistic):,0.421
Time:,00:31:14,Log-Likelihood:,-600.31
No. Observations:,50,AIC:,1205.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.628e+04,2312.246,15.689,0.000,3.16e+04,4.09e+04
x1,3.628e+04,2312.246,15.689,0.000,3.16e+04,4.09e+04
x2,3.628e+04,2312.246,15.689,0.000,3.16e+04,4.09e+04
x3,9943.2135,1.23e+04,0.811,0.421,-1.47e+04,3.46e+04

0,1,2,3
Omnibus:,0.077,Durbin-Watson:,0.058
Prob(Omnibus):,0.962,Jarque-Bera (JB):,0.123
Skew:,0.08,Prob(JB):,0.94
Kurtosis:,2.817,Cond. No.,2.61e+18


In [17]:
X_opt = X[:, [0, 1, 2]].astype(int)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()  #endog ->dependent variable ,exog = independent variable
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Thu, 25 Feb 2021",Prob (F-statistic):,
Time:,00:32:50,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x1,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x2,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,1.47e+32
