# Multiple Linear Regression Pred. Model

# Author - Amresh Mallick

In [111]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

# Load DataSet

In [76]:
df = pd.read_csv('Startup.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Declare variables 

In [77]:
real_x = df.iloc[:,0:4].values
real_y = df.iloc[:,4].values
real_x

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [78]:
real_y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

# Apply Label Encoding  in col_ 3

In [79]:
le = LabelEncoder()
real_x[:,3] = le.fit_transform(real_x[:,3])
one_HE = OneHotEncoder()
real_x = one_HE.fit_transform(real_x).toarray()
real_x

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [81]:
real_x = real_x[:,1:]

# Training and Testing of Data

In [82]:
training_x,test_x,training_y,test_y = train_test_split(real_x,real_y,test_size = 0.2, 
                                                      random_state = 0)
training_x
# test vs train ratio is 20:80

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

# Training State of variables

In [83]:
MLR = LinearRegression()
MLR.fit(training_x,training_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Predication values

In [84]:
pred_y = MLR.predict(test_x)

In [85]:
pred_y  # pred. values by our model

array([107705.39463769,  95234.51853618, 105427.60785688,  95573.32650472,
       107947.57711205, 119692.1869409 , 129982.82378769, 134935.25630936,
       100747.44559631, 107947.57711205])

In [86]:
test_y # compare the value of test_y with pred_y.

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [87]:
MLR.coef_  # B1

array([ -9948.7974978 , -21162.9975427 ,  11752.6910622 ,  15484.61911154,
       -37103.36095092,  17188.01942076, -12374.25060733,   3241.20468161,
        17267.68569599,  -5673.717017  ,  -3862.6402232 , -14056.89459462,
          782.45855991,  -5951.54130743,  20437.05163322,  -7561.2229084 ,
        13058.82950427,   4336.71270227,  -2600.74415359,   -242.18247436,
        -4765.03419868,   5193.98226476,  -7200.13151575,   -733.42794541,
        17987.40226837,  -7880.82203004,  13246.22371396,   -505.83754946,
        21000.75882038,  -3800.24493615,  18228.9311628 ,  15259.58343209,
         3262.06267722,    416.08867022,  -2519.96925518,  13710.90502619,
        15503.95736574,  15387.76056802,  19664.67014293,  15935.00762932,
        14164.30510795,  20434.69337801,      0.        ,  22801.26177462,
            0.        ,  32324.54337801,  25921.20844129,   -503.32482138,
       -42457.71932464,      0.        ,  -4069.47642448,      0.        ,
        -1980.90004076,  

In [88]:
MLR.intercept_  # B0, y = B0 + B1x +.......

112646.22063516265

In [89]:
# y = b0 + b1x1 + b2x2 +.....+ bnxn.

In [90]:
real_x = np.append(arr = np.ones((50,1)).astype(int),values = real_x, axis = 1)

In [91]:
x_opt = real_x[:,[0,1,2,3,4]]

In [115]:
# OLS = ordinary least sq., endog = o/p, exog = optimazation
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()

In [118]:
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.12
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,1.54
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.207
Time:,19:05:58,Log-Likelihood:,-597.45
No. Observations:,50,AIC:,1205.0
Df Residuals:,45,BIC:,1214.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.16e+05,5816.040,19.942,0.000,1.04e+05,1.28e+05
x1,-5.106e+04,3.99e+04,-1.281,0.207,-1.31e+05,2.93e+04
x2,-6.649e+04,3.99e+04,-1.668,0.102,-1.47e+05,1.38e+04
x3,-4.622e+04,3.99e+04,-1.159,0.252,-1.27e+05,3.41e+04
x4,-3.475e+04,3.99e+04,-0.872,0.388,-1.15e+05,4.56e+04

0,1,2,3
Omnibus:,1.11,Durbin-Watson:,0.223
Prob(Omnibus):,0.574,Jarque-Bera (JB):,0.429
Skew:,-0.119,Prob(JB):,0.807
Kurtosis:,3.386,Cond. No.,7.38


In [119]:
x_opt = real_x[:,[0,1,2,3,4]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.12
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,1.54
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.207
Time:,19:35:43,Log-Likelihood:,-597.45
No. Observations:,50,AIC:,1205.0
Df Residuals:,45,BIC:,1214.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.16e+05,5816.040,19.942,0.000,1.04e+05,1.28e+05
x1,-5.106e+04,3.99e+04,-1.281,0.207,-1.31e+05,2.93e+04
x2,-6.649e+04,3.99e+04,-1.668,0.102,-1.47e+05,1.38e+04
x3,-4.622e+04,3.99e+04,-1.159,0.252,-1.27e+05,3.41e+04
x4,-3.475e+04,3.99e+04,-0.872,0.388,-1.15e+05,4.56e+04

0,1,2,3
Omnibus:,1.11,Durbin-Watson:,0.223
Prob(Omnibus):,0.574,Jarque-Bera (JB):,0.429
Skew:,-0.119,Prob(JB):,0.807
Kurtosis:,3.386,Cond. No.,7.38


In [120]:
x_opt = real_x[:,[0,1,2,3]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.047
Method:,Least Squares,F-statistic:,1.809
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.159
Time:,19:37:01,Log-Likelihood:,-597.87
No. Observations:,50,AIC:,1204.0
Df Residuals:,46,BIC:,1211.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.152e+05,5738.788,20.082,0.000,1.04e+05,1.27e+05
x1,-5.032e+04,3.98e+04,-1.266,0.212,-1.3e+05,2.97e+04
x2,-6.575e+04,3.98e+04,-1.654,0.105,-1.46e+05,1.43e+04
x3,-4.548e+04,3.98e+04,-1.144,0.259,-1.26e+05,3.45e+04

0,1,2,3
Omnibus:,0.765,Durbin-Watson:,0.19
Prob(Omnibus):,0.682,Jarque-Bera (JB):,0.209
Skew:,-0.073,Prob(JB):,0.901
Kurtosis:,3.281,Cond. No.,7.3


In [121]:
x_opt = real_x[:,[0,1,2]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.041
Method:,Least Squares,F-statistic:,2.046
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.141
Time:,19:37:32,Log-Likelihood:,-598.57
No. Observations:,50,AIC:,1203.0
Df Residuals:,47,BIC:,1209.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.143e+05,5697.315,20.061,0.000,1.03e+05,1.26e+05
x1,-4.937e+04,3.99e+04,-1.238,0.222,-1.3e+05,3.09e+04
x2,-6.481e+04,3.99e+04,-1.625,0.111,-1.45e+05,1.54e+04

0,1,2,3
Omnibus:,0.411,Durbin-Watson:,0.121
Prob(Omnibus):,0.814,Jarque-Bera (JB):,0.045
Skew:,-0.029,Prob(JB):,0.978
Kurtosis:,3.135,Cond. No.,7.22


In [122]:
x_opt = real_x[:,[0,2,3]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.035
Method:,Least Squares,F-statistic:,1.889
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.163
Time:,19:37:49,Log-Likelihood:,-598.72
No. Observations:,50,AIC:,1203.0
Df Residuals:,47,BIC:,1209.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.142e+05,5714.924,19.982,0.000,1.03e+05,1.26e+05
x1,-6.47e+04,4e+04,-1.617,0.112,-1.45e+05,1.58e+04
x2,-4.444e+04,4e+04,-1.111,0.272,-1.25e+05,3.6e+04

0,1,2,3
Omnibus:,0.375,Durbin-Watson:,0.177
Prob(Omnibus):,0.829,Jarque-Bera (JB):,0.036
Skew:,-0.032,Prob(JB):,0.982
Kurtosis:,3.115,Cond. No.,7.22


In [123]:
x_opt = real_x[:,[0,1]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,1.404
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,0.242
Time:,19:38:09,Log-Likelihood:,-599.93
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.13e+05,5734.430,19.701,0.000,1.01e+05,1.25e+05
x1,-4.805e+04,4.05e+04,-1.185,0.242,-1.3e+05,3.35e+04

0,1,2,3
Omnibus:,0.115,Durbin-Watson:,0.099
Prob(Omnibus):,0.944,Jarque-Bera (JB):,0.007
Skew:,-0.015,Prob(JB):,0.996
Kurtosis:,2.949,Cond. No.,7.15


In [124]:
x_opt = real_x[:,[0]]
reg_OLS = sm.OLS(endog= real_y, exog=x_opt).fit()
reg_OLS.summary()

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,
Time:,19:38:24,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.12e+05,5700.155,19.651,0.000,1.01e+05,1.23e+05

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,1.0
