In [119]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import seaborn as sns

In [120]:
df=pd.read_csv('50_startups.csv')

## Data Preprocessing

### Using label encoder and one hot encoderm

In [121]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [122]:
data = df.copy()
data['State'] = le.fit_transform(data['State'])

In [123]:
# Splitting into dependent and independent variables
x = data[['R&D Spend' , 'Administration' , 'Marketing Spend' , 'State']].values
y = data[['Profit']].values

In [124]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

x[:,3] = le.fit_transform(x[:,3])
ct     = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder = 'passthrough')
x      = np.array(ct.fit_transform(x), dtype = np.float64)

In [125]:
# To avoid Dummy variable trap
x=x[:,1:]

## Splitting the dataset into Training and Testing Data-Set

In [126]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=2)

## Creating the model

In [127]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Testing the model

In [128]:
y_predict = lr.predict(x_test)

In [129]:
print("The Predicted value from the model : ")
print(y_predict)

The Predicted value from the model : 
[[ 74061.28471133]
 [ 46009.2379876 ]
 [ 99637.26360759]
 [155786.53229373]
 [127636.76349538]
 [192765.18597813]
 [ 63906.99972423]
 [ 54935.14415853]
 [ 84532.35238006]
 [109460.29711993]]


In [130]:
print('Actual value from the observation : ')
print(y_test)

Actual value from the observation : 
[[ 90708.19]
 [ 42559.73]
 [103282.38]
 [149759.96]
 [134307.35]
 [192261.83]
 [ 65200.33]
 [ 49490.75]
 [ 81005.76]
 [108733.99]]


## The model's accuracy is 95%

In [132]:
print('Overall accuracy : ',lr.score(x,y))
print("Traning accuracy : ",lr.score(x_train,y_train))
print("Traning accuracy : ",lr.score(x_test,y_test))

Overall accuracy :  0.9503791173130328
Traning accuracy :  0.9398417195515445
Traning accuracy :  0.9783259006626758


## Preparation for Backward Elimination

In [104]:
import statsmodels.api as smf

In [105]:
## Adding the column of feature

In [106]:
x = np.append(np.ones((50,1)).astype(int),values=x,axis=1)

## By executing the above line of code, a new column will be added into our matrix of features, which will have all values equal to 1.

## Backward Elimination

![image.png](attachment:image.png)

In [107]:
# y = c*x0 + m1*x1 + m2*x2 + ..+mn*xn By default there is x0 which is 0, so the intercept is gone
# But inorder to hold the intercept, we are making x0=1. Henceforth the intercept is hold

x_opt  = x[:,[0,1,2,3,4,5]]      # Independent variable along wiht the intercept
lr_OLS = smf.OLS(endog=y , exog=x_opt).fit()      
        # An intercept is not included by default and should be added by the user.
        # endog : For dependent varaible and exog: For independent variable

In [108]:
lr_OLS.summary()   # const --> is the term we generated using np.append() and np.numpy()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 19 May 2020",Prob (F-statistic):,1.34e-27
Time:,16:42:52,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [109]:
# In the above summary, the highest P value is 0.990(99%) which is holded by x2, So remove x2
x_opt = x[:,[0,1,3,4,5]]
lr_OLS = smf.OLS(endog=y , exog=x_opt).fit()
lr_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 19 May 2020",Prob (F-statistic):,8.49e-29
Time:,16:42:52,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [110]:
# In the above summary, x1 holds the highest p value 0.940(94%). So remove x1

x_opt = x[:,[0,3,4,5]]
lr_OLS = smf.OLS(endog=y , exog=x_opt).fit()
lr_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 19 May 2020",Prob (F-statistic):,4.53e-30
Time:,16:42:52,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [111]:
# In the above summary, x2 holds the highest p value 0.602(60%). So remove x2

x_opt = x[:,[0,3,5]]
lr_OLS = smf.OLS(endog=y , exog=x_opt).fit()
lr_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 19 May 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:42:53,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [112]:
# In the above summary, x2 holds the highest p value 0.600(60%). So remove x2

x_opt = x[:,[0,3]]
lr_OLS = smf.OLS(endog=y , exog=x_opt).fit()
lr_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 19 May 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:42:53,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


### As we can see in the above output image, only two variables are left.
### So only the R&D independent variable is a significant variable for the prediction.
### So we can now predict efficiently using this variable.

In [141]:
x = df[['R&D Spend']].values
y = df[['Profit']].values

In [142]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [143]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(x_train,y_train)

y_pred = lr.predict(x_test)
y_pred

array([[104667.27805998],
       [134150.83410578],
       [135207.80019517],
       [ 72170.54428856],
       [179090.58602508],
       [109824.77386586],
       [ 65644.27773757],
       [100481.43277139],
       [111431.75202432],
       [169438.14843539]])

In [144]:
y_test

array([[103282.38],
       [144259.4 ],
       [146121.95],
       [ 77798.83],
       [191050.39],
       [105008.31],
       [ 81229.06],
       [ 97483.56],
       [110352.25],
       [166187.94]])

In [None]:
## Without applying Backward elimination

##  Overall accuracy :  0.9503791173130328
## Traning accuracy  :  0.9398417195515445
## Traning accuracy  :  0.9783259006626758

# Difference bt train and test is 0.0384841811111313

In [147]:
## With applying Backward elimination (cell : )
print('Overall score : ',lr.score(x,y))
print('Train score   : ',lr.score(x_train , y_train))
print('Test  score   : ',lr.score(x_test  , y_test))

# Difference bt train and test is 0.0014997829424175‬

Overall score :  0.9461105837157772
Train score   :  0.9449589778363044
Test  score   :  0.9464587607787219


# We got this result by using one independent variable (R&D spend) only instead of four variables. Hence, now, our model is simple and accurate.

In [149]:
ANALYSIS:

more

SyntaxError: invalid syntax (<ipython-input-149-dd5100a7575d>, line 1)