In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# The data set lists, per startup, the amounts spent on R&D, administration
# and marketing, the state, and the profit the startup made.
# The objective is to understand whether investing in a startup is
# profitable or not.
#
# Profit is the dependent variable; the remaining columns are the
# independent variables.
data=pd.read_csv("50_Startups.csv")
data.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Count the missing values in each column.
data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [4]:
# Multiple linear regression is linear regression with several input variables.
# The input and output variables are continuous. Inputs that are discrete
# (categorical rather than continuous) must first be converted into dummy
# variables.

features = data.iloc[:, :4]  # the first four columns: every column before Profit
target = data.iloc[:, 4]     # the fifth column: Profit

X = features.values
y = target.values

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
# One-hot encode the categorical State column (index 3). The dummy columns are
# placed first, followed by the three numeric spend columns, which is the same
# layout the previous OneHotEncoder(categorical_features=[3]) call produced.
# That argument was removed from scikit-learn, so the encoding is done with
# pandas instead. get_dummies orders its columns by sorted category, matching
# the integer codes LabelEncoder assigned.
state_dummies = pd.get_dummies(X[:, 3]).values.astype(float)
# X holds mixed types (object dtype), so cast the numeric columns explicitly.
X = np.hstack([state_dummies, X[:, :3].astype(float)])
print(X)

[[  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.65349200e+05
    1.36897800e+05   4.71784100e+05]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.62597700e+05
    1.51377590e+05   4.43898530e+05]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   1.53441510e+05
    1.01145550e+05   4.07934540e+05]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.44372410e+05
    1.18671850e+05   3.83199620e+05]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   1.42107340e+05
    9.13917700e+04   3.66168420e+05]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.31876900e+05
    9.98147100e+04   3.62861360e+05]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.34615460e+05
    1.47198870e+05   1.27716820e+05]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   1.30298130e+05
    1.45530060e+05   3.23876680e+05]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.20542520e+05
    1.48718950e+05   3.11613290e+05]
 [  1.00000000e+00   0.00000000e+00  

In [7]:
# Drop the first column (one of the State dummy columns) so the remaining
# dummies are not perfectly collinear with the model's intercept.
X=X[:,1:]

In [8]:
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split



In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
#Now apply linear model
from sklearn.linear_model import LinearRegression


In [11]:
# Fit the regressor to the training set. fit() returns the estimator itself,
# so the trailing expression displays the fitted model as before.
regressor = LinearRegression().fit(X_train, y_train)
regressor


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Predict the profit for the held-out test set.
y_pred=regressor.predict(X_test)

In [13]:
y_pred  # predicted profit for the test set

array([ 103015.20159796,  132582.27760815,  132447.73845175,
         71976.09851258,  178537.48221056,  116161.24230166,
         67851.69209676,   98791.73374687,  113969.43533013,
        167921.06569551])

In [14]:
y_test  # real (observed) profit for the test set

array([ 103282.38,  144259.4 ,  146121.95,   77798.83,  191050.39,
        105008.31,   81229.06,   97483.56,  110352.25,  166187.94])

### Backward Elimination

In [15]:
# OLS is exposed by statsmodels.api; current releases of
# statsmodels.formula.api only provide the lowercase formula interface (ols),
# so sm.OLS would raise AttributeError with the old import.
import statsmodels.api as sm
# The equation of linear regression is y = b0 + b1*x1 + b2*x2 + ..., where b0
# is the constant. scikit-learn's LinearRegression adds the constant b0
# automatically (fit_intercept=True); statsmodels' OLS does not, so a column
# of ones is added as the first column of X.

In [16]:
# Prepend a column of ones to X to act as the intercept term for OLS.
# The ones are passed as `arr` and X as `values` so the constant becomes the
# first column rather than the last. The number of rows is taken from X
# itself instead of being hard-coded to 50.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,1.0,165349.2,136897.8,471784.1
1,1.0,0.0,0.0,162597.7,151377.59,443898.53
2,1.0,1.0,0.0,153441.51,101145.55,407934.54
3,1.0,0.0,1.0,144372.41,118671.85,383199.62
4,1.0,1.0,0.0,142107.34,91391.77,366168.42
5,1.0,0.0,1.0,131876.9,99814.71,362861.36
6,1.0,0.0,0.0,134615.46,147198.87,127716.82
7,1.0,1.0,0.0,130298.13,145530.06,323876.68
8,1.0,0.0,1.0,120542.52,148718.95,311613.29
9,1.0,0.0,0.0,123334.88,108679.17,304981.62


In [17]:
# X_opt will eventually hold only the statistically significant variables.
# Step 2: fit the full model with all possible predictors.
selected_columns = [0, 1, 2, 3, 4, 5]
X_opt = X[:, selected_columns]
# First argument (endog) is the dependent variable; second (exog) holds the
# independent variables.
regressor_OLS = sm.OLS(y, X_opt).fit()



In [18]:
# Step 3: inspect the summary to find the least significant variable.
print(regressor_OLS.summary())
# The variable with index 2 has the highest p-value, so it is removed first.
# Step 4: remove that variable, refit, and repeat the operation.


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Sat, 26 May 2018   Prob (F-statistic):           1.34e-27
Time:                        18:38:32   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.013e+04   6884.820      7.281      0.0

In [19]:
# Refit without column 2 (the second State dummy) and show the summary.
selected_columns = [0, 1, 3, 4, 5]
X_opt = X[:, selected_columns]
regressor_OLS = sm.OLS(y, X_opt).fit()
ols_summary = regressor_OLS.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     217.2
Date:                Sat, 26 May 2018   Prob (F-statistic):           8.49e-29
Time:                        18:38:32   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1061.
Df Residuals:                      45   BIC:                             1070.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.011e+04   6647.870      7.537      0.0

In [20]:
# Refit without column 1 as well, so no State dummy remains in the model.
selected_columns = [0, 3, 4, 5]
X_opt = X[:, selected_columns]
regressor_OLS = sm.OLS(y, X_opt).fit()
ols_summary = regressor_OLS.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     296.0
Date:                Sat, 26 May 2018   Prob (F-statistic):           4.53e-30
Time:                        18:38:33   Log-Likelihood:                -525.39
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      46   BIC:                             1066.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.012e+04   6572.353      7.626      0.0

In [21]:
# Refit without column 4, keeping the constant and columns 3 and 5.
selected_columns = [0, 3, 5]
X_opt = X[:, selected_columns]
regressor_OLS = sm.OLS(y, X_opt).fit()
ols_summary = regressor_OLS.summary()
print(ols_summary)
# The highest remaining p-value is 6%, so that independent variable is
# removed next.

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Sat, 26 May 2018   Prob (F-statistic):           2.16e-31
Time:                        18:38:33   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.698e+04   2689.933     17.464      0.0

In [22]:
# Final refit: the constant plus column 3, the only independent variable left.
selected_columns = [0, 3]
X_opt = X[:, selected_columns]
regressor_OLS = sm.OLS(y, X_opt).fit()
ols_summary = regressor_OLS.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Sat, 26 May 2018   Prob (F-statistic):           3.50e-32
Time:                        18:40:46   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.903e+04   2537.897     19.320      0.0

So R&D Spend is the only independent variable left after backward elimination: it is a strong predictor of
profit, and the one variable that predicts profit with high statistical significance.