In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
#data set of startups using variables R&D Spend, Administration, Marketing Spend, State and Profit
dataset = pd.read_csv('50_Startups.csv')
#independent variables are all columns but last
X = dataset.iloc[:, :-1].values
#dependent variable is last (profit)
y = dataset.iloc[:, 4].values

dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Encoding categorical data (state - column index 3)
# transforms categorical entries to 1-0 columns (LabelEncoder to numbers then OneHotEncoder to columns)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder()
X = onehotencoder.fit_transform(X).toarray()

In [5]:

# Avoiding the Dummy Variable Trap
# one of the dummy variables is implied by others so remove one(any one will do)
#dummy columns put at beginning by encoder
#1: takes all columns inc index 1 (i.e. all but 0)
X = X[:, 1:]

In [6]:
# Splitting the dataset into the Training set and Test set
# Use 80% of data as training set and 20% as test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [7]:
# No Feature Scaling LinearRegression class takes care of it
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train = sc_y.fit_transform(y_train)'

In [8]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [9]:
# Predicting the Test set results
#these can then be manually inspected in IDE's explorer by correlating with y_test (rows are in same order)
#script doesn't do the comparison
y_pred = regressor.predict(X_test)

In [10]:

# Building the optimal model using Backward Elimination
# eliminate variables that are not stastically significant
import statsmodels.formula.api as sm
#statsmodels wants a constant term in the model but all of columns in X are currently variables
#so put an integer 1 (any constant will do) in all 50 rows for this new column (added vertically - axis has to be specified)
#actually strictly we're using append to add X to the column of 1s and then assign the result back to X
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)

In [11]:
#going to filter X_opt down to the optimal set of features - start with all of them
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

In [13]:
import statsmodels.api as sm

#create an Ordinary Least Squares implementation of the model using OLS (passing in feature data set and target data set)
#need this other implementation in order to find the highest p-values for pruning
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [14]:
#summary shows p-values
regressor_OLS.summary()
#manual inspection of summary data shows that column 2 should be removed
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.169
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,2.286
Date:,"Mon, 21 Jun 2021",Prob (F-statistic):,0.0747
Time:,14:11:22,Log-Likelihood:,-596.03
No. Observations:,50,AIC:,1202.0
Df Residuals:,45,BIC:,1212.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.166e+05,5653.397,20.628,0.000,1.05e+05,1.28e+05
x1,-8.095e+04,3.88e+04,-2.089,0.042,-1.59e+05,-2883.631
x2,-6.713e+04,3.88e+04,-1.732,0.090,-1.45e+05,1.09e+04
x3,-4.686e+04,3.88e+04,-1.209,0.233,-1.25e+05,3.12e+04
x4,-3.539e+04,3.88e+04,-0.913,0.366,-1.13e+05,4.27e+04

0,1,2,3
Omnibus:,0.992,Durbin-Watson:,0.475
Prob(Omnibus):,0.609,Jarque-Bera (JB):,0.329
Skew:,-0.022,Prob(JB):,0.848
Kurtosis:,3.395,Cond. No.,7.38


In [15]:
#repeat getting summary p-value data, manually inspect and remove hightest p-value column - this time 1
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.063
Model:,OLS,Adj. R-squared:,0.023
Method:,Least Squares,F-statistic:,1.588
Date:,"Mon, 21 Jun 2021",Prob (F-statistic):,0.215
Time:,14:11:33,Log-Likelihood:,-599.02
No. Observations:,50,AIC:,1204.0
Df Residuals:,47,BIC:,1210.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.14e+05,5749.145,19.821,0.000,1.02e+05,1.26e+05
x1,-6.447e+04,4.02e+04,-1.602,0.116,-1.45e+05,1.65e+04
x2,-3.273e+04,4.02e+04,-0.813,0.420,-1.14e+05,4.82e+04

0,1,2,3
Omnibus:,0.287,Durbin-Watson:,0.14
Prob(Omnibus):,0.866,Jarque-Bera (JB):,0.017
Skew:,-0.031,Prob(JB):,0.992
Kurtosis:,3.065,Cond. No.,7.22


In [16]:

#column 5 also has a p-value of more than 5% (taken as significance level) so eliminating that too 
#variables remaining in optimised model then have p-values below 5%
#column 3 is R.D.Spend. (Column 0 is actually just the constant 1s column we had to add.)
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.05
Model:,OLS,Adj. R-squared:,0.03
Method:,Least Squares,F-statistic:,2.532
Date:,"Mon, 21 Jun 2021",Prob (F-statistic):,0.118
Time:,14:11:42,Log-Likelihood:,-599.37
No. Observations:,50,AIC:,1203.0
Df Residuals:,48,BIC:,1207.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.133e+05,5670.069,19.980,0.000,1.02e+05,1.25e+05
x1,-6.38e+04,4.01e+04,-1.591,0.118,-1.44e+05,1.68e+04

0,1,2,3
Omnibus:,0.164,Durbin-Watson:,0.114
Prob(Omnibus):,0.921,Jarque-Bera (JB):,0.001
Skew:,0.009,Prob(JB):,1.0
Kurtosis:,2.989,Cond. No.,7.15
