In [2]:
#import important libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, accuracy_score
import math
import warnings
warnings.filterwarnings('ignore')

In [3]:
dataset = pd.read_csv("50_Startups.csv")
dataset = dataset.rename(index=str, columns={"Marketing Spend" : "Marketing_Spend"})
dataset.head()
dataset.shape

(50, 5)

In [4]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing_Spend,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


In [5]:
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing_Spend    0
State              0
Profit             0
dtype: int64

In [6]:
#Splitting data
#independent variables are all columns but last
X = dataset.iloc[:, :-1].values
#dependent variable is last (profit)
y = dataset.iloc[:, 4].values

In [7]:
# Encoding categorical data (state - column index 3)
# transforms categorical entries to 1-0 columns (LabelEncoder to numbers then OneHotEncoder to columns)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

In [8]:
#dummy columns put at beginning by encoder
#1: takes all columns inc index 1 (i.e. all but 0)
X = X[:, 1:]

In [9]:
#Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [10]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
#getting predicted value
y_pred = regressor.predict(X_test)
y_pred

array([103015.20159795, 132582.27760817, 132447.73845176,  71976.09851257,
       178537.48221058, 116161.24230165,  67851.69209675,  98791.73374686,
       113969.43533013, 167921.06569553])

In [14]:
# Building the optimal model using Backward Elimination
# eliminate variables that are not stastically significant
import statsmodels.formula.api as sm
#statsmodels wants a constant term in the model but all of columns in X are currently variables
#so put an integer 1 (any constant will do) in all 50 rows for this new column (added vertically - axis has to be specified)
#actually strictly we're using append to add X to the column of 1s and then assign the result back to X
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
#going to filter X_opt down to the optimal set of features - start with all of them
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
#create an Ordinary Least Squares implementation of the model using OLS (passing in feature data set and target data set)
#need this other implementation in order to find the highest p-values for pruning
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
#summary shows p-values
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Wed, 06 Feb 2019",Prob (F-statistic):,1.34e-27
Time:,10:51:27,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [12]:
x_opt = x_data_set[:, [0,1,2,3,4,5]]
reg_ols = sm.OLS(endog= y_data_set, exog = x_opt).fit() # Fit ordinary least square to x_opt and y
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Mon, 04 Feb 2019",Prob (F-statistic):,1.34e-27
Time:,14:45:29,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [15]:
#manual inspection of summary data shows that column 2 should be removed
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()
#repeat getting summary p-value data, manually inspect and remove hightest p-value column - this time 1
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()
#column 5 also has a p-value of more than 5% (taken as significance level) so eliminating that too 
#variables remaining in optimised model then have p-values below 5%
#column 3 is R.D.Spend. (Column 0 is actually just the constant 1s column we had to add.)
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Wed, 06 Feb 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,10:51:56,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [17]:
X_Optimal_Train, X_Optimal_Test = train_test_split(x_opt,test_size = 0.2, random_state = 0)
regression.fit(X_Optimal_Train, y_train_data)

# Predicting the Optimal Test set results

Y_Optimal_Pred = regression.predict(X_Optimal_Test)

In [25]:

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

def accuracy( y_data_test, y_pred_test):
    total_error = 0
    for i in range(0, len(y_data_test)):
        total_error += abs((y_pred_test[i] - y_data_test[i]) / y_data_test[i])
    total_error = (total_error / len(y_data_test))
    accuracy = 1 - total_error
    return accuracy * 100
print("Mean absolute error ",(mean_absolute_error(y_test_data,Y_Optimal_Pred)))

print("Test EVS: ", explained_variance_score(y_test_data, y_pred_test), 
      "Train EVS: ", explained_variance_score(y_train_data, y_pred_train))

print("accuracy test: ", accuracy(np.array(y_test_data), Y_Optimal_Pred))
# print("accuracy train: ", accuracy(np.array(y_train_data), Y_Optimal_Pred))

r_score_test = r2_score(y_test_data,y_pred_test) 
r_score_train = r2_score(y_train_data, y_pred_train) 

print("r_score_test : ",r_score_test)
print("r_score_train : ",r_score_train)
# dataset.describe()

Mean absolute error  6772.453280477901
Test EVS:  0.9469192858653288 Train EVS:  0.9501847627493607
accuracy test:  [94.09060628]
r_score_test :  0.9347068473283249
r_score_train :  0.9501847627493607
