In [None]:
#Steps :
#1: Grab The Data
#2: Separate the Data into Dependant and Independent Variables
#3: Deal with Missing values (Impute/Drop) ** Not Required in this case **
#4: Encode Non Numeric categorical Data and create dummy variables
#5: Eliminate the Non Influencing factors (independent variables) 
#6: Feature Scale ** Not Used in this case **
#7: Apply Dimensionality Reduction ** Not Used in this case **
#8: Divide the data into training and testing data
#9: Build the Model, Check the Co-Efficient and the Intercepts
#10: Run The Model on Test Data using K-Fold Cross Val 
#11: Tune the Hyperparameters of the algorithm and Go to Step 9 till you find satisfactory accuracy
#12: Interpret the Results

In [6]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
dataset.head() ## Looking at top 5 records

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
X = dataset.iloc[:, :-1].values ## Grab everything except Profit, Independent Variable
y = dataset.iloc[:, 4].values  ## Grab Profit, which is dependant variable as it is dependant on R&D, Admin, Marketing, State etc

print(X[0:5]) ## print First 5 rows
print("\n")
print(y[0:5])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]


[192261.83 191792.06 191050.39 182901.99 166187.94]


In [8]:
## Lets see how many unique states exist
unique=dataset.State.unique()
print(unique)

count=dataset.State.unique().size
print(count)

['New York' 'California' 'Florida']
3


In [9]:
# Encoding categorical data : State in this case
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3]) ## X is encoded, '3' indicates that 3rd column to be encoded

print("After encoding:",X[0:5])

After encoding: [[165349.2 136897.8 471784.1 2]
 [162597.7 151377.59 443898.53 0]
 [153441.51 101145.55 407934.54 1]
 [144372.41 118671.85 383199.62 2]
 [142107.34 91391.77 366168.42 1]]


In [10]:
onehotencoder = OneHotEncoder(categorical_features = [3]) ## column of the categorical column
X = onehotencoder.fit_transform(X).toarray() ## to create dummy variables
print(X[0:5]) ## Dummy variables appear as first columns followed by other columns

TypeError: __init__() got an unexpected keyword argument 'categorical_features'

In [11]:
# Avoiding the Dummy Variable Trap
X = X[:, 1:] ## Remove the first column of X, ie, at index 0
print(X[0:5])

[[136897.8 471784.1 2]
 [151377.59 443898.53 0]
 [101145.55 407934.54 1]
 [118671.85 383199.62 2]
 [91391.77 366168.42 1]]


In [None]:
# Feature Scaling (Multiple Linear Regression libraries do it automatically)
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)
#sc_y = StandardScaler()
#y_train = sc_y.fit_transform(y_train)

In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(X_train[0:5])

[[  1.00000000e+00   0.00000000e+00   5.54939500e+04   1.03057490e+05
    2.14634810e+05]
 [  0.00000000e+00   1.00000000e+00   4.60140200e+04   8.50474400e+04
    2.05517640e+05]
 [  1.00000000e+00   0.00000000e+00   7.53288700e+04   1.44135980e+05
    1.34050070e+05]
 [  0.00000000e+00   0.00000000e+00   4.64260700e+04   1.57693920e+05
    2.10797670e+05]
 [  1.00000000e+00   0.00000000e+00   9.17491600e+04   1.14175790e+05
    2.94919570e+05]]


In [9]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Intercept ## Mean value of Y when X=0 [constant=when all independent variables are zero]
print("Intercept:",regressor.intercept_)

# Co-Efficient of each Variable
print("Regression Coeff:",regressor.coef_) 

Intercept: 42554.1676177
Regression Coeff: [ -9.59284160e+02   6.99369053e+02   7.73467193e-01   3.28845975e-02
   3.66100259e-02]


In [10]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
print(y_pred)
print("\n")
print(y_test)

[ 103015.20159796  132582.27760815  132447.73845175   71976.09851258
  178537.48221056  116161.24230166   67851.69209676   98791.73374687
  113969.43533013  167921.06569551]


[ 103282.38  144259.4   146121.95   77798.83  191050.39  105008.31
   81229.06   97483.56  110352.25  166187.94]


In [11]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Higher the Explained Variance Score, the better the model is:
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 94.6% of the variance or variability of the data is explained by the model

MAE: 7514.29365964
MSE: 83502864.0326
RMSE: 9137.99015279
Explained Variance Score: 94.6919285865


In [12]:
from sklearn.metrics import r2_score 
print('r2:',r2_score(y_test, y_pred)) ## Closer to 1 means better prediction

adj_r2=1 - float(len(y_test)-1)/(len(y_test)-len(regressor.coef_)-1)*(1 - metrics.r2_score(y_test,y_pred))
print("adj_r2=",adj_r2) ##Closer to 1 the better the prediction

r2: 0.934706847328
adj_r2= 0.853090406489


In [None]:
from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='neg_median_absolute_error') 
print (regression_avg.mean())

from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='explained_variance') 
print (regression_avg.mean()*100)

# Predicting against Real Dataset

In [None]:
dataset_test = pd.read_csv('Startups_Test_Samp.csv')
dataset_test.head()

In [None]:
X_test_samp = dataset_test.iloc[:, :-1].values ## Grab everything except Profit
y_test_samp = dataset_test.iloc[:, 4].values  ## Grab Profit

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X_test_samp[:, 3] = labelencoder.fit_transform(X_test_samp[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X_test_samp = onehotencoder.fit_transform(X_test_samp).toarray()

# Avoiding the Dummy Variable Trap
X_test_samp = X_test_samp[:, 1:]
X_test_samp

In [None]:
y_test_pred = regressor.predict(X_test_samp)
y_test_pred

# Building the Optimal Model using Backward Elimination

In [None]:
import statsmodels.formula.api as sm

In [None]:
## Multiple Linear Regression : y=b0+b1x1+b2x2+......+bnxn
## Add x0 which is linked with variable b0 and always equals to 1.
#X=np.append(arr=X,values=np.ones((50,1)).astype(int), axis=1) ## Add 1; 50 times; axis=1 as it is a row
X = dataset.iloc[:, :-1].values ## Grab everything except Profit
y = dataset.iloc[:, 4].values  ## Grab Profit


# Encoding categorical data : Country in this case
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()

X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray() ## to remove relationship

# Avoiding the Dummy Variable Trap
X = X[:, 1:] ## Remove the first column of X, ie, at index 0
X[0:5]

## y=bo+b1x1+bnxn can also be written as :
## y=boxo+b1x1+bnxn [assuming x0=1]
X=np.append(arr=np.ones((50,1)).astype(int),values=X, axis=1) ## axis=1 means add columns

In [None]:
print(dataset.head())
print("\n")
print(X[0:5])

In [None]:
#X_Opt=X[:,[0,1,2,3,4,5]]
#X_Opt=X[:,]
#X_Opt

In [None]:
regressor_OLS=sm.OLS(endog=y,exog=X).fit()

In [None]:
## ## Significance Level = 0.05 or 5%
## Lower the P-Value , better it is
## Lower P Value, reject the null hypothesuis, accept alternate hypothesis.
## Alternate hypothesis= This particular variable is influencing the outcome
## Find out the independent variable with highest P-Value
## If the variable with highest P-value is greater than Significance level, drop that variable
## Perform this exericse till the indepedent variable with highest P-Value is lower than significance level
## x1=New York
##x2=California
## x3=R&D Spend
## x4=Admin Spend
##x5=Marketing Spend
regressor_OLS.summary()                  

In [None]:
X=X[:,[0,1,3,4,5]] ## REmoved x2=California, highest P-value
X[0]
 

##x1=New York
## x2=R&D Spend
## x3=Admin Spend
##x4=Marketing Spend

In [None]:
print("\n")
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary()

In [None]:
X=X[:,[0,2,3,4]] ## REmoved x1=New York, highest P-value.
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary() 

## x1=R&D Spend
## x2=Admin Spend
## x3=Marketing Spend

In [None]:
X=X[:,[0,1,3]] ## Removed x2=Admin Spend, highest P-value; (3 and 5: R&D Spend and Marketing Spend)
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary() 

In [None]:
## x1=R&D Spend
## x2=Marketing Spend
X=X[:,[0,1]] ## Removed x2=Marketing Spend, highest P-value; (Left Over; x2: R&D Spend )
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary() 

In [None]:
print(dataset.head())
print("\n")
print(X[0])

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
print(X_train[0:5])
print(y_train[0:5])

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
print(y_pred)
print("\n")
print(y_test)

In [None]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Higher the Explained Variance Score, the better the model is:
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model



from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='neg_mean_absolute_error') 
print ("Cross val Mean Abs Error:",regression_avg.mean())

from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='explained_variance') 
print ("Cross Val Explained Variance:",regression_avg.mean()*100)


from sklearn.metrics import r2_score 
print('r2:',r2_score(y_test, y_pred)) ## Closer to 1 means better prediction

adj_r2=1 - float(len(y_test)-1)/(len(y_test)-len(regressor.coef_)-1)*(1 - metrics.r2_score(y_test,y_pred))
print("adj_r2=",adj_r2) ##Closer to 1 the better the prediction

In [None]:
# Including Marketing Spend 

In [None]:
## Fitting the model with training data of significant independent variable
print(dataset.head())
X = dataset.iloc[:, :-1].values ## Grab everything except Profit
X=X[:,[0,2]] ## Grabbing R&D Spend and Marketing Spend
X[0]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
y_pred

In [None]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Higher the Explained Variance Score, the better the model is:
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model


from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='neg_mean_absolute_error') 
print ("Cross Val Mean Absolute Error",regression_avg.mean())

from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='explained_variance') 
print ("Cross Val Explained Variance:",regression_avg.mean()*100)

from sklearn.metrics import r2_score 
print('r2:',r2_score(y_test, y_pred)) ## Closer to 1 means better prediction

adj_r2=1 - float(len(y_test)-1)/(len(y_test)-len(regressor.coef_)-1)*(1 - metrics.r2_score(y_test,y_pred))
print("adj_r2=",adj_r2) ##Closer to 1 the better the prediction

# Code to automate Backward Elimination

In [None]:
X = dataset.iloc[:, :-1].values ## Grab everything except Profit
y = dataset.iloc[:, 4].values  ## Grab Profit
X[0:5]

In [None]:
## Multiple Linear Regression : y=b0+b1x1+b2x2+......+bnxn
## Add x0 which is linked with variable b0 and always equals to 1.
#X=np.append(arr=X,values=np.ones((50,1)).astype(int), axis=1) ## Add 1; 50 times; axis=1 as it is a row

# Encoding categorical data : Country in this case
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()

X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray() ## to remove relationship

# Avoiding the Dummy Variable Trap
X = X[:, 1:] ## Remove the first column of X, ie, at index 0
X[0:5]

X=np.append(arr=np.ones((50,1)).astype(int),values=X, axis=1)
X[0:5]

In [None]:
        import statsmodels.formula.api as sm
        def backwardElimination(x, sl):
            numVars = len(x[0])
            for i in range(0, numVars):
                regressor_OLS = sm.OLS(y, x).fit()
                maxVar = max(regressor_OLS.pvalues).astype(float)
                if maxVar > sl:
                    for j in range(0, numVars - i):
                        if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                            x = np.delete(x, j, 1)
            regressor_OLS.summary()
            return x
         
        SL = 0.05
        X = backwardElimination(X, SL)

In [None]:
print(dataset.head())
print("\n")
print(X[0])

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

regressor_OLS=sm.OLS(endog=y_train,exog=X_train).fit()
y_pred = regressor_OLS.predict(X_test)
y_pred

In [None]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Higher the Explained Variance Score, the better the model is:
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model


from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='neg_mean_absolute_error') 
print ("Cross Val Mean Absolute Error",regression_avg.mean())

from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='explained_variance') 
print ("Cross Val Explained Variance:",regression_avg.mean()*100)

from sklearn.metrics import r2_score 
print('r2:',r2_score(y_test, y_pred)) ## Closer to 1 means better prediction

adj_r2=1 - float(len(y_test)-1)/(len(y_test)-len(regressor.coef_)-1)*(1 - metrics.r2_score(y_test,y_pred))
print("adj_r2=",adj_r2) ##Closer to 1 the better the prediction

# Code to Automate Backward Elimination with R-Squared

In [None]:
X = dataset.iloc[:, :-1].values ## Grab everything except Profit
y = dataset.iloc[:, 4].values  ## Grab Profit
X[0:5]

In [None]:
## Multiple Linear Regression : y=b0+b1x1+b2x2+......+bnxn
## Add x0 which is linked with variable b0 and always equals to 1.
#X=np.append(arr=X,values=np.ones((50,1)).astype(int), axis=1) ## Add 1; 50 times; axis=1 as it is a row
X = dataset.iloc[:, :-1].values ## Grab everything except Profit
y = dataset.iloc[:, 4].values  ## Grab Profit


# Encoding categorical data : Country in this case
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()

X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray() ## to remove relationship

# Avoiding the Dummy Variable Trap
X = X[:, 1:] ## Remove the first column of X, ie, at index 0
X[0:5]

X=np.append(arr=np.ones((50,1)).astype(int),values=X, axis=1)
X[0:5]

In [None]:
    ## Find the model using elimination + highest adjusted_r2    
    import statsmodels.formula.api as sm
        def backwardElimination(x, SL):
            numVars = len(x[0])
            temp = np.zeros((50,6)).astype(int) ## Only change this line
            for i in range(0, numVars):
                regressor_OLS = sm.OLS(y, x).fit()
                maxVar = max(regressor_OLS.pvalues).astype(float)
                adjR_before = regressor_OLS.rsquared_adj.astype(float)
                if maxVar > SL:
                    for j in range(0, numVars - i):
                        if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                            temp[:,j] = x[:, j]
                            x = np.delete(x, j, 1)
                            tmp_regressor = sm.OLS(y, x).fit()
                            adjR_after = tmp_regressor.rsquared_adj.astype(float)
                            if (adjR_before >= adjR_after):
                                x_rollback = np.hstack((x, temp[:,[0,j]]))
                                x_rollback = np.delete(x_rollback, j, 1)
                                print (regressor_OLS.summary())
                                return x_rollback
                            else:
                                continue
            regressor_OLS.summary()
            return x
         
        SL = 0.05
        X = backwardElimination(X , SL)

In [None]:
print(dataset.head())
print("\n")
print(X[0])

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

regressor_OLS=sm.OLS(endog=y_train,exog=X_train).fit()
y_pred = regressor_OLS.predict(X_test)
y_pred

In [None]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Higher the Explained Variance Score, the better the model is:
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model


from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='neg_mean_absolute_error') 
print ("Cross Val Mean Absolute Error",regression_avg.mean())

from sklearn.model_selection import cross_val_score 
regression_avg = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10,scoring='explained_variance') 
print ("Cross Val Explained Variance:",regression_avg.mean()*100)

from sklearn.metrics import r2_score 
print('r2:',r2_score(y_test, y_pred)) ## Closer to 1 means better prediction

adj_r2=1 - float(len(y_test)-1)/(len(y_test)-len(regressor.coef_)-1)*(1 - metrics.r2_score(y_test,y_pred))
print("adj_r2=",adj_r2) ##Closer to 1 the better the prediction