# Two most important resources for Multiple Regression 
<br> https://www.youtube.com/watch?v=fTfMdCQJz4s for lecture 
<br> https://www.youtube.com/watch?time_continue=1553&v=NUXdtN1W1FE for python tutorial 

In [1]:
# Importing the libraries 
import numpy as np # numerical computation 
import matplotlib.pyplot as plt # 2D plotting library  
import pandas as pd # for dataframe, 2 D data with rows and columns 
import seaborn as sns # Seaborn is a library for making statistical graphics in Python
# It is built on top of matplotlib and closely integrated with pandas data structures 
%matplotlib inline
# The output of plotting commands is displayed inline within frontends like the Jupyter notebook, 
# directly below the code cell that produced it. The resulting plots will then also be stored in the notebook document.

In [2]:
# Importing the dataset 
companies = pd.read_csv('1000_Companies.csv')
companies.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Extracting the Independent and Dependent variables
# By convention X should be uppercase and y should be lowercase 
X = companies.iloc[:, 0:4].values # all rows, columns 0,1,2,3
y = companies.iloc[:, 4].values # all rows, fifth column (starts from 0(R&D Spend))
print (X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 ...
 [100275.47 241926.31 227142.82 'California']
 [128456.23 321652.14 281692.32 'California']
 [161181.72 270939.86 295442.17 'New York']]


In [4]:
# Building the Correlation Matrix
companies.corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.582434,0.978407,0.945245
Administration,0.582434,1.0,0.520465,0.74156
Marketing Spend,0.978407,0.520465,1.0,0.91727
Profit,0.945245,0.74156,0.91727,1.0


### We see that Marketing Spend and R&D Spend are extremly correlated (0.978). This will cause multi colinearity. We need to eliminate one of the independent variables. Let's eliminate Marketing Spend (0.917) and keep R&D Spend (0.945). 


In [5]:
X = companies.iloc[:, 1:4].values # Eliminating R&D Spend 
print (X)

[[136897.8 471784.1 'New York']
 [151377.59 443898.53 'California']
 [101145.55 407934.54 'Florida']
 ...
 [241926.31 227142.82 'California']
 [321652.14 281692.32 'California']
 [270939.86 295442.17 'New York']]


In [6]:
# Encoding categorical data of state (New York, California and Florida) into integers (0, 1 and 2) using LabelEncoder 
# LabelEncoder encodes the labels between 0 and n-1 classes 
# It help transform non-numerical labels to numerical labels
# It also helps to prepare the data for OneHotEncoding 
# Since OneHotEncoding takes only integer labels 

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder() # labelencoder object of class LabelEncoder 
X[:, 2] = labelencoder.fit_transform(X[:, 2]) # transforming into New York=0, California=1, Florida=2
print (X)

[[136897.8 471784.1 2]
 [151377.59 443898.53 0]
 [101145.55 407934.54 1]
 ...
 [241926.31 227142.82 0]
 [321652.14 281692.32 0]
 [270939.86 295442.17 2]]


In [7]:
# Encoding the integer labels into binary labels using OneHotEncoder 
# (New York, California and Florida) into integers (0, 1 and 2) using LabelEncoder
# and integers (0, 1 and 2) into binary (1,0), (0,1) and (0,0) using OneHotEncoder

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features =[2]) # categorical_features = column of dataframe that needs to be HotEncoded 
# Since 3rd (0,1,2) column has label encoded data in the dataFrame 
# Unlike labelEncoder, we have to give the entire dataFrame as a parameter to OneHotEncoder 
X = onehotencoder.fit_transform(X).toarray() # transform the encoded data to an array
print (X)

[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.3689780e+05 4.7178410e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.5137759e+05 4.4389853e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.0114555e+05 4.0793454e+05]
 ...
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 2.4192631e+05 2.2714282e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 3.2165214e+05 2.8169232e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 2.7093986e+05 2.9544217e+05]]


In [8]:
# One Hot Encoder automatically generates 1 extra dummy variable 
# We need only two dummy variables (0,1) for 3 types of categorical data (n-1), where n = no of categories 
# New York = 1, 0 California = 0, 1 Florida = 0, 0 
X = X[:, 1:] # all rows, 2nd to last column (0 is the 1st column)
# All this is doing is removing the first column 
print (X)

[[0.0000000e+00 1.0000000e+00 1.3689780e+05 4.7178410e+05]
 [0.0000000e+00 0.0000000e+00 1.5137759e+05 4.4389853e+05]
 [1.0000000e+00 0.0000000e+00 1.0114555e+05 4.0793454e+05]
 ...
 [0.0000000e+00 0.0000000e+00 2.4192631e+05 2.2714282e+05]
 [0.0000000e+00 0.0000000e+00 3.2165214e+05 2.8169232e+05]
 [0.0000000e+00 1.0000000e+00 2.7093986e+05 2.9544217e+05]]


In [9]:
# Splitting the dataset into the Training set and Test set 
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Convention --> (X,y) X_train, X_test, y_train, y_test 
# test_size = 0.2, 20% of the total data would be used for testing, rest for trianing 
# random_state:===================================================================================================== 
# If you don't specify the random_state in your code, then every time you run(execute) your code a new 
# random value is generated and the train and test datasets would have different values each time.
# However, if a fixed value is assigned like random_state = 42 (or 0) then no matter how many times
# you execute your code the result would be the same .i.e, same values in train and test datasets.
# In practice I would say, you should set the random_state to some fixed number while you test stuff, 
# but then remove it in production if you really need a random (and not a fixed) split.
# ==================================================================================================================

In [10]:
# Fitting Multiple Linear Regression to the Training Set 
from sklearn.linear_model import LinearRegression 
regressor = LinearRegression() # regressor object of the class LinearRegression 
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Predicting the Test set results 
y_pred = regressor.predict(X_test)
print (y_pred)
y_pred.size 

[ 90565.4549183   89231.66013825  95558.1615789  173980.70753522
  84392.40760654 110960.02030434 169853.27349786  92211.50361802
 163806.49460484  54194.35845003  68223.34420283 149414.20662372
 126880.09350258  60777.09313745 176374.873465    76473.57865297
 117801.00551914 163717.67368519 164505.66820023 180897.36578723
 101542.77105289  86534.69207536 179176.54723885  85043.09150246
 105448.72374492 101758.7541275   41156.33974605  58440.90914855
  70127.08647148 227178.56810887 120276.37402309 112021.1671163
 102074.21108737 137081.85170533  65308.13393229 109216.7567128
 184756.89248794 169834.94778712 172966.72131045 117236.93367575
  97421.45834873 164576.37223589 108107.91915771  51232.14223946
 116104.59322756  60087.11249188 157092.96939586  78960.6347705
 158375.5611409  130409.52059342 182979.83473803 172932.30629085
  94434.21503433  79432.43725437 179603.02250807  84751.6038107
 141915.91326953 169696.51197795  85324.97274627 105810.34725851
 140702.15163224  54181.83142

200

In [12]:
# Calculating the Coefficeints 
print (regressor.coef_)

[-1.46244030e+03 -6.16268772e+02  1.04077565e+00  3.55369298e-01]


### Note on number of coeffients and independent variables 
Remember there are 4 coefficients and one intercept. 
Since Marketing Spend is already eliminated 
<br> y   =  m1x1 + m2x2 + m3x3 + m4x4  + C 
<br> Profit = R&D Spend, Administration, State (3 different states) 
<br> We have three different states but we need only 2 coefficients ! 

In [13]:
# Calculating the Intercept 
print (regressor.intercept_)

-88119.45034079812


In [14]:
# Calculating the R squared value 
from sklearn.metrics import r2_score 
r2_score(y_test, y_pred)

0.8982282549585726

## Let's use Statsmodels and see the multiple regression summary  

In [15]:
import statsmodels.formula.api as smf 
df = pd.DataFrame(X_train, y_train) # Statsmodels takes only dataFrame 
reg = smf.ols(formula ='y_train~X_train', data=df).fit() 
# ols -->ordinary least squares or linear regression 
reg.summary()

0,1,2,3
Dep. Variable:,y_train,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,3480.0
Date:,"Wed, 08 Aug 2018",Prob (F-statistic):,0.0
Time:,09:09:58,Log-Likelihood:,-8481.2
No. Observations:,800,AIC:,16970.0
Df Residuals:,795,BIC:,17000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8.812e+04,3658.415,-24.087,0.000,-9.53e+04,-8.09e+04
X_train[0],-1462.4403,841.422,-1.738,0.083,-3114.111,189.231
X_train[1],-616.2688,841.847,-0.732,0.464,-2268.775,1036.238
X_train[2],1.0408,0.033,31.869,0.000,0.977,1.105
X_train[3],0.3554,0.004,80.273,0.000,0.347,0.364

0,1,2,3
Omnibus:,1073.779,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,597476.615
Skew:,6.564,Prob(JB):,0.0
Kurtosis:,136.237,Cond. No.,2860000.0
