## Multiple Linear Regression

### Multiple regression models the relationship between several independent variables and one dependent variable.

<img src='images/multiple_reg.PNG' />

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing the dataset

In [2]:
data = pd.read_csv('50_Startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
data.shape

(50, 5)

In [4]:
data.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [5]:
data['State'].value_counts()   # State is a nominal (unordered) categorical feature

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [6]:
data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
# Nominal feature: apply one-hot encoding to convert the categories into numeric columns.
# Ordinal feature: apply label encoding to convert the ordered categories into numbers.

### One hot encoding

In [8]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [9]:
#pd.get_dummies(data['State'])

In [10]:
# One-hot encode the nominal State column: one indicator column per state.
# dtype=int keeps the indicators as numeric 0/1 on every pandas version
# (newer releases return booleans by default).
# NOTE(review): all three indicator columns are kept, so together with the
# model intercept they are perfectly collinear; consider drop_first=True if
# the fitted coefficients are going to be interpreted.
d = pd.get_dummies(data['State'], dtype=int)

In [11]:
d.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [12]:
### display all numeric cols
#data.iloc[:,:-1].select_dtypes(exclude=['object'])     #int

In [13]:
#data.iloc[:,:-1].select_dtypes(exclude=['object'])


In [14]:
# Numeric predictors only. The target is dropped by name rather than by
# position so the cell does not depend on column order, and numeric dtypes are
# selected explicitly because text columns are not guaranteed to be `object`
# dtype on every pandas version.
f = data.drop(columns=['Profit']).select_dtypes(include='number')

In [15]:
f.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [16]:
# Feature matrix: numeric predictors side by side with the State indicators.
X = pd.concat([f, d], axis='columns')

In [17]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida,New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [18]:
# Target vector: the Profit column as a plain NumPy array.
y = data['Profit'].to_numpy()
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

In [19]:
X.shape

(50, 6)

### Split dataset

In [20]:
X.shape

(50, 6)

In [21]:
X.values

array([[1.6534920e+05, 1.3689780e+05, 4.7178410e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.6259770e+05, 1.5137759e+05, 4.4389853e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.5344151e+05, 1.0114555e+05, 4.0793454e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.4437241e+05, 1.1867185e+05, 3.8319962e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.4210734e+05, 9.1391770e+04, 3.6616842e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.3187690e+05, 9.9814710e+04, 3.6286136e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.3461546e+05, 1.4719887e+05, 1.2771682e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.3029813e+05, 1.4553006e+05, 3.2387668e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.2054252e+05, 1.4871895e+05, 3.1161329e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.2333488e+05, 1.0867917e+05,

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [23]:
# Hold out part of the data for evaluation. An integer test_size is an
# absolute number of rows, so 15 of the 50 startups go to the test set and the
# remaining 35 are used for training. random_state fixes the shuffle so the
# split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X.values,y,test_size=15,
                                                 random_state=10)

In [24]:
#X_test

In [25]:
#X_train

In [26]:
# Sanity-check the split sizes: the training shape is printed, the test shape
# is displayed as the cell result.
print(X_train.shape)
X_test.shape

(35, 6)


(15, 6)

In [27]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

In [28]:
reg.fit(X_train,y_train)

LinearRegression()

In [29]:
y_pred = reg.predict(X_test)

In [30]:
r2_score(y_test,y_pred)

0.9796796125083208

In [31]:
# `load_boston` was removed from scikit-learn in release 1.2, so the plain
# import fails on current installs. Use the built-in loader where it still
# exists; otherwise define a drop-in replacement that rebuilds the same arrays
# from the original StatLib file.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from types import SimpleNamespace

    def load_boston():
        """Return the Boston housing data with the attributes this notebook uses.

        Downloads the raw StatLib file (network access required) and exposes
        ``data`` (506 x 13 feature matrix), ``target`` (506 median values),
        ``feature_names`` and a short ``DESCR`` string.
        """
        raw = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                          sep=r'\s+', skiprows=22, header=None)
        values = raw.values
        # Each record is wrapped over two physical lines in the raw file:
        # even rows hold the first block of features, odd rows hold the last
        # two features followed by the target.
        return SimpleNamespace(
            data=np.hstack([values[::2, :], values[1::2, :2]]),
            target=values[1::2, 2],
            feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
                                    'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
                                    'LSTAT']),
            DESCR=('Boston house prices dataset: 506 instances, 13 predictive '
                   'attributes, median home value as the target.'),
        )

In [32]:
boston =  load_boston()

In [33]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [34]:
features = boston.data
target = boston.target

In [35]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's column names.
df = pd.DataFrame(data=features, columns=boston.feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [36]:
df['price'] =target

In [37]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [39]:
# Split the Boston data. This reuses (and overwrites) the X_train/X_test/
# y_train/y_test names from the startups example above.
# NOTE(review): an integer test_size is an absolute row count, so only 15 of
# the 506 rows are held out (491 are used for training). Test scores computed
# on 15 rows are noisy; a fraction such as test_size=0.2 may have been
# intended — confirm before relying on the reported test R².
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=15,
                                                 random_state=10)

In [40]:
from sklearn.linear_model import LinearRegression

mlr = LinearRegression()

In [41]:
mlr.fit(X_train,y_train)

LinearRegression()

In [42]:
pred1 = mlr.predict(X_train)

In [43]:
r2_score(y_train,pred1)

0.7397138614314798

In [44]:
pred = mlr.predict(X_test)

In [45]:
r2_score(y_test,pred)

0.6723625224532928

In [46]:
from sklearn.preprocessing import PolynomialFeatures

In [47]:
poly = PolynomialFeatures(degree = 2)

In [48]:
poly_train = poly.fit_transform(X_train)

poly_test = poly.transform(X_test)

In [49]:
poly_train.shape

(491, 105)

In [50]:
# Refit the same `mlr` estimator on the degree-2 polynomial features. This
# replaces the plain linear fit from the earlier cell, so from here on `mlr`
# expects the 105 expanded columns as input.
mlr.fit(poly_train,y_train)

LinearRegression()

In [51]:
pred_test = mlr.predict(poly_test)

pred_train = mlr.predict(poly_train)

In [52]:
r2_score(y_test,pred_test)

0.8485805658742728

In [53]:
r2_score(y_train,pred_train)

0.928130543040002

In [54]:
from sklearn.metrics import mean_squared_error

In [55]:
mean_squared_error(y_train,pred_train)

6.006081016430486

In [56]:
mean_squared_error(y_test,pred_test)

12.852050864423548

In [57]:
#from sklearn.linear_model import Lasso,Ridge

In [58]:
## Feature Scaling 
from sklearn.preprocessing import StandardScaler

In [59]:
std = StandardScaler()

In [60]:
std_train = std.fit_transform(X_train)

std_test = std.transform(X_test)

In [61]:
# Rebuild the polynomial expansion on the standardised inputs. degree=2 is the
# PolynomialFeatures default, written out to match the earlier unscaled cell.
# The expansion is fitted on the training split only and then reused for test.
poly = PolynomialFeatures(degree=2)
poly_train = poly.fit_transform(std_train)
poly_test = poly.transform(std_test)

In [62]:
mlr.fit(poly_train,y_train)

LinearRegression()

In [63]:
pred_test = mlr.predict(poly_test)

pred_train = mlr.predict(poly_train)

In [64]:
r2_score(y_test,pred_test)

0.8509331396536

In [65]:
mlr.score(poly_test,y_test) # .score() predicts on poly_test and returns the R² against y_test (same value as r2_score above)

0.8509331396536

In [66]:
r2_score(y_train,pred_train)

0.9299392581637297

In [None]:
# TODO: also try min-max scaling (e.g. MinMaxScaler) and compare the scores