# Multiple Linear Regression

## Import the Library

In [65]:
# Import the library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import the Dataset

In [19]:
# import dataset
data = pd.read_csv("50_Startups.csv")
data.head(10)   # print the first 10 records.

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [20]:
data.shape # print shape.

(50, 5)

In [21]:
data.columns  # Print columns.

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [22]:
data.describe()   # Describe the dataset.

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [23]:
data.isnull()   # Check the null values.

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [24]:
data.isnull().sum()   # Print sum of null values.

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

### Formula:  y = b0 + b1*x1 + b2*x2 + b3*x3.............bnxn. 

# Spliting the data into traing and testing

In [39]:
# Spliting the dataset into traing and test
x = data.iloc[:,:-1].values
y = data.iloc[:,4].values

In [40]:
x

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [41]:
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

In [42]:
x.shape

(50, 4)

In [43]:
y.shape

(50,)

# Encoding categorical Varible

In [44]:
# Encoding the independent variable.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x = LabelEncoder()

In [45]:
x[:,3] = labelencoder_x.fit_transform(x[:,3])

In [46]:
onehotencoder = OneHotEncoder(categorical_features = [3])
x = onehotencoder.fit_transform(x).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [47]:
x

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

# Avoiding the Dummy Variable trap

In [64]:
x = x[:, 1:]
x

array([[1.0000000e+00, 1.6534920e+05, 1.3689780e+05, 4.7178410e+05],
       [0.0000000e+00, 1.6259770e+05, 1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.5344151e+05, 1.0114555e+05, 4.0793454e+05],
       [1.0000000e+00, 1.4437241e+05, 1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.4210734e+05, 9.1391770e+04, 3.6616842e+05],
       [1.0000000e+00, 1.3187690e+05, 9.9814710e+04, 3.6286136e+05],
       [0.0000000e+00, 1.3461546e+05, 1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.3029813e+05, 1.4553006e+05, 3.2387668e+05],
       [1.0000000e+00, 1.2054252e+05, 1.4871895e+05, 3.1161329e+05],
       [0.0000000e+00, 1.2333488e+05, 1.0867917e+05, 3.0498162e+05],
       [0.0000000e+00, 1.0191308e+05, 1.1059411e+05, 2.2916095e+05],
       [0.0000000e+00, 1.0067196e+05, 9.1790610e+04, 2.4974455e+05],
       [0.0000000e+00, 9.3863750e+04, 1.2732038e+05, 2.4983944e+05],
       [0.0000000e+00, 9.1992390e+04, 1.3549507e+05, 2.5266493e+05],
       [0.0000000e+00, 1.1994324e+

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2,random_state=0)

In [53]:
print(X_train.shape)

(40, 5)

In [55]:
print(y_train.shape)

(40,)

In [56]:
print(X_test.shape)

(10, 5)

In [57]:
print(y_test.shape)

(10,)

# Fitting multiple Linear Regression to the Training set

In [63]:
from sklearn.linear_model import LinearRegression
regressior = LinearRegression()
regressior.fit(X_train, y_train)
y_pred = regressior.predict(X_test)
y_pred

array([103015.20159796, 132582.27760815, 132447.73845175,  71976.09851258,
       178537.48221056, 116161.24230166,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569551])

# Good Job!