# Multiple Linear Regression

## Importing the libraries

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [29]:
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


## Encoding categorical data

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))


## Splitting the dataset into the Training set and Test set

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [32]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [33]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [34]:
y_bar = y_test.mean()
y_bar

122277.407

In [35]:
SST = sum((y_test- y_bar)**2)
SSE = sum((y_test-y_pred)**2)
SSR = sum((y_pred - y_bar)**2)

In [36]:
r2 = 1 - (SSE/SST)
mse = SSE/len(y_test)
y_test = np.array(y_test)
accuracy = 1 -(mse/y_test.var())
print("value R-Squared error is: ",r2)
print("value accuracy is: ",accuracy)

value R-Squared error is:  0.9347068473282545
value accuracy is:  0.9347068473282546


In [37]:
## standadization

In [38]:
columns_ls = dataset.columns[:-2]

In [39]:
columns_ls

Index(['R&D Spend', 'Administration', 'Marketing Spend'], dtype='object')

In [40]:
for column in columns_ls:
  mn = dataset[column].min()
  mx = dataset[column].max()
  dataset[column] = (dataset[column] - mn)/(mx - mn)

X = np.array(dataset.iloc[:, :-1].values)
y = np.array(dataset.iloc[:, -1].values)

In [41]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, 1.0, 0.6517439310268492, 1.0],
       [1.0, 0.0, 0.0, 0.9833594598582878, 0.7619717267693455,
        0.9408933662664767],
       [0.0, 1.0, 0.0, 0.9279845926076449, 0.3795789541636033,
        0.864663603542383],
       [0.0, 0.0, 1.0, 0.8731364288427158, 0.5129983902549908,
        0.8122351304336031],
       [0.0, 1.0, 0.0, 0.8594377233152624, 0.30532803826238897,
        0.776135567095203],
       [0.0, 0.0, 1.0, 0.7975659997145434, 0.36944789841721865,
        0.7691258777055013],
       [1.0, 0.0, 0.0, 0.8141282812375263, 0.7301611069589005,
        0.27071031007615565],
       [0.0, 1.0, 0.0, 0.7880179039269618, 0.7174572453826598,
        0.6864934193416014],
       [0.0, 0.0, 1.0, 0.7290178603827535, 0.7417327573593728,
        0.6604997709757493],
       [1.0, 0.0, 0.0, 0.7459055139063268, 0.43692884159716305,
        0.6464431929774658],
       [0.0, 1.0, 0.0, 0.616350608288398, 0.45150637450193143,
        0.48573266882033544],
       [1.0, 0.0, 0.0, 

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [43]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [44]:
y_pred = regressor.predict(X_test)

In [45]:
y_bar = y_test.mean()
SST = sum((y_test- y_bar)**2)
SSE = sum((y_test-y_pred)**2)
SSR = sum((y_pred - y_bar)**2)

In [46]:

r2 = 1 - (SSE/SST)
mse = SSE/len(y_test)
y_test = np.array(y_test)
accuracy = 1 -(mse/y_test.var())
print("value R-Squared error after standadization is: ",r2)
print("value accuracy after standadization is: ",accuracy)


value R-Squared error after standadization is:  0.9347068473282426
value accuracy after standadization is:  0.9347068473282427
