# Multiple linear regression from scratch using python

In [1]:
# Importing libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Loading the data

data = pd.read_csv("/content/drive/MyDrive/#100DaysofMLcode/50_Startups.csv")

In [3]:
data.head()

# The data has 4 features - 1 is categorical and the other 3 are discrete

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
# Separating the values

X = data.iloc[:,:4].values
y = data.iloc[:,-1].values

In [12]:
# Handling categorical variables 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# creating one hot encoder object with categorical feature 0
# indicating the first column
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [13]:
# Scaling the features in order to get a uniformly distributed contour. 
# If the contour is narrow and tall, it will take a lot of time to converge to the minimum.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[:, 3:] = sc.fit_transform(X[:, 3:])

In [18]:
# The cost function used to measure the error. 

def cost_function(X, Y, B):
 m = len(Y)
 J = np.sum((X.dot(B) - Y) ** 2)/(2 * m)
 return J

In [20]:
# This is the gradient descent.

def batch_gradient_descent(X, Y, B, alpha, iterations):
 cost_history = [0] * iterations
 m = len(Y)
 
 for iteration in range(iterations):
 # print(iteration)
 # Hypothesis Values
  h = X.dot(B)
 # Difference b/w Hypothesis and Actual Y
  loss = h - Y
 # Gradient Calculation
  gradient = X.T.dot(loss) / m
 # Changing Values of B using Gradient
  B = B - alpha * gradient
 # New Cost Value
  cost = cost_function(X, Y, B)
  cost_history[iteration] = cost
 
 return B, cost_history

In [21]:
# Splitting the dataset into training and testing sets 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [25]:
# Initial Coefficients
B = np.zeros(X_train.shape[1])

alpha = 0.005
iter_ = 2000

newB, cost_history = batch_gradient_descent(X_train, y_train, B, alpha, iter_)

In [26]:
# We can see that the cost has reduced with each iteration.
print(cost_history)

[7219721313.798002, 7183363437.6618395, 7147314872.385083, 7111571618.201815, 7076129736.859079, 7040985350.61951, 7006134641.280393, 6971573849.208883, 6937299272.39313, 6903307265.50902, 6869594239.002291, 6836156658.1858, 6802991042.351602, 6770093963.897703, 6737462047.469155, 6705091969.113309, 6672980455.4489765, 6641124282.849259, 6609520276.637823, 6578165310.29843, 6547056304.697428, 6516190227.319071, 6485564091.513415, 6455174955.756569, 6425019922.923127, 6395096139.570566, 6365400795.235395, 6335931121.740899, 6306684392.516256, 6277657921.926832, 6248849064.615507, 6220255214.854805, 6191873805.909681, 6163702309.41078, 6135738234.737983, 6107979128.414084, 6080422573.508419, 6053066189.050308, 6025907629.452089, 5998944583.941675, 5972174776.004383, 5945595962.83394, 5919205934.792509, 5893002514.87956, 5866983558.209451, 5841146951.497605, 5815490612.555078, 5790012489.791449, 5764710561.725836, 5739582836.505949, 5714627351.435005, 5689842172.506426, 5665225393.946119,

In [27]:
# Prediction

y_ = X_test.dot(newB)

In [31]:
# Predicted values.

print(y_)

[103794.28860632662 77589.75357515643 72597.26645687448 55631.807362421634
 176404.42178623352 162238.61497449587 34787.957536775735
 93621.60720350704 52041.053630459224 89560.7908963214]


In [32]:
# Actual values

print(y_test)

[105008.31  96479.51  78239.91  81229.06 191050.39 182901.99  35673.41
 101004.64  49490.75  97483.56]


In [28]:
# Calculating r2 score

def r2(y_,y):
 sst = np.sum((y-y.mean())**2)
 ssr = np.sum((y_-y)**2)
 r2 = 1-(ssr/sst)
 return(r2)

In [30]:
r2(y_,y_test)

# r2 score is 0.92 which is good. 

0.9201611139804405