In [1]:
import numpy as np
import random as Random
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_excel(r"E:\My work\Sem 7\ML\ML-Class\End Sem\End sem Data.xlsx")
scaler = StandardScaler()
data = scaler.fit_transform(df)

In [3]:
np.random.shuffle(data)
Y = data[:,-1]
X = np.delete(data,-1,1)
X = X.astype(np.float64)
Y = Y.astype(np.float64)
#Separating into train/test (80-20)
sep = 0.8*len(X)-1
sep = int(sep)
trainx, testx = X[:sep,:], X[sep:,:]
trainy, testy = Y[:sep], Y[sep:]

In [4]:
# GD from scratch
def GD_Batch(x,y, epochs = 10**4, α = 0.001):
    w0 = np.ones((len(x),1))
    x = np.hstack((w0,x))
    wc = np.array([0]*len(x[0]))
    n = len(x)
    for epoch in range(epochs):
        yp = np.dot(x,wc)

        #Reshaping y to allow for multiplication
        yp = yp.reshape(len(yp),1)
        y = y.reshape(len(y),1)
        ws = (x*(y-yp)).sum(axis=0)
        wd = -(2/n)*ws
        wc = wc-(α*wd)
        #print("m {}, b {}, iteration {}".format(mc,bc,i))
    return(wc)

# GD with L1 regularisation
def GD_BatchL1(x,y, epochs = 10**4, α = 0.001, λ = 0.1):
    w0 = np.ones((len(x),1))
    x = np.hstack((w0,x))
    wc = np.array([0.5]*len(x[0]))
    n = len(x)
    for epoch in range(epochs):
        yp = np.dot(x,wc)

        #Reshaping y to allow for multiplication
        yp = yp.reshape(len(yp),1)
        y = y.reshape(len(y),1)
        ws = (x*(y-yp)).sum(axis=0)
        wd = -(2/n)*ws
        ld = λ*np.sign(wc)
        wc = wc-α*(wd+ld)
        #print("m {}, b {}, iteration {}".format(mc,bc,i))
    return(wc)

# GD with L2 regularisation
def GD_BatchL2(x,y, epochs = 10**4, α = 0.001, λ = 0.1):
    w0 = np.ones((len(x),1))
    x = np.hstack((w0,x))
    wc = np.array([0]*len(x[0]))
    n = len(x)
    for epoch in range(epochs):
        yp = np.dot(x,wc)

        #Reshaping y to allow for multiplication
        yp = yp.reshape(len(yp),1)
        y = y.reshape(len(y),1)
        ws = (x*(y-yp)).sum(axis=0)
        wd = -(2/n)*ws
        ld = 2*λ*wc
        wc = wc-α*(wd+ld)
        #print("m {}, b {}, iteration {}".format(mc,bc,i))
    return(wc)

#SGD only works for 5 feature vector data
def SGD(x,y, iter = 10**4, α = 0.001):
    y = y.tolist()
    wc1 = wc2 = wc3 = wc4 = wc5 = bc = 0
    n = len(x)
    for i in range(iter):
        ya = np.random.choice(y)
        xa1 = x[:,0]
        xa1 = xa1[y.index(ya)]
        xa2 = x[:,1]
        xa2 = xa2[y.index(ya)]
        xa3 = x[:,2]
        xa3 = xa3[y.index(ya)]
        xa4 = x[:,3]
        xa4 = xa4[y.index(ya)]
        xa5 = x[:,4]
        xa5 = xa5[y.index(ya)]

        yp = (wc1*xa1)+(wc2*xa2)+(wc3*xa3)+(wc4*xa4)+(wc5*xa5)+bc

        wd1 = -2*xa1*(ya-yp)
        wd2 = -2*xa2*(ya-yp)
        wd3 = -2*xa3*(ya-yp)
        wd4 = -2*xa4*(ya-yp)
        wd5 = -2*xa5*(ya-yp)
        bd = -2*(ya-yp)
        wc1 = wc1-(α*wd1)
        wc2 = wc2-(α*wd2)
        wc3 = wc3-(α*wd3)
        wc4 = wc4-(α*wd4)
        wc5 = wc5-(α*wd5)
        bc = bc-(α*bd)
    wc = [bc,wc1,wc2,wc3,wc4,wc5]
    return(wc)
    
    
    
# Mini Batch GD (mini batch size 20) only works for 5 feature vector data
def minBatchGD(x,y,iter = 10**4,α = 0.001,bs = 3):
    y = y.tolist()
    wc1 = wc2 = wc3 = wc4 = wc5 = bc = 0
    n = len(x)
    for i in range(iter):
        ya = Random.sample(y,k = bs)
        xa1 = np.zeros(bs)
        xa2 = np.zeros(bs)
        xa3 = np.zeros(bs)
        xa4 = np.zeros(bs)
        xa5 = np.zeros(bs)

        xas1 = x[:,0]
        xas2 = x[:,1]
        xas3 = x[:,2]
        xas4 = x[:,3]
        xas5 = x[:,4]
        for i in range(bs):
            xa1[i] = xas1[y.index(ya[i])]
            xa2[i] = xas2[y.index(ya[i])]
            xa3[i] = xas3[y.index(ya[i])]
            xa4[i] = xas4[y.index(ya[i])]
            xa5[i] = xas5[y.index(ya[i])]
            
        yp = (wc1*xa1)+(wc2*xa2)+(wc3*xa3)+(wc4*xa4)+(wc5*xa5)+bc

        wd1 = -(2/bs)*sum(xa1*(ya-yp))
        wd2 = -(2/bs)*sum(xa2*(ya-yp))
        wd3 = -(2/bs)*sum(xa3*(ya-yp))
        wd4 = -(2/bs)*sum(xa4*(ya-yp))
        wd5 = -(2/bs)*sum(xa5*(ya-yp))
        bd = -(2/bs)*sum(ya-yp)

        wc1 = wc1-(α*wd1)
        wc2 = wc2-(α*wd2)
        wc3 = wc3-(α*wd3)
        wc4 = wc4-(α*wd4)
        wc5 = wc5-(α*wd5)
        bc = bc-(α*bd)
    
    wc = [bc,wc1,wc2,wc3,wc4,wc5]
    return(wc)

# Predicts the values based on weights
def predict(x,w):
    x0 = np.ones((len(x),1))
    x = np.hstack((x0,x))
    pred = np.dot(x,w)
    return(pred)


In [5]:
# Regularisation is adding λ*L
# L1 means L is the L1 norm of w
# L2 means L is the L2 norm of w
# So, the loss function changes.
# The derivative for L1 becomes (sign of w)*λ
# The derivative for L2 becomes 2*λ*w

# When there are very high powers in the fiting function of the model, it has low bias but high variance (overfiting).
# And when it has only lower powers (say linear regression), it has low vaiance but high bias (underfit).
# Regularisation can help decrease variance by making the higher powers have small weights. Thus keeping it low bias and low variance.
# L2 reduces the higher power weights faster than L1

In [6]:
# Applying linear regression using sklearn (for comparison)
model = LinearRegression()
res = model.fit(trainx, trainy)
predy = res.predict(testx)

# Linear Regression using my code (epochs = 10**4, α = 0.001, λ = 0.1)
w1 = GD_Batch(trainx,trainy)
w2 = GD_BatchL1(trainx,trainy)
w3 = GD_BatchL2(trainx,trainy)

predy1 = predict(testx,w1)
predy2 = predict(testx,w2)
predy3 = predict(testx,w3)

In [7]:
# The coefficients
print("Coefficients: \n", model.intercept_, model.coef_)
print(w1)
print(w2)
print(w3)
# The mean squared error
print("Mean squared error (sklearn): %.2f" % mean_squared_error(testy, predy))
print("Mean squared error (Normal): %.2f" % mean_squared_error(testy, predy1))
print("Mean squared error (L1): %.2f" % mean_squared_error(testy, predy2))
print("Mean squared error (L2): %.2f" % mean_squared_error(testy, predy3))
print()
# The coefficient of determination: 1 is perfect prediction
print("R2 score (sklearn): %.2f" % r2_score(testy, predy))
print("R2 score (normal): %.2f" % r2_score(testy, predy1))
print("R2 score (L1): %.2f" % r2_score(testy, predy2))
print("R2 score (L2): %.2f" % r2_score(testy, predy3))

Coefficients: 
 0.004362905400451141 [-0.59726041 -0.3591826  -0.49555989  0.23091207 -0.29541439]
[ 0.00431022 -0.59629056 -0.35187967 -0.49269516  0.23013016 -0.30107666]
[-7.29093744e-05 -5.00738371e-01 -1.93250527e-01 -3.68320927e-01
  1.57074185e-01 -3.19465309e-01]
[ 0.00090742 -0.51767305 -0.257408   -0.40577275  0.1942415  -0.3026978 ]
Mean squared error (sklearn): 0.47
Mean squared error (Normal): 0.47
Mean squared error (L1): 0.47
Mean squared error (L2): 0.47

R2 score (sklearn): 0.43
R2 score (normal): 0.43
R2 score (L1): 0.43
R2 score (L2): 0.44


In [8]:
# Varying the hyperparameters
w4 = GD_BatchL1(trainx,trainy, λ = 1)
w5 = GD_BatchL1(trainx,trainy, λ = 0.98)
w6 = GD_BatchL1(trainx,trainy, λ = 0.5)

w7 = GD_BatchL2(trainx,trainy, λ = 1)
w8 = GD_BatchL2(trainx,trainy, λ = 0.98)
w9 = GD_BatchL2(trainx,trainy, λ = 0.5)

predy4 = predict(testx,w4)
predy5 = predict(testx,w5)
predy6 = predict(testx,w6)

predy7 = predict(testx,w7)
predy8 = predict(testx,w8)
predy9 = predict(testx,w9)

In [9]:
# The mean squared error
print("Mean squared error (L1, 0.1): %.2f" % mean_squared_error(testy, predy2))
print("Mean squared error (L1, 1): %.2f" % mean_squared_error(testy, predy4))
print("Mean squared error (L1, 0.98): %.2f" % mean_squared_error(testy, predy5))
print("Mean squared error (L1, 0.5): %.2f" % mean_squared_error(testy, predy6))
print()
print("Mean squared error (L2, 0.1): %.2f" % mean_squared_error(testy, predy3))
print("Mean squared error (L2, 1): %.2f" % mean_squared_error(testy, predy7))
print("Mean squared error (L2, 0.98): %.2f" % mean_squared_error(testy, predy8))
print("Mean squared error (L2, 0.5): %.2f" % mean_squared_error(testy, predy9))
print()
# The coefficient of determination: 1 is perfect prediction
print("R2 score (L1, 0.1): %.2f" % r2_score(testy, predy2))
print("R2 score (L1, 1): %.2f" % r2_score(testy, predy4))
print("R2 score (L1, 0.98): %.2f" % r2_score(testy, predy5))
print("R2 score (L1, 0.5): %.2f" % r2_score(testy, predy6))
print()
print("R2 score (L2, 0.1): %.2f" % r2_score(testy, predy3))
print("R2 score (L2, 1): %.2f" % r2_score(testy, predy7))
print("R2 score (L2, 0.98): %.2f" % r2_score(testy, predy8))
print("R2 score (L2, 0.5): %.2f" % r2_score(testy, predy9))

Mean squared error (L1, 0.1): 0.47
Mean squared error (L1, 1): 0.84
Mean squared error (L1, 0.98): 0.84
Mean squared error (L1, 0.5): 0.69

Mean squared error (L2, 0.1): 0.47
Mean squared error (L2, 1): 0.57
Mean squared error (L2, 0.98): 0.57
Mean squared error (L2, 0.5): 0.51

R2 score (L1, 0.1): 0.43
R2 score (L1, 1): -0.01
R2 score (L1, 0.98): -0.01
R2 score (L1, 0.5): 0.17

R2 score (L2, 0.1): 0.44
R2 score (L2, 1): 0.31
R2 score (L2, 0.98): 0.31
R2 score (L2, 0.5): 0.38
