## Import Modules

In [1]:
import numpy as np
import time
from itertools import accumulate

## Gradient Descent Implementation

In [2]:
class SampleModel:
    """
    Linear model ``y = coef * X + bias`` fitted with (mini-batch) gradient descent.

    Attributes:
    - params: dict holding the current 'coef' and 'bias' values
    - losses: L2 loss on the full training data, one entry per finished epoch
    - times: cumulative training time in seconds, one entry per finished epoch
    """

    def __init__(self, a=0.5, b=0.5):
        """
        Initialize coefficient and bias

        Inputs:
        - a: initial coef
        - b: initial bias
        """
        self.params = {'coef': a, 'bias': b}
        self.losses = []
        self.times = []

    @staticmethod
    def _sample_inputs(n_samples):
        """
        Draw standard-normal inputs and a noisy copy of them.
        @param n_samples: number of samples to draw
        @return: (X, X_modified) where X_modified = X + N(0, 1)/10 noise
        """
        X = np.random.randn(n_samples)
        random_noise = np.random.randn(n_samples) / 10
        return X, X + random_noise

    def data_generation(self, true_a, true_b, n_samples=100000):
        """
        Generate a linear dataset and remember the true parameters.
        @param true_a: true coefficient
        @param true_b: true bias
        @param n_samples: number of samples (defaults to the original 100000)
        @return: (X, y); y is computed from a noisy copy of X, so the
                 returned clean X does not reproduce y exactly
        """
        self.true_a = true_a
        self.true_b = true_b

        X, X_modified = self._sample_inputs(n_samples)
        y = self.true_a*X_modified + self.true_b

        return X, y

    def data_generation_quad(self, true_a, true_b, n_samples=100000):
        """
        Generate a quadratic dataset: y = (true_a * X_noisy + true_b)^2.
        @param true_a: true coefficient
        @param true_b: true bias
        @param n_samples: number of samples (defaults to the original 100000)
        @return: (X, y) with the clean X and the targets built from noisy X
        """
        self.true_a = true_a
        self.true_b = true_b

        X, X_modified = self._sample_inputs(n_samples)
        y = np.power(self.true_a*X_modified + self.true_b, 2)

        return X, y

    def forward(self, X):
        """
        Implement forward pass for the model
        @param X: input data (N,)
        @return: predictions coef * X + bias
        """
        a, b = self.params['coef'], self.params['bias']
        y = a * X + b
        return y

    def backward(self, X, y):
        """
        Computes the gradients for each param in self.params
        @param X: training input data (N,)
        @param y: training output data (N,)
        @return: gradients of parameters
        """
        a, b = self.params['coef'], self.params['bias']

        ## Loss(L2) = avg((y - (a*X + b))^2)
        residual = y - (a*X + b)
        gradients = {}
        gradients['coef'] = np.mean(2*residual*(-X))
        gradients['bias'] = np.mean(2*residual*(-1))

        return gradients

    def run(self, X, y, train=False, n_epochs=10, lr=0.001, batch_size=None):
        """
        Runs the model with training as an option.
        @param X: training input data (N,)
        @param y: training output data (N,)
        @param train: boolean for train; nothing is trained when False
        @param n_epochs: number of training epochs
        @param lr: learning rate
        @param batch_size: mini-batch size; a falsy value (None or 0)
                           selects full-batch gradient descent
        """
        # The banner is printed even when train is False (kept as in the original).
        print("Training Starts...")
        if train:
            if batch_size:
                print("Mini-Batch SGD w Batch Size: {}".format(batch_size))
                self.batch_gradient_descent(X, y, n_epochs=n_epochs, lr=lr, batch_size=batch_size)

            else:
                print("Full-Batch GD")
                self.gradient_descent(X, y, n_epochs=n_epochs, lr=lr)

    def gradient_descent(self, X, y, n_epochs=10, lr=0.001):
        """
        Train using full-batch gradient descent (one update per epoch).
        @param X: training input data (N,)
        @param y: training output data (N,)
        @param n_epochs: number of training epochs
        @param lr: learning rate
        """
        # Continue the running total so that a second training call on the
        # same model does not re-accumulate already-cumulative entries.
        elapsed = self.times[-1] if self.times else 0.0

        for epoch in range(n_epochs):
            start = time.time()
            gradients = self.backward(X, y)
            for param in self.params:
                self.params[param] -= lr * gradients[param]

            current_loss = self.compute_loss(X, y)
            end = time.time()
            # Per-epoch duration, printed as in the original implementation.
            print(end - start)
            elapsed += end - start
            self.losses.append(current_loss)
            self.times.append(elapsed)

            print("========== Epoch {}/{} ==========".format(epoch+1, n_epochs))
            print("Loss > {:.2f}".format(current_loss))
            print("Params > coef: {:.2f} / bias: {:.2f}".format(self.params['coef'], self.params['bias']))

    def batch_gradient_descent(self, X, y, n_epochs=10, lr=0.001, batch_size=16):
        """
        Train using mini-batch stochastic gradient descent.
        @param X: training input data (N,)
        @param y: training output data (N,)
        @param n_epochs: number of training epochs
        @param lr: learning rate
        @param batch_size: number of samples per mini-batch (>= 1); the last
                           batch of an epoch may be smaller
        @raise ValueError: if batch_size is smaller than 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Continue the running total so repeated calls keep times cumulative.
        elapsed = self.times[-1] if self.times else 0.0

        for epoch in range(n_epochs):
            # Prob-(a)
            start = time.time()

            # Pair every input with its target, then reshuffle each epoch.
            data = np.hstack((X.reshape((-1, 1)), y.reshape((-1, 1))))
            np.random.shuffle(data)

            # Stepping by batch_size visits every sample exactly once; the
            # final slice is simply shorter when N is not a multiple of
            # batch_size (no sample is dropped and no batch is empty).
            for offset in range(0, data.shape[0], batch_size):
                batch = data[offset:offset + batch_size, :]
                gradients = self.backward(batch[:, 0], batch[:, 1])

                for param in self.params:
                    self.params[param] -= lr * gradients[param]

            current_loss = self.compute_loss(X, y)
            end = time.time()
            elapsed += end - start
            self.losses.append(current_loss)
            self.times.append(elapsed)

            print("========== Epoch {}/{} ==========".format(epoch+1, n_epochs))
            print("Loss > {:.2f}".format(current_loss))
            print("Params > coef: {:.2f} / bias: {:.2f}".format(self.params['coef'], self.params['bias']))

    def compute_loss(self, X, y):
        """
        Computes the L2 loss of the model given X, y.
        @param X: training input data (N,)
        @param y: training output data (N,)
        @return: loss (mean of squared residuals)
        """
        a, b = self.params['coef'], self.params['bias']
        loss = np.mean(np.power(y - (a*X + b), 2))
        return loss
    

## Test Full-Batch GD

In [3]:
# Fresh model for the full-batch GD experiment, starting from coef=0.5, bias=0.5.
model = SampleModel(a=0.5, b=0.5)

In [4]:
# Synthetic linear dataset with true coefficient 20 and true bias 5.
X, y = model.data_generation(true_a=20, true_b=5)

## For Prob-(c), use the quadratic dataset below instead of the line above.
# X, y = model.data_generation_quad(true_a=20, true_b=5)

In [5]:
# Hyper-parameters for the full-batch run.
n_epochs = 25
lr = 1e-1

# Wall-clock time covers training plus the final loss evaluation.
start_time = time.time()
model.run(X, y, train=True, n_epochs=n_epochs, lr=lr)
loss = model.compute_loss(X, y)
end_time = time.time()

# Time spent to converge: report the first epoch index whose loss differs
# from the previous epoch's loss by less than 1 %.
for i in range(1, n_epochs):
    previous_loss = model.losses[i-1]
    loss_difference_rate = np.abs((previous_loss - model.losses[i]) / previous_loss * 100)

    if not loss_difference_rate < 1:
        continue

    print("Time spent for model loss to be converged : {:.5f}".format(model.times[i]))
    print("Total epoch : {}".format(i))
    break

print("Training Ends...", end="\n\n")
print("Trained with {} epochs, {} learning rate".format(n_epochs, lr))
print("Time Cost: {:.5f} sec".format(end_time-start_time))

Training Starts...
Full-Batch GD
0.006707429885864258
Loss > 260.32
Params > coef: 4.40 / bias: 1.41
0.005101203918457031
Loss > 167.96
Params > coef: 7.52 / bias: 2.14
0.004578590393066406
Loss > 108.88
Params > coef: 10.02 / bias: 2.72
0.00426793098449707
Loss > 71.09
Params > coef: 12.01 / bias: 3.18
0.004174709320068359
Loss > 46.91
Params > coef: 13.61 / bias: 3.55
0.004005908966064453
Loss > 31.45
Params > coef: 14.89 / bias: 3.84
0.003974199295043945
Loss > 21.56
Params > coef: 15.91 / bias: 4.08
0.0038754940032958984
Loss > 15.23
Params > coef: 16.73 / bias: 4.27
0.003906965255737305
Loss > 11.18
Params > coef: 17.38 / bias: 4.41
0.003918647766113281
Loss > 8.59
Params > coef: 17.90 / bias: 4.53
0.00426936149597168
Loss > 6.94
Params > coef: 18.32 / bias: 4.63
0.004709005355834961
Loss > 5.88
Params > coef: 18.66 / bias: 4.70
0.0037767887115478516
Loss > 5.20
Params > coef: 18.92 / bias: 4.76
0.0031785964965820312
Loss > 4.76
Params > coef: 19.14 / bias: 4.81
0.0032985210418701

## Test SGD

In [7]:
# Fresh model for the SGD experiment, starting from coef=0.5, bias=0.5.
model = SampleModel(a=0.5, b=0.5)

In [8]:
# Synthetic linear dataset with true coefficient 20 and true bias 5.
X, y = model.data_generation(true_a=20, true_b=5)

## For Prob-(c), use the quadratic dataset below instead of the line above.
# X, y = model.data_generation_quad(true_a=20, true_b=5)

In [9]:
# Hyper-parameters for the SGD run (batch_size=1: one sample per update).
n_epochs = 30
lr = 1e-1
batch_size = 1

# Wall-clock time covers training plus the final loss evaluation.
start_time = time.time()
model.run(X, y, train=True, n_epochs=n_epochs, lr=lr, batch_size=batch_size)
loss = model.compute_loss(X, y)
end_time = time.time()

# Time spent to converge: report the first epoch index whose loss differs
# from the previous epoch's loss by less than 1 %.
for i in range(1, n_epochs):
    previous_loss = model.losses[i-1]
    loss_difference_rate = np.abs((previous_loss - model.losses[i]) / previous_loss * 100)

    if not loss_difference_rate < 1:
        continue

    print("Time spent for model loss to be converged : {:.5f}".format(model.times[i]))
    print("Total epoch : {}".format(i))
    break

print("Training Ends...", end="\n\n")
print("Trained with {} epochs, {} learning rate".format(n_epochs, lr))
print("Time Cost: {:.5f} sec".format(end_time-start_time))

Training Starts...
Mini-Batch SGD w Batch Size: 1
Loss > 4.87
Params > coef: 19.10 / bias: 5.25
Loss > 5.56
Params > coef: 18.75 / bias: 5.05
Loss > 4.30
Params > coef: 20.20 / bias: 5.51
Loss > 6.19
Params > coef: 21.26 / bias: 4.23
Loss > 4.12
Params > coef: 20.34 / bias: 5.10
Loss > 4.12
Params > coef: 20.07 / bias: 5.35
Loss > 4.82
Params > coef: 19.08 / bias: 4.99
Loss > 4.05
Params > coef: 19.76 / bias: 5.06
Loss > 4.58
Params > coef: 20.52 / bias: 5.55
Loss > 6.33
Params > coef: 20.23 / bias: 6.50
Loss > 4.47
Params > coef: 20.22 / bias: 4.33
Loss > 4.45
Params > coef: 20.40 / bias: 5.54
Loss > 5.61
Params > coef: 18.94 / bias: 5.70
Loss > 4.38
Params > coef: 19.43 / bias: 5.27
Loss > 5.05
Params > coef: 19.11 / bias: 5.51
Loss > 4.72
Params > coef: 20.60 / bias: 5.60
Loss > 4.47
Params > coef: 19.33 / bias: 4.79
Loss > 4.87
Params > coef: 20.84 / bias: 5.39
Loss > 6.38
Params > coef: 21.45 / bias: 5.51
Loss > 4.37
Params > coef: 20.36 / bias: 5.49
Loss > 4.17
Params > coef: 19.

## Test Mini-Batch SGD

In [13]:
# Fresh model for the mini-batch SGD experiment, starting from coef=0.5, bias=0.5.
model = SampleModel(a=0.5, b=0.5)

In [14]:
# Synthetic linear dataset with true coefficient 20 and true bias 5.
X, y = model.data_generation(true_a=20, true_b=5)

## For Prob-(c), use the quadratic dataset below instead of the line above.
# X, y = model.data_generation_quad(true_a=20, true_b=5)

In [15]:
# Hyper-parameters for the mini-batch run (1000 samples per update).
n_epochs = 15
lr = 1e-1
batch_size = 1000

# Wall-clock time covers training plus the final loss evaluation.
start_time = time.time()
model.run(X, y, train=True, n_epochs=n_epochs, lr=lr, batch_size=batch_size)
loss = model.compute_loss(X, y)
end_time = time.time()

# Time spent to converge: report the first epoch index whose loss differs
# from the previous epoch's loss by less than 1 %.
for i in range(1, n_epochs):
    previous_loss = model.losses[i-1]
    loss_difference_rate = np.abs((previous_loss - model.losses[i]) / previous_loss * 100)

    if not loss_difference_rate < 1:
        continue

    print("Time spent for model loss to be converged : {:.5f}".format(model.times[i]))
    print("Total epoch : {}".format(i))
    break

print("Training Ends...", end="\n\n")
print("Trained with {} epochs, {} learning rate".format(n_epochs, lr))
print("Time Cost: {:.5f} sec".format(end_time-start_time))

Training Starts...
Mini-Batch SGD w Batch Size: 1000
Loss > 4.01
Params > coef: 20.04 / bias: 5.04
Loss > 4.01
Params > coef: 20.06 / bias: 5.04
Loss > 4.01
Params > coef: 20.00 / bias: 5.05
Loss > 4.01
Params > coef: 20.03 / bias: 4.98
Loss > 4.01
Params > coef: 20.00 / bias: 5.04
Loss > 4.01
Params > coef: 20.01 / bias: 5.03
Loss > 4.01
Params > coef: 20.01 / bias: 4.99
Loss > 4.01
Params > coef: 20.00 / bias: 5.04
Loss > 4.01
Params > coef: 20.04 / bias: 4.99
Loss > 4.01
Params > coef: 20.05 / bias: 5.03
Loss > 4.01
Params > coef: 20.00 / bias: 4.99
Loss > 4.01
Params > coef: 19.99 / bias: 5.04
Loss > 4.01
Params > coef: 19.98 / bias: 5.02
Loss > 4.01
Params > coef: 19.97 / bias: 5.00
Loss > 4.01
Params > coef: 20.00 / bias: 5.00
Time spent for model loss to be converged : 0.20367
Total epoch : 1
Training Ends...

Trained with 15 epochs, 0.1 learning rate
Time Cost: 1.40664 sec


      Write your answer to (b) in this cell.

      Write your answer to (c) in this cell.