# Assignment 4
## Group Members:
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
### Chukwudumebi Ubogu, e-mail: gusuboch@student.gu.se
***

### Task 1: A small linear regression example in PyTorch
***

In [65]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim

data = pd.read_csv('a4_synthetic.csv')

X = data.drop(columns='y').to_numpy()
Y = data.y.to_numpy()

(X, Y)

(array([[ 1.76405235,  0.40015721],
        [ 0.97873798,  2.2408932 ],
        [ 1.86755799, -0.97727788],
        [ 0.95008842, -0.15135721],
        [-0.10321885,  0.4105985 ],
        [ 0.14404357,  1.45427351],
        [ 0.76103773,  0.12167502],
        [ 0.44386323,  0.33367433],
        [ 1.49407907, -0.20515826],
        [ 0.3130677 , -0.85409574],
        [-2.55298982,  0.6536186 ],
        [ 0.8644362 , -0.74216502],
        [ 2.26975462, -1.45436567],
        [ 0.04575852, -0.18718385],
        [ 1.53277921,  1.46935877],
        [ 0.15494743,  0.37816252],
        [-0.88778575, -1.98079647],
        [-0.34791215,  0.15634897],
        [ 1.23029068,  1.20237985],
        [-0.38732682, -0.30230275],
        [-1.04855297, -1.42001794],
        [-1.70627019,  1.9507754 ],
        [-0.50965218, -0.4380743 ],
        [-1.25279536,  0.77749036],
        [-1.61389785, -0.21274028],
        [-0.89546656,  0.3869025 ],
        [-0.51080514, -1.18063218],
        [-0.02818223,  0.428

In [66]:
np.random.seed(1)
torch.manual_seed(1)

w_init = np.random.normal(size=(2, 1))
b_init = np.random.normal(size=(1, 1))

# We just declare the parameter tensors
w = torch.tensor(w_init, dtype=torch.float32, requires_grad=True)
b = torch.tensor(b_init, dtype=torch.float32, requires_grad=True)

eta = 1e-2
opt = optim.SGD([w, b], lr=eta)

for i in range(10):

    sum_err = 0

    for row in range(X.shape[0]):
        x = torch.tensor(X[row, :], dtype=torch.float32).view(1, -1)
        y = torch.tensor(Y[row], dtype=torch.float32).view(1, -1)

        # Forward pass.
        y_pred = x.mm(w) + b
        err = (y_pred - y).pow(2).sum()

        # Backward and update.
        opt.zero_grad()
        err.backward()
        opt.step()

        # For statistics.
        sum_err += err.item()

    mse = sum_err / X.shape[0]
    print(f'Epoch {i+1}: MSE =', mse)

Epoch 1: MSE = 0.7999662647869263
Epoch 2: MSE = 0.017392394159767264
Epoch 3: MSE = 0.009377418162580966
Epoch 4: MSE = 0.009355327616258364
Epoch 5: MSE = 0.009365440349979508
Epoch 6: MSE = 0.009366988411857164
Epoch 7: MSE = 0.009367207068114567
Epoch 8: MSE = 0.009367238481529512
Epoch 9: MSE = 0.009367244712136654
Epoch 10: MSE = 0.009367244620257224


### Task 2: Implementing tensor arithmetics for forward computations
***

In [67]:
class Tensor:

    # Constructor. Just store the input values.
    def __init__(self, data, requires_grad=False, grad_fn=None):
        self.data = data
        self.shape = data.shape
        self.grad_fn = grad_fn
        self.requires_grad = requires_grad
        self.grad = None

    # So that we can print the object or show it in a notebook cell.
    def __repr__(self):
        dstr = repr(self.data)
        if self.requires_grad:
            gstr = ', requires_grad=True'
        elif self.grad_fn is not None:
            gstr = f', grad_fn={self.grad_fn}'
        else:
            gstr = ''
        return f'Tensor({dstr}{gstr})'
    
    # Reshape tensor values
    def reshape(self, *shape):
        new_data = self.data.reshape(shape)
        return Tensor(new_data, self.requires_grad, self.grad_fn)

    # Extract one numerical value from this tensor.
    def item(self):
        return self.data.item()

    # Operator +
    def __add__(self, right):
        # Call the helper function defined below.
        return addition(self, right)

    # Operator -
    def __sub__(self, right):
        return substraction(self, right)

    # Operator @
    def __matmul__(self, right):
        return matmul(self, right)
    
    # Operator **
    def __pow__(self, right):
        # NOTE! We are assuming that right is an integer here, not a Tensor!
        if not isinstance(right, int):
            raise Exception('only integers allowed')
        if right < 2:
            raise Exception('power must be >= 2')
        return power(self, right)

    # Backward computations
    def backward(self, grad_output=None):
        if self.requires_grad:
            # Initialize grad_output with ones if this is the loss tensor
            if grad_output is None:
                grad_output = np.ones_like(self.data)
                
            # Collect the gradient in grad attribute
            if self.grad is None:
                self.grad = grad_output
            else:
                self.grad += grad_output
            
            # If there's a grad_fn call its backward method
            if self.grad_fn is not None:
                # Iterate over the inputs to the operation that produced this tensor
                for input_tensor in self.grad_fn.inputs:
                    if input_tensor.requires_grad:
                        grad_wrt_input = grad_output
                        input_tensor.backward(grad_wrt_input)

In [68]:
# A small utility where we simply create a Tensor object. We use this to
# mimic torch.tensor.
def tensor(data, requires_grad=False):
    return Tensor(data, requires_grad)

# We define helper functions to implement the various arithmetic operations.

# This function takes two tensors as input, and returns a new tensor holding
# the result of an element-wise addition on the two input tensors.
def addition(left, right):
    new_data = left.data + right.data
    grad_fn = None
    return Tensor(new_data, grad_fn=grad_fn)

def substraction(left, right):
    new_data = left.data - right.data
    grad_fn = None
    return Tensor(new_data, grad_fn=grad_fn)

def matmul(left, right):
    new_data = left.data.dot(right.data)
    grad_fn = None
    return Tensor(new_data, grad_fn=grad_fn)

def power(left, exponent):
    new_data = left.data ** exponent
    grad_fn = None
    return Tensor(new_data, grad_fn=grad_fn)

In [69]:
# Sanity Check
# Create some tensors
a = tensor(np.array([1, 2, 3]), requires_grad=True)
b = tensor(np.array([4, 5, 6]), requires_grad=True)
c = tensor(np.array([[1, 2], [3, 4]]), requires_grad=True)
d = tensor(np.array([[2], [3]]), requires_grad=True)

sum_result = a + b
sub_result = b - a
matmul_result = c @ d
power_result = c ** 2

# Print results
print("Addition Result:", sum_result)
print("Subtraction Result:", sub_result)
print("Matrix Multiplication Result:", matmul_result)
print("Power Result:", power_result)

Addition Result: Tensor(array([5, 7, 9]))
Subtraction Result: Tensor(array([3, 3, 3]))
Matrix Multiplication Result: Tensor(array([[ 8],
       [18]]))
Power Result: Tensor(array([[ 1,  4],
       [ 9, 16]]))


### Task 3: Building the computational graph
***

In [70]:
class Node:
    def __init__(self, *inputs):
        self.inputs = inputs

    def backward(self, grad_output):
        raise NotImplementedError('Unimplemented')

    def __repr__(self):
        return f"{self.__class__.__name__}(id={id(self)})"

class AdditionNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right
    
    def backward(self, grad_output):
        # Placeholder for backward logic
        self.inputs[0].grad += 1 * self.inputs.grad_fn
        self.inputs[1].grad += 1 * self.inputs.grad_fn 
        

class SubtractionNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right
    
    def backward(self, grad_output):
        # Placeholder for backward logic
        raise NotImplementedError('backward method for SubtractionNode not implemented')
    
class MatMulNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right

    def backward(self, grad_output):
        # Placeholder for backward logic
        raise NotImplementedError('backward method for MatMulNode not implemented')
    
class PowerNode(Node):
    def __init__(self, base, exponent):
        super().__init__(base)
        self.base = base
        self.exponent = exponent

    def backward(self, grad_output):
        # Placeholder for backward logic
        raise NotImplementedError('backward method for PowerNode not implemented')

In [71]:
# Create the tensor object
def tensor(data, requires_grad=False):
    return Tensor(data, requires_grad)

# Update the helper functions to implement the various arithmetic operations.
def addition(left, right):
    new_data = left.data + right.data
    return Tensor(new_data, requires_grad=True, grad_fn=AdditionNode(left, right))

def substraction(left, right):
    new_data = left.data - right.data
    return Tensor(new_data, requires_grad=True, grad_fn=SubtractionNode(left, right))

def matmul(left, right):
    new_data = left.data.dot(right.data)
    return Tensor(new_data, requires_grad=True, grad_fn=MatMulNode(left, right))

def power(left, exponent):
    new_data = left.data ** exponent
    return Tensor(new_data, requires_grad=True, grad_fn=PowerNode(left, exponent))

In [72]:
x = tensor(np.array([[2.0, 3.0]]))
w1 = tensor(np.array([[1.0, 4.0]]), requires_grad=True)
w2 = tensor(np.array([[3.0, -1.0]]), requires_grad=True)

test_graph = x + w1 + w2

print('Computational graph top node after x + w1 + w2:', test_graph.grad_fn)

assert(isinstance(test_graph.grad_fn, AdditionNode))
assert(test_graph.grad_fn.right is w2)
assert(test_graph.grad_fn.left.grad_fn.left is x)
assert(test_graph.grad_fn.left.grad_fn.right is w1)

Result of addition (c): Tensor(array([4., 4., 4.]), requires_grad=True)
Result of subtraction (d): Tensor(array([-2.,  0.,  2.]), requires_grad=True)
Result of matrix multiplication (e): Tensor(array([0.]), requires_grad=True)
Result of power operation (f): Tensor(array([0.]), requires_grad=True)


### Task 4: Implementing the backward computations
***

In [73]:
class Node:
    def __init__(self, *inputs):
        self.inputs = inputs

    def backward(self, grad_output):
        raise NotImplementedError('Unimplemented')

    def __repr__(self):
        return f"{self.__class__.__name__}(id={id(self)})"

class AdditionNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right
    
    def backward(self, grad_output):
        if self.inputs[0].requires_grad:
            self.inputs[0].backward(grad_output)
        if self.inputs[1].requires_grad:
            self.inputs[1].backward(grad_output)
        

class SubtractionNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right
    
    def backward(self, grad_output):
        if self.inputs[0].requires_grad:
            self.inputs[0].backward(grad_output)
        if self.inputs[1].requires_grad:
            self.inputs[1].backward(grad_output)
    
class MatMulNode(Node):
    def __init__(self, left, right):
        super().__init__(left, right)
        self.left = left
        self.right = right

    def backward(self, grad_output):
        if self.inputs[0].requires_grad:
            grad_wrt_left = grad_output * self.inputs[1].data
            self.inputs[0].backward(grad_wrt_left)
        if self.inputs[1].requires_grad:
            grad_wrt_right = grad_output * self.inputs[0].data
            self.inputs[1].backward(grad_wrt_right)
    
class PowerNode(Node):
    def __init__(self, base, exponent):
        super().__init__(base)
        self.base = base
        self.exponent = exponent

    def backward(self, grad_output):
        base = self.inputs[0]
        if base.requires_grad:
            grad_wrt_base = grad_output * self.exponent * base.data ** (self.exponent - 1)
            base.backward(grad_wrt_base)

In [74]:
# Create input tensors
a = tensor(np.array([2.0]), requires_grad=True)
b = tensor(np.array([3.0]), requires_grad=True)

# Perform operations
c = a + b
d = c - b
e = c @ d
f = e ** 2

# Initiate backwards propogation
f.backward()

# Print the results
print("Gradient of a:", a.grad)
print("Gradient of b:", b.grad)

Gradient of a: 16.0
Gradient of b: 16.0


### Task 5: Optimizers to update the model parameters
***

In [None]:
class SGDOptimizer:
    def __init__(self, parameters, lr=0.01):
        self.parameters = parameters
        self.lr = lr

    def step(self):
        # Update each parameter
        for param in self.parameters:
            if param.grad is not None:
                param.data -= self.lr * param.grad

    def zero_grad(self):
        # Reset gradients for each parameter
        for param in self.parameters:
            if param.grad is not None:
                param.grad = np.zeros_like(param.grad)

### Questions and answers

- Q1: Do we have to implement gradient accumulation?
- Q2: Am I allowed to implement <some function>?
- Q3: In Task 5, what do you mean by "identical"? 