In [51]:
import numpy as np
import torch

In [52]:
class LinearClassifier(object):
  def __init__(self, device='cpu'):
    self.W = None # classifier weights
    self.dv = device
  def save_weights(self, path):
    torch.save(self.W, path)
  def load_weights(self, path):
    self.W = torch.load(path)
  def sigmoid(self, score):
    prob = torch.zeros_like(score)
    ####################
    # TODO:
    # 1. Implement sigmoid function on `score` and store in variable `prob`
    ####################
    # -----START OF YOUR CODE-----
    prob = 1 / (1 + np.exp(-score))

    # ------END OF YOUR CODE------
    return prob
  def forward(self, x):
    num_data, data_dim = x.shape
    if self.W is None:
      np.random.seed(0)
      self.W = torch.from_numpy(np.random.randn(data_dim)*1e-4).to(self.dv)
    x = x.to(self.dv)
    pred_y = torch.zeros(num_data).to(self.dv)
    ####################
    # TODO:
    # 1. Implement linear classifier f(x) = W * x, and then transform the predicted values to probabilities by sigmoid function
    # 2. Store probabilities in variable `pred_y`
    ####################
    # -----START OF YOUR CODE----- *
    pred_y = torch.matmul(self.W, x.transpose(0,1))
    pred_y = self.sigmoid(pred_y)
    
    # ------END OF YOUR CODE------
    self.cache = (x, pred_y)
    return pred_y
  def backward(self, dL):
    dL = dL.to(self.dv)
    dW = torch.zeros(1).to(self.dv)
    ####################
    # TODO:
    # 1. Derive the gradients of weights, calculate their average and store it in variable `dW`
    #   HINT 1: Chain rule d(loss)/d(weight) = d(loss)/d(pred_y) * d(pred_y)/d(weight)
    #   HINT 2: Use self.cache (batch flatten data x that multiplied weights in forward process, preditcted output that calculated by weights and input)
    ####################
    # -----START OF YOUR CODE-----
    dSigmoid = self.cache[1] * (1 - self.cache[1])
    dz = self.cache[0]
    dw = torch.matmul((dL * dSigmoid).reshape(1, -1), self.cache[0]) / self.cache[1].shape[0]
    # dw = torch.matmul(a, self.cache[0])
    
    # ------END OF YOUR CODE------
    return dW

class BCEloss(object):
  def __init__(self, device='cpu'):
    self.dv = device
  def __call__(self, y, pred_y):
    y = y.to(self.dv)
    pred_y = pred_y.to(self.dv)
    L = torch.zeros_like(y).to(self.dv)
    dL = torch.zeros_like(y).to(self.dv)
    ####################
    # TODO:
    # 1. Implement binary cross-entropy loss and store the results of each data in variable `L`
    # 2. Derive the gradient of binary cross-entropy loss and store the results of each data in variable `dL`
    ####################
    # -----START OF YOUR CODE-----
    L = -(y * np.log(pred_y) + (1-y) * np.log(1 - pred_y))

    dL = (pred_y - y) / pred_y * (1 - pred_y)
    # ------END OF YOUR CODE------
    return L, dL

class Optimizer(object):
  def __init__(self, model, learning_rate):
    self.model = model
    self.lr = learning_rate
  def step(self, dW):
    dW = dW.to(model.dv)
    new_weights = self.model.W.clone()
    ####################
    # TODO:
    # 1. Update model weights by gradient descent and store it in variable `new_weights`
    ####################
    # -----START OF YOUR CODE-----
    new_weights = new_weights - self.lr * dW
    
    # ------END OF YOUR CODE------
    self.model.W = new_weights

In [53]:
def rel_err(a, b):
    return torch.max(torch.abs(a - b) / (torch.maximum(torch.tensor(1e-8), torch.abs(a) + torch.abs(b))))

DEVICE='cpu'
model = LinearClassifier(DEVICE)
# Model_Tests(model)
x = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float64).reshape(3, 2)
y = torch.tensor([0, 1, 1], dtype=torch.float64).to(model.dv)

pred_y = model.forward(x)
L = torch.pow(y - pred_y, 2) # using MSE loss
dL = 2 * pred_y - 2 * y
val = model.backward(dL)
truth = -1.249252247115394
if (rel_err(torch.tensor(truth).to(model.dv), val) < 1e-6).cpu().numpy():
    print(f'Correct')
else:
    print(f'Expected {truth} but got {val.cpu().numpy()}')

Expected -1.249252247115394 but got [0.]
