In [1]:
import sys
# Print the path of the Python interpreter executing this code.
print(sys.executable)

/usr/local/bin/python3


In [2]:
#imports

import numpy as np


In [None]:
#task 1


In [3]:
#useful functions
def l2_loss(y, yh):
    """Return the element-wise squared error 0.5 * (yh - y)**2.

    No reduction (sum or mean) is applied; the result has the broadcast
    shape of `y` and `yh`.
    """
    residual = yh - y
    return 0.5 * residual**2

def l2_loss_grad(y, yh):
    """Return yh - y, the derivative of 0.5 * (yh - y)**2 with respect to yh."""
    grad = yh - y
    return grad

def cross_entropy(y, yh):
    """Return the summed cross-entropy -sum(y * log(yh + 1e-12)).

    The small constant added to `yh` keeps the logarithm finite when an
    entry of `yh` is exactly zero. The sum runs over every element.
    """
    eps = 1e-12
    log_probs = np.log(yh + eps)
    return -np.sum(y * log_probs)

# NOTE: yh - y is the gradient dL/dz only, where L = loss(softmax(z));
# it is not the gradient with respect to yh itself.
def cross_entropy_grad(y, yh):
    """Return yh - y (see the note above on when this is the true gradient)."""
    grad = yh - y
    return grad

def relu(x):
    """Return the element-wise maximum of 0 and x (rectified linear unit)."""
    floor = 0
    return np.maximum(floor, x)
   
def relu_grad(x):
    """Return 1.0 where x > 0 and 0.0 elsewhere (including at x == 0)."""
    positive = x > 0
    return positive.astype(float)

def leaky_relu(x, alpha=0.1):
    """Return the element-wise maximum of alpha * x and x.

    For alpha <= 1 this is the leaky ReLU: x for positive inputs and
    alpha * x for negative inputs. For alpha > 1 the maximum selects
    alpha * x on the positive side instead.
    """
    scaled = alpha * x
    return np.maximum(scaled, x)

def leaky_relu_grad(x, alpha=0.1):
    """Return the leaky-ReLU derivative: alpha where x < 0, 1 elsewhere.

    The result is always floating point. Floating inputs keep their own
    dtype; integer or boolean inputs yield float64, so a fractional
    `alpha` is not truncated to 0 when written into the result.
    """
    x = np.asarray(x)
    # np.ones_like(x) would inherit an integer dtype and silently truncate
    # alpha on assignment, so pick a floating dtype explicitly.
    dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    grad = np.ones(x.shape, dtype=dtype)
    grad[x < 0] = alpha
    return grad

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    Evaluated in a numerically stable piecewise form: only exp(-|x|) is
    ever computed, which lies in (0, 1], so large-magnitude inputs cannot
    overflow. Scalar input yields a NumPy scalar, array input an array.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays unchanged.
    return out[()]

def sigmoid_grad(x):
    """Return the sigmoid derivative s * (1 - s), where s = sigmoid(x)."""
    sig = sigmoid(x)
    complement = 1 - sig
    return sig * complement
  
def tanh(x):
    """Return the element-wise hyperbolic tangent of x (wraps np.tanh)."""
    result = np.tanh(x)
    return result

def tanh_grad(x):
    """Return the tanh derivative 1 - tanh(x)**2, element-wise."""
    th = np.tanh(x)
    squared = th * th
    return 1 - squared

def linear(x):
    """Identity activation: return x unchanged (the same object, not a copy)."""
    return x

def linear_grad(x):
    """Return the identity activation's derivative: ones shaped like x."""
    ones = np.ones_like(x)
    return ones

def softmax(x, axis=None):
    """Return the softmax of x, normalised along `axis`.

    Args:
        x: input scores.
        axis: axis (or axes) to normalise over. The default None
            normalises over every element of `x`, which is the right
            choice for a single 1-D score vector. Pass e.g. axis=-1 for
            batched input so that each row sums to 1 independently.

    The maximum along `axis` is subtracted before exponentiating so that
    large scores do not overflow; this does not change the result.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [5]:
#task 2 implement LSTM

class Cell:
    """A single LSTM cell with a forward pass and a one-step backward pass.

    Every gate applies its weight matrix to the concatenation
    [h_prev, x] (hidden state first, then the input), so each weight
    matrix has shape (hidden_size, hidden_size + input_size) and each
    bias has length hidden_size. The matrix products and `np.outer` calls
    used here work on 1-D (unbatched) vectors.

    `forward` stores its intermediates in `self.cache`, and `backward`
    reads them and assigns `self.dW_*` / `self.db_*`. Both are overwritten
    on every call (nothing is accumulated), so `backward` always refers to
    the most recent `forward`.
    """

    def __init__(self, input_size, hidden_size, b_f=None, b_i=None, b_o=None, b_c=None):
        """Create small random weights and the gate biases.

        Args:
            input_size: length of the input vector x.
            hidden_size: length of the hidden and cell state vectors.
            b_f, b_i, b_o, b_c: optional biases for the forget, input,
                output and candidate gates. When omitted, the forget bias
                defaults to ones and the other three to zeros.
        """
        # Random weight matrices for now (draw order kept: f, i, c, o).
        concat_size = input_size + hidden_size
        self.W_f = np.random.randn(hidden_size, concat_size) * 0.01
        self.W_i = np.random.randn(hidden_size, concat_size) * 0.01
        self.W_c = np.random.randn(hidden_size, concat_size) * 0.01
        self.W_o = np.random.randn(hidden_size, concat_size) * 0.01
        # Initialize biases.
        self.b_f = b_f if b_f is not None else np.ones(hidden_size)
        self.b_i = b_i if b_i is not None else np.zeros(hidden_size)
        self.b_o = b_o if b_o is not None else np.zeros(hidden_size)
        self.b_c = b_c if b_c is not None else np.zeros(hidden_size)
        # Filled in by forward(); backward() refuses to run while it is None.
        self.cache = None

    @staticmethod
    def _concat(h_prev, x):
        """Return [h_prev, x] joined along the last axis."""
        return np.concatenate([h_prev, x], axis=-1)

    @staticmethod
    def _gate(W, b, concat, activation):
        """Return activation(W . concat + b) for one gate."""
        return activation(np.dot(W, concat) + b)

    # forget gate
    def get_f_t(self, h_prev, x):
        """Return the forget gate f_t = sigmoid(W_f . [h_prev, x] + b_f)."""
        return self._gate(self.W_f, self.b_f, self._concat(h_prev, x), sigmoid)

    # input gate
    def get_i_t(self, h_prev, x):
        """Return the input gate i_t = sigmoid(W_i . [h_prev, x] + b_i)."""
        return self._gate(self.W_i, self.b_i, self._concat(h_prev, x), sigmoid)

    def get_c_hat(self, h_prev, x):
        """Return the candidate state c_hat = tanh(W_c . [h_prev, x] + b_c)."""
        return self._gate(self.W_c, self.b_c, self._concat(h_prev, x), tanh)

    def get_c_t(self, c_prev, h_prev, x):
        """Return the new cell state c_t = f_t * c_prev + i_t * c_hat."""
        c_hat = self.get_c_hat(h_prev, x)
        i_t = self.get_i_t(h_prev, x)
        forget = self.get_f_t(h_prev, x)
        return forget * c_prev + i_t * c_hat

    # output gate
    def get_o_t(self, h_prev, x):
        """Return the output gate o_t = sigmoid(W_o . [h_prev, x] + b_o)."""
        return self._gate(self.W_o, self.b_o, self._concat(h_prev, x), sigmoid)

    def get_h(self, o_t, c_t):
        """Return the hidden state h_t = o_t * tanh(c_t)."""
        return o_t * tanh(c_t)

    def forward(self, h_prev, c_prev, x): # <<<<<<<<<<<<<<<---------------------------- use this!
        """Compute a forward pass of the entire cell.

        Args:
            h_prev: previous hidden state h_{t-1}.
            c_prev: previous cell state c_{t-1}.
            x: current input x_t.

        Returns:
            (h_t, c_t), the new hidden and cell states.

        Side effect: stores the inputs and all gate activations in
        `self.cache` for a later call to `backward`.
        """
        # Build the concatenated input once and evaluate each gate once;
        # every intermediate is needed again in the cache below.
        concat = self._concat(h_prev, x)
        f_t = self._gate(self.W_f, self.b_f, concat, sigmoid)
        i_t = self._gate(self.W_i, self.b_i, concat, sigmoid)
        c_hat = self._gate(self.W_c, self.b_c, concat, tanh)
        o_t = self._gate(self.W_o, self.b_o, concat, sigmoid)
        c_t = f_t * c_prev + i_t * c_hat
        h_t = self.get_h(o_t, c_t)
        self.cache = {
            'h_prev': h_prev, 'c_prev': c_prev, 'x': x,
            'f_t': f_t, 'i_t': i_t, 'c_hat': c_hat,
            'c_t': c_t, 'o_t': o_t, 'concat': concat
        }
        return h_t, c_t

    def backward(self, dh_next, dc_next):
        """Backpropagate through the most recent `forward` call.

        Args:
            dh_next: gradient of loss w.r.t h_t (from output or next timestep)
            dc_next: gradient of loss w.r.t C_t (from next timestep)

        Returns:
            (dx, dh_prev, dc_prev)

        Side effect: assigns dW_o/dW_i/dW_f/dW_c and db_o/db_i/db_f/db_c
        on the instance for the optimizer (overwriting earlier values).

        Raises:
            RuntimeError: if `forward` has not been called yet.
        """
        if self.cache is None:
            raise RuntimeError("Cell.backward() called before Cell.forward()")

        # retrieve cached values
        cache = self.cache
        f_t = cache['f_t']
        i_t = cache['i_t']
        c_hat = cache['c_hat']
        o_t = cache['o_t']
        c_prev = cache['c_prev']
        concat = cache['concat']
        c_t = cache['c_t']

        tanh_c = np.tanh(c_t)

        # derivative of loss w.r.t c_t (total): the path through
        # h_t = o_t * tanh(c_t) plus the gradient arriving from the next step
        dc_t = dh_next * o_t * (1 - tanh_c**2) + dc_next

        # derivatives w.r.t the gate activations
        do = dh_next * tanh_c
        di = dc_t * c_hat
        df = dc_t * c_prev
        dc_hat = dc_t * i_t

        # apply activation derivatives (sigmoid: s * (1 - s); tanh: 1 - t**2)
        dZ_o = do * o_t * (1 - o_t)
        dZ_i = di * i_t * (1 - i_t)
        dZ_f = df * f_t * (1 - f_t)
        dZ_c = dc_hat * (1 - c_hat**2)

        # gradients w.r.t weights and biases
        self.dW_o = np.outer(dZ_o, concat)
        self.dW_i = np.outer(dZ_i, concat)
        self.dW_f = np.outer(dZ_f, concat)
        self.dW_c = np.outer(dZ_c, concat)

        self.db_o = dZ_o
        self.db_i = dZ_i
        self.db_f = dZ_f
        self.db_c = dZ_c

        # gradient w.r.t concat
        dconcat = (self.W_o.T @ dZ_o +
                   self.W_i.T @ dZ_i +
                   self.W_f.T @ dZ_f +
                   self.W_c.T @ dZ_c)

        # split into dh_prev and dx; concat is [h_prev, x], so the first
        # hidden_size entries belong to h_prev
        hidden_size = cache['h_prev'].shape[0]
        dx = dconcat[hidden_size:]
        dh_prev = dconcat[:hidden_size]
        dc_prev = dc_t * f_t

        return dx, dh_prev, dc_prev


class LSTM:

    def __init__(self, cell_count): #initialize a network with #cell_count cells
        for i in range(cell_count)
    def fit(self, X, Y, batch_size, lr, epochs):

    
    

    