In [None]:
def init_parameters(len_w):
    """Create the starting parameters for the linear model.

    len_w: number of weights to draw (one per input feature).

    Returns (w, b), where w is a row vector of shape (1, len_w) sampled
    from a standard normal distribution (mean 0, variance 1) and b is the
    bias, which starts at 0.
    """
    bias = 0
    weights = np.random.randn(1, len_w)
    return weights, bias

def f_prop(X, w, b):
    """
    Forward pass of the linear model: y_hat = w . X + b.

    X: design matrix, n by m (one column per example)
    w: weights, 1 by n
    b: bias, added to every prediction
    Returns y_hat, the 1 by m row of predictions.
    """
    weighted_sum = np.dot(w, X)
    return weighted_sum + b

def cost_function(y_hat, y):
    """Half mean squared error between predictions and targets.

    y_hat: predictions, same shape as y
    y: targets; the number of examples is read from y.shape[1]
    Returns J = (1 / (2 * m)) * sum((y_hat - y) ** 2).
    """
    n_examples = y.shape[1]
    squared_errors = np.square(y_hat - y)
    return (1/(2*n_examples)) * np.sum(squared_errors)

def b_prop(X, y, y_hat):
    """
    Backward pass: gradients of the half-MSE cost w.r.t. the parameters.

    X: design matrix, n by m
    y: targets, 1 by m
    y_hat: predictions, 1 by m
    Returns (dw, db): dw is the 1 by n gradient for the weights and db is
    the scalar gradient for the bias.
    """
    m = y.shape[1]
    # Gradient of the cost w.r.t. each prediction, already scaled by 1/m.
    dy_hat = (1/m) * (y_hat - y)
    dw = np.dot(dy_hat, X.T) # dy_hat is 1xm, X.T is mxn, so dw is 1xn
    db = np.sum(dy_hat)
    # Return the weight gradient (not dy_hat) so the caller's update of w
    # gets an array with the same shape as w.
    return dw, db

def gradient_descent_update(w, b, dw, db, lrate):
    """
    Take one gradient-descent step.

    w: current weights
    b: current bias
    dw: gradient of the cost w.r.t. w (same shape as w)
    db: gradient of the cost w.r.t. b
    lrate: learning rate (step size)
    Returns the updated (w, b); the inputs are not modified in place.
    """
    # Move each parameter against its gradient, scaled by the learning rate.
    w = w - lrate * dw
    b = b - lrate * db
    return w, b

def LR_model(X_train, y_train, X_val, y_val, lrate, epochs):
    """
    Train a linear regression model with batch gradient descent, printing
    the cost and mean absolute error (MAE) on the training and validation
    sets after every epoch, then plot the training cost.

    X_train: training design matrix, n_features by m_train
    y_train: training targets, 1 by m_train
    X_val: validation design matrix, n_features by m_val
    y_val: validation targets, 1 by m_val
    lrate: learning rate passed to gradient_descent_update
    epochs: number of full passes over the training set
    """
    len_w = X_train.shape[0]
    w, b = init_parameters(len_w)

    costs_train = []
    m_train = y_train.shape[1] # number of training examples
    m_val = y_val.shape[1] # number of validation examples
    for i in range(1, epochs+1): # epochs+1 because we count from 1, not 0.
        y_hat_train = f_prop(X_train, w, b)
        cost_train = cost_function(y_hat_train, y_train)
        dw, db = b_prop(X_train, y_train, y_hat_train)
        w, b = gradient_descent_update(w, b, dw, db, lrate)

        # store every tenth training cost in a list to plot them.
        if i%10==0:
            costs_train.append(cost_train)

        # MAE_train (computed from the pre-update predictions, like cost_train)
        MAE_train  = (1/m_train) * np.sum(np.abs(y_hat_train - y_train))

        # cost_val, MAE_val : cost and MAE for the validation set,
        # evaluated with the parameters just updated in this epoch.
        y_hat_val = f_prop(X_val, w, b)
        cost_val = cost_function(y_hat_val, y_val)
        # Use the validation predictions/targets here, not the training ones.
        MAE_val = (1/m_val) * np.sum(np.abs(y_hat_val - y_val))

        print('Epochs ' + str(i) + '/' + str(epochs) + ': ')
        print('Training cost ' + str(cost_train) + '| '+'Validation cost '+ str(cost_val))
        print('Training MAE ' + str(MAE_train) + '| '+'Validation MAE '+ str(MAE_val))

    plt.plot(costs_train)
    plt.xlabel('Iterations (every tenth one)')
    plt.ylabel('Training cost')
    plt.title('Learning rate ' + str(lrate))
    plt.show()

# Mean-normalize every column: (x - mean) / (max - min). Each column ends up
# centred on 0 with values inside [-1, 1], which helps gradient descent.
# NOTE(review): `df` is defined outside this section — presumably the feature
# DataFrame; a constant column would give max - min == 0 here. TODO confirm.
X = (df - df.mean()) / (df.max() - df.min())
X.describe()

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; random_state=7 fixes the shuffle.
# NOTE(review): `y` is defined outside this section.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=7)

# The model we're using needs X_train to have dimension (n_features x n_training_samples)
X_train = X_train.T
# Similarly, y_train should be 1xm, where m == m_train, the number of training samples
# NOTE(review): assumes y is 2-D (m x 1) so that .T yields 1 x m — TODO confirm.
y_train = y_train.T
# X_val needs dimension (n_features x n_val)
X_val = X_val.T
# y_val needs dimension (1xm), where m == m_val
y_val = y_val.T