In [41]:
import pandas as pd
import copy, math
import numpy as np
import matplotlib.pyplot as plt
import random as rd

In [42]:
# Read the dataset from "data.csv" (path is relative to the working directory)
# and print the resulting DataFrame.
data = pd.read_csv("data.csv")
print(data)

     X1   X2   X3     Y
0   2.5  7.8  3.2  15.3
1   3.0  6.5  3.9  17.2
2   2.8  5.8  2.8  14.3
3   3.5  6.2  3.5  16.5
4   3.2  7.1  3.1  16.4
..  ...  ...  ...   ...
59  3.1  6.5  3.1  16.7
60  3.7  7.0  3.7  17.8
61  3.3  6.8  3.3  17.1
62  2.6  7.2  2.6  15.6
63  3.4  6.6  3.4  17.0

[64 rows x 4 columns]


In [43]:
# Feature table: every column except the last one (the last column is the target).
x = data.iloc[:, :-1]
# Convert the whole table in one call. The previous row-by-row loop used
# x.loc[i], which is label-based and therefore only worked while the index
# happened to be the default 0..m-1 range. copy=True keeps x_train independent
# of `data`, as the original np.array(...) construction did.
x_train = x.to_numpy(copy=True)
# Target vector taken from the "Y" column.
y_train = np.array(data["Y"])

In [44]:
# Random starting parameters: one weight per feature column plus a bias, each
# drawn uniformly from [0, 1).
# A dedicated, seeded generator makes these values identical on every
# Restart & Run All without altering the global `random` module state.
INIT_SEED = 42
_init_rng = rd.Random(INIT_SEED)
w_init = np.array([_init_rng.random() for _ in range(x_train.shape[1])])
b_init = _init_rng.random()

In [45]:
def compute_cost(x,y,w,b):
    """
    Computes the squared-error cost for linear regression.

    The cost is sum((x @ w + b - y)**2) / (2 * m), where m is the number
    of examples (rows of x).

    Args:
      x (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      cost (scalar): half the mean squared prediction error

    Raises:
      ValueError: if x has no rows, or y does not hold one target per row
    """
    x = np.asarray(x)
    # Flatten y so that a column vector (m,1) cannot silently broadcast the
    # residuals into an (m,m) matrix.
    y = np.asarray(y).reshape(-1)
    m = x.shape[0]
    if m == 0:
        raise ValueError("compute_cost requires at least one training example")
    if y.shape[0] != m:
        raise ValueError(f"x has {m} rows but y has {y.shape[0]} targets")

    # Residual of every example at once instead of a Python-level loop.
    residuals = np.dot(x, w) + b - y
    return np.sum(residuals ** 2) / (2 * m)

In [46]:
# Cost at the randomly initialised (untrained) parameters. This is a baseline,
# not an optimum, so the label says "initial" rather than "optimal".
cost = compute_cost(x_train, y_train, w_init, b_init)
print(f'Cost at initial w,b: {cost}')

Cost at optimal w : 68.36682009165044


In [47]:
# Scratch check: build a 1-D zero vector of length 3 and display it.
np.zeros((3,))

array([0., 0., 0.])

In [48]:
# Spot-check one entry of the feature matrix: row index 1, column index 1.
x_train[1,1]

6.5

In [49]:

def compute_gradient(X, y, w, b): 
    """
    Computes the gradient for linear regression 
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns (in this order):
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b. 
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. 

    Raises:
      ValueError: if X has no rows, or y does not hold one target per row
    """
    X = np.asarray(X)
    # Flatten y so that a column vector (m,1) cannot silently broadcast the
    # error term into an (m,m) matrix.
    y = np.asarray(y).reshape(-1)
    # Unpacking two values also enforces that X is 2-D:
    # (number of examples, number of features)
    m, _ = X.shape
    if m == 0:
        raise ValueError("compute_gradient requires at least one training example")
    if y.shape[0] != m:
        raise ValueError(f"X has {m} rows but y has {y.shape[0]} targets")

    # Prediction error of every example, computed in one vectorised step.
    err = np.dot(X, w) + b - y
    # Column j of X weighted by the errors gives the j-th weight gradient.
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m
    return dj_db, dj_dw
        

In [50]:
# Spot-check one entry of the feature matrix: row index 1, column index 1.
# NOTE(review): this repeats an earlier scratch cell verbatim; candidate for deletion.
x_train[1,1]

6.5

In [51]:
# Evaluate the gradient once at the starting parameters (w_init, b_init) and
# print both components. compute_gradient returns the bias gradient first,
# then the weight gradient.
tmp_dj_db, tmp_dj_dw = compute_gradient(x_train, y_train, w_init, b_init)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

dj_db at initial w,b: -11.67944057976988
dj_dw at initial w,b: 
 [-37.36450925 -81.09135284 -37.78688667]


In [52]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost, called as
                            cost_function(X, y, w, b)
      gradient_function   : function to compute the gradient, called as
                            gradient_function(X, y, w, b) and returning
                            (dj_db, dj_dw)
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      J_history (list) : cost after each of the first 100000 iterations
      """
    
    # Upper bound on stored cost values, to prevent resource exhaustion
    max_history = 100000
    # Print roughly 10 progress lines; never less than 1 so the modulo is safe
    print_every = max(1, math.ceil(num_iters / 10))

    # A list to store cost J at each iteration, primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient at the current parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
      
        # Save cost J at each iteration while the history is below its cap
        cost = None
        if i < max_history:
            cost = cost_function(X, y, w, b)
            J_history.append(cost)

        # Print the cost at 10 evenly spaced iterations (every iteration if
        # num_iters < 10). Past the history cap the cost is computed on
        # demand: reading J_history[-1] there would report the stale value
        # from the last recorded iteration.
        if i % print_every == 0:
            if cost is None:
                cost = cost_function(X, y, w, b)
            print(f"Iteration {i:4d}: Cost {cost:8.2f}   ")
        
    return w, b, J_history #return final w,b and J history for graphing

In [58]:
# Starting point: all-zero weights (same shape as w_init) and a zero bias
initial_w = np.zeros_like(w_init)
initial_b = 0.
# Optimiser settings: number of steps and learning rate
iterations = 10000
alpha = 0.001
# Fit the model with batch gradient descent
w_final, b_final, J_hist = gradient_descent(
    x_train, y_train,
    initial_w, initial_b,
    compute_cost, compute_gradient,
    alpha, iterations,
)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
# Compare the fitted model's prediction with the target for every training row
m,_ = x_train.shape
for i in range(m):
    predicted = np.dot(x_train[i], w_final) + b_final
    print(f"prediction: {predicted:0.2f}, target value: {y_train[i]}")

Iteration    0: Cost   122.64   
Iteration 1000: Cost     0.14   
Iteration 2000: Cost     0.12   
Iteration 3000: Cost     0.10   
Iteration 4000: Cost     0.09   
Iteration 5000: Cost     0.09   
Iteration 6000: Cost     0.09   
Iteration 7000: Cost     0.09   
Iteration 8000: Cost     0.08   
Iteration 9000: Cost     0.08   
b,w found by gradient descent: 0.35,[1.29786357 1.22572121 1.18432955] 
prediction: 16.94, target value: 15.3
prediction: 16.83, target value: 17.2
prediction: 14.41, target value: 14.3
prediction: 16.63, target value: 16.5
prediction: 16.87, target value: 16.4
prediction: 14.64, target value: 15.7
prediction: 19.21, target value: 18.2
prediction: 15.64, target value: 15.8
prediction: 18.11, target value: 17.4
prediction: 17.24, target value: 16.8
prediction: 16.47, target value: 16.1
prediction: 16.61, target value: 17.0
prediction: 17.97, target value: 17.8
prediction: 15.99, target value: 16.3
prediction: 17.49, target value: 17.6
prediction: 18.74, target va