In [14]:
"""
    Multivariate Linear Regression
    ---with lasso and ridge regularization
"""

import matplotlib.pyplot as plt
import numpy as np
import math

"""
    Squared Error
        x      : training data
        y      : outputs
        th     : theta 
        lb     : lambda value for regularization
        reg    : regularization function

"""
def cost(x, y, th, lb, reg):
    diff = x @ th - y
    t = (1 / (2 * x.shape[0])) * diff.transpose() @ diff + (0 if reg == None else reg(lb, th))
    ans = np.squeeze(t)
    return ans

"""
    Gradient Descent
        x      : training data
        y      : outputs
        th     : theta
        lb     : lambda value for regularization
        reg    : regularization function
        regder : regularization function derivative
"""
def gr_dsc(x, y, th, lb = 0, reg = None, regder = None):
    iters = 10000
    alpha = 0.001
    while(iters != 0) :
        th = th - alpha*(x.transpose() @ (x @ th - y) + (0 if regder == None else regder(lb, th)))
        iters -= 1
    print("Cost : ", cost(x,y,th,lb, reg))
    
"""
    Lasso regularization
        th : theta
        lb : lambda value for regularization
"""
def lasso_reg(lb, th):
    a = 0;
    for i in th:
        a = a + abs(i)
    return a * lb

"""
    Lasso regularization derivative
        th : theta
        lb : lambda value for regularization
"""
def lasso_regder(lb, th):
    return lb * th / abs(th)

"""
    Ridge regularization
        th : theta
        lb : lambda value for regularization
"""
def ridge_reg(lb, th):
    return (lb / 2) * np.squeeze(th.transpose() @ th)

"""
    Ridge regularization derivative
        th : theta
        lb : lambda value for regularization
"""
def ridge_regder(lb, th):
    return lb * th

"""
    Plots Cost vs theta(i) for all theta
        x      : training data
        y      : outputs
        th     : theta
"""
def plotter(x, y, th):
    ti = np.linspace(-10, 10, num=1000)
    for i in range(th.shape[0]):
        th1 = np.copy(th)
        jv = []
        for j in ti:
            th1[i][0] = j
            jv.append(cost(x, y, th1))
        print(min(jv))
        plt.plot(ti, jv)
        plt.show()
        
"""
    Error calculation
        x      : testing data
        y      : outputs
        th     : theta 
"""

def error_calculations(x, y, th):
    diff = x @ th - y
    sum_of_squared_errors = diff.transpose() @ diff
    Eabs = sum(np.absolute(diff)) / x.shape[0]
    Erms = math.sqrt(sum_of_squared_errors[0][0]) / x.shape[0]
    print("Eabs : {}".format(Eabs[0]))
    print("Erms : {}".format(Erms))
    
    
#file name for data
f_name = "real_estate.csv"
data = np.genfromtxt(f_name, delimiter=",", skip_header=1)

rows = data.shape[0]
columns = data.shape[1]

th = np.ones((columns, 1))
x = np.ones((rows, columns))
y = np.zeros((rows, 1))

#min and max for all columns
mins = data.min(axis=0)
maxs = data.max(axis=0)

#normalize
for i in range(columns):
    if i == columns - 1:
        for j in range(rows):
            y[j][0] = (data[j][i] - mins[i])/(maxs[i] - mins[i])
    else :
        for j in range(rows):
            x[j][i + 1] = (data[j][i] - mins[i])/(maxs[i] - mins[i])

#splitting into learning and testing sets
x_learning = x[:int(0.8*rows) + 1,:]
x_testing = x[int(0.8*rows) + 1:,:]
y_learning = y[:int(0.8*rows) + 1,:]
y_testing = y[int(0.8*rows) + 1:,:]
th_copy = th.copy()

gr_dsc(x_learning, y_learning, th_copy)

error_calculations(x_testing, y_testing, th_copy)

Cost :  0.0033736656099699015
Eabs : 3.285329010719529
Erms : 0.36834115812844703
