## Linear regression and gradient descent from scratch

source: https://mubaris.com/posts/linear-regression/

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import requests
from IPython.display import display, Math, Latex

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv("../input/headbrain.csv")

In [None]:
# Preview the first rows to see which columns were loaded.
df.head()

\begin{equation}
Y = \beta_0 + \beta_1X
\end{equation}

\begin{equation}
\beta_1 = \frac{\sum_{i=1}^{m} (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{m} (x_i - \bar{x})^2}
\end{equation}

\begin{equation}
\beta_0 = \bar{y} - \beta_1\bar{x}
\end{equation}

In [None]:
# Feature = second-to-last column, target = last column (selected by position).
# NOTE(review): the column meanings are not visible here — confirm against df.head().
X = df.iloc[:,-2].values
Y = df.iloc[:,-1].values
# Sanity check: both should be 1-D with one entry per row of df.
print(X.shape)
print(Y.shape)

In [None]:
def ord_LinReg_fit(X,Y):
    """Fit a simple (one-feature) linear regression by ordinary least squares.

    Computes the closed-form coefficients

        b_1 = sum((x_i - mean(x)) * (y_i - mean(y))) / sum((x_i - mean(x))**2)
        b_0 = mean(y) - b_1 * mean(x)

    Parameters
    ----------
    X : array-like
        Feature values, either 1-D of length m or 2-D of shape (m, 1).
    Y : array-like
        Target values with the same number of elements as X.

    Returns
    -------
    (b_0, b_1) : tuple of float
        Intercept and slope of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in size, or X has zero variance
        (the slope is undefined in that case).
    """
    # Flatten so a column vector of shape (m, 1) is treated like a 1-D array.
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()

    if x.size == 0:
        raise ValueError("X and Y must contain at least one sample")
    if x.size != y.size:
        raise ValueError(
            f"X and Y must have the same number of samples, got {x.size} and {y.size}"
        )

    x_mean = x.mean()
    y_mean = y.mean()
    x_dev = x - x_mean
    y_dev = y - y_mean

    # Dot products replace the explicit Python loop over samples.
    denom = np.dot(x_dev, x_dev)
    if denom == 0:
        raise ValueError("X has zero variance; the slope is undefined")

    # coef.
    b_1 = np.dot(x_dev, y_dev) / denom
    b_0 = y_mean - (b_1 * x_mean)
    return b_0, b_1

In [None]:
# Pad the observed X range by 100 on each side so the drawn line extends past the data.
max_x = np.max(X) + 100
min_x = np.min(X) - 100


# 1000 evenly spaced x values at which to draw the fitted line.
x = np.linspace(min_x, max_x, 1000)
# b is the (intercept, slope) pair returned by the fit.
b = ord_LinReg_fit(X,Y)
y = b[0] + b[1] * x

In [None]:
# Fitted line over the raw observations. Axis labels come from the same
# columns X and Y were read from, so the figure stands on its own.
plt.plot(x, y, color='g', label='Regression Line')
plt.scatter(X, Y, c='b', label='Scatter Plot')
plt.xlabel(df.columns[-2])
plt.ylabel(df.columns[-1])
# Without legend() the label= arguments above are never displayed.
plt.legend()
plt.show()

#### Metrics

###### Root mean square error

\begin{equation}
RMSE = \sqrt{\frac{1}{m} \sum_{i=1}^{m} (\hat{y}_i - y_i)^2}
\end{equation}

In [None]:
# Predictions of the fitted line for every observed X, computed in one
# vectorised expression (b[0] is the intercept, b[1] the slope).
y_pred = b[0] + b[1] * X
def rmse(Y, y_pred):
    """Root mean square error between observed and predicted values.

    RMSE = sqrt(mean((Y - y_pred)**2))

    Parameters
    ----------
    Y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must contain as many elements as Y.

    Returns
    -------
    float
        The root mean square error.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in size.
    """
    # Flatten so lists, 1-D arrays and column vectors are all accepted.
    actual = np.asarray(Y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("Y and y_pred must contain at least one sample")
    if actual.size != predicted.size:
        raise ValueError(
            f"Y and y_pred must have the same size, got {actual.size} and {predicted.size}"
        )

    return np.sqrt(np.mean((actual - predicted) ** 2))
# RMSE of the simple (one-feature) regression on the data it was fitted to.
rmse(Y,y_pred)

In [None]:
# Spot check: observed value vs. fitted-line prediction for the first sample.
Y[0],y_pred[0]

###### $R^2$ coefficient (coefficient of determination)

\begin{equation}
SS_t = \sum_{i=1}^{m} (y_i - \bar{y})^2
\end{equation}

\begin{equation}
SS_r = \sum_{i=1}^{m} (y_i - \hat{y_i})^2
\end{equation}
\begin{equation}
R^2 \equiv 1 - \frac{SS_r}{SS_t}
\end{equation}

In [None]:
def r_2(Y,y_pred):
    """Coefficient of determination (R^2) of a set of predictions.

    R^2 = 1 - SS_r / SS_t, where SS_t is the total sum of squares of Y
    around its mean and SS_r is the sum of squared residuals.

    Parameters
    ----------
    Y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must contain as many elements as Y.

    Returns
    -------
    float
        The R^2 score. NaN is returned when Y is constant (SS_t == 0),
        because the score is undefined in that case.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in size.
    """
    # Flatten so lists, 1-D arrays and column vectors are all accepted.
    actual = np.asarray(Y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("Y and y_pred must contain at least one sample")
    if actual.size != predicted.size:
        raise ValueError(
            f"Y and y_pred must have the same size, got {actual.size} and {predicted.size}"
        )

    ss_t = np.sum((actual - actual.mean()) ** 2)
    ss_r = np.sum((actual - predicted) ** 2)

    # A constant target has no variance to explain; avoid dividing by zero.
    if ss_t == 0:
        return float("nan")
    return 1 - (ss_r / ss_t)
# R^2 of the simple (one-feature) regression on the data it was fitted to.
r_2(Y,y_pred)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# scikit-learn estimators expect a 2-D feature matrix (n_samples, n_features).
# The reshaped view gets its own name instead of rebinding X, so re-running
# earlier cells that expect the 1-D X still gives the same results.
X_2d = X.reshape(-1, 1)
# Creating Model
reg = LinearRegression()
# Fitting training data
reg = reg.fit(X_2d, Y)
# Y Prediction
Y_pred = reg.predict(X_2d)

# Calculating RMSE and R2 Score
mse = mean_squared_error(Y, Y_pred)
r2_score = reg.score(X_2d, Y)

print(np.sqrt(mse))
print(r2_score)

## Multiple regression

\begin{equation}
Y = \beta_0x_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_nx_n,
\end{equation}

\begin{equation}
x_0 = 1
\end{equation}

#### Least squares method

\begin{equation}
\theta = (X^{T}X)^{-1}X^{T}y
\end{equation}

In [None]:
def get_X(X):
    """Return X with a column of ones prepended (the bias term x_0 = 1).

    One row is kept per sample; the result has one more column than X.
    """
    bias_column = np.ones(len(X))
    return np.c_[bias_column, X]

In [None]:
def LSM_fit(X,Y):
    """Fit linear-regression coefficients by the least squares method.

    Returns the theta that minimises ||X @ theta - Y||^2. When X has full
    column rank this equals the normal-equation solution
    (X^T X)^-1 X^T Y. It is computed with np.linalg.lstsq rather than by
    forming and inverting X^T X explicitly, because the explicit inverse
    loses precision when feature scales differ widely and fails when
    X^T X is singular.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, including the bias column if an intercept is wanted.
    Y : array-like, shape (m,)
        Target values.

    Returns
    -------
    theta : numpy.ndarray, shape (n,)
        Fitted coefficients. For a rank-deficient X the minimum-norm
        solution is returned.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    # rcond=None selects NumPy's machine-precision based cut-off.
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, Y, rcond=None)
    return theta

In [None]:
# Design matrix: the first three columns of df with a leading column of ones.
X_ = get_X(df.iloc[:,:3].values)
# Least-squares coefficients for that design matrix.
theta = LSM_fit(X_, Y)
print(X_.shape)

In [None]:
def LSM_grad_predict(X,theta):
    """Predict targets for every row of X from linear coefficients theta.

    Each prediction is the dot product of one row of X with theta. It is
    computed as a single matrix-vector product instead of a Python loop
    over rows.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one sample per row.
    theta : array-like, shape (n,)
        Coefficients, one per column of X.

    Returns
    -------
    list
        The m predictions, in row order.
    """
    predictions = np.dot(np.asarray(X), np.asarray(theta))
    # Callers index the result and pass it to the metric helpers; a list
    # keeps the return type the same as before.
    return list(predictions)

In [None]:
# Predictions of the least-squares model for every row of X_.
y_pred_lsm = LSM_grad_predict(X_, theta)

In [None]:
# Spot check: observed value vs. least-squares prediction for the first sample.
Y[0], y_pred_lsm[0]

In [None]:
# RMSE of the least-squares multiple regression.
rmse(Y,y_pred_lsm)

In [None]:
# R^2 of the least-squares multiple regression.
r_2(Y,y_pred_lsm)

### Gradient descent

\begin{equation}
\beta = \begin{bmatrix}\beta_0 & \beta_1 & \beta_2 & \dots & \beta_n\end{bmatrix}^T
\end{equation}


\begin{equation}
x = \begin{bmatrix}x_0 & x_1 & x_2 & \dots & x_n\end{bmatrix}^T
\end{equation}


\begin{equation}
h_\beta(x) = \beta^Tx
\end{equation}


\begin{equation}
J(\beta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\beta(x^{\textrm{(i)}}) - y^{\textrm{(i)}})^2
\end{equation}

\begin{equation}
\beta_j := \beta_j - \alpha\frac{\partial}{\partial \beta_j} J(\beta)
\end{equation}

\begin{equation}
\beta_j := \beta_j - \alpha\frac{1}{m}\sum_{i=1}^m (h_\beta(x^{(i)})-y^{(i)})x_{j}^{(i)}
\end{equation}

In [None]:
def cost_function(X, Y, B):
    """Half mean squared error J(B) = sum((X.B - Y)^2) / (2m) of coefficients B.

    X is the design matrix, Y the observed targets, and m = len(Y).
    """
    residuals = X.dot(B) - Y
    squared_error_total = np.sum(residuals ** 2)
    return squared_error_total / (2 * len(Y))

In [None]:
# Start from all-zero coefficients, one per column of X_ (bias included), so
# this cell keeps working if the number of features changes.
B = np.zeros(X_.shape[1])
inital_cost = cost_function(X_, Y, B)
print(inital_cost)

In [None]:
def gradient_descent(X, Y, B, alpha, iterations):
    """Run batch gradient descent on the linear-regression cost.

    On every iteration the coefficients take one step of size alpha
    against the gradient X^T (X.B - Y) / m, and the cost after the step
    is recorded.

    Parameters
    ----------
    X : design matrix, one sample per row.
    Y : observed targets.
    B : starting coefficients.
    alpha : learning rate.
    iterations : number of update steps to perform.

    Returns
    -------
    (B, cost_history) : the final coefficients and the list of costs,
    one entry per iteration.
    """
    n_samples = len(Y)
    cost_history = []

    for _ in range(iterations):
        residuals = X.dot(B) - Y
        step_direction = X.T.dot(residuals) / n_samples
        B = B - alpha * step_direction
        cost_history.append(cost_function(X, Y, B))

    return B, cost_history

In [None]:
# Start from all-zero coefficients, one per column of X_ (bias included).
B = np.zeros(X_.shape[1])
# NOTE(review): the very small learning rate is presumably needed because the
# features are not scaled — confirm; larger values may make the updates diverge.
alpha = .0000001
newB, hist = gradient_descent(X_, Y, B, alpha, 1000)

In [None]:
# Coefficients returned by gradient descent.
newB

In [None]:
# Predictions of the gradient-descent model for every row of X_.
y_pred_gd = LSM_grad_predict(X_,newB)

In [None]:
# Spot check: observed value vs. gradient-descent prediction for the first sample.
print(f'{Y[0]} =>{y_pred_gd[0]}')

In [None]:
# RMSE of the gradient-descent model.
rmse(Y,y_pred_gd)

In [None]:
# R^2 of the gradient-descent model.
r_2(Y,y_pred_gd)

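**So, for this data set it is better to use the least squares method, as it gave the best predictions according to both the RMSE and the $R^2$ score.** Note that gradient descent minimises the same cost function, so with scaled features and enough iterations it should converge towards the same least squares solution.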