# Linear Regression

$$
x =
\begin{bmatrix}
1 \\
x_1 \\
x_2 \\
\vdots \\
x_d
\end{bmatrix}

w =
\begin{bmatrix}
w_0 \\
w_1 \\
w_2 \\
\vdots \\
w_d
\end{bmatrix}
$$

$$
\hat{y} = w^Tx = x^Tw =
\begin{bmatrix}
w_0 & w_1 & w_2 & \cdots & w_d
\end{bmatrix}
\begin{bmatrix}
1 \\
x_1 \\
x_2 \\
\vdots \\
x_d
\end{bmatrix}
= w_0 + w_1x_1 + w_2x_2 + \cdots + w_dx_d
$$


In [28]:
import numpy as np

x1 = np.array([
    [1], # bias
    [6], # x1
    [2], # x2
    [9], # x3
    [4], # x4
    [5], # x5
])

w = np.array([
    [3], # bias
    [2], # w1
    [3], # w2
    [4], # w3
    [5], # w4
    [6], # w5
])

y1hat = w.T @ x1

print(y1hat)

[[107]]


$$
X =
\begin{bmatrix}
1 & x^1_1 & x^1_2 & ... & x^1_d \\
1 & x^2_1 & x^2_2 & ... & x^2_d \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x^n_1 & x^n_2 & ... & x^n_d \\
\end{bmatrix}
=
\begin{bmatrix}
x^{1 T} \\
x^{2 T} \\
\vdots \\
x^{n T} \\
\end{bmatrix}
$$

$$
\hat{y} = Xw =
\begin{bmatrix}
\hat{y}^1 \\
\hat{y}^2 \\
\vdots \\
\hat{y}^n \\
\end{bmatrix}
$$

In [29]:
X = np.array([
    [1, 6, 2, 9, 4, 5],
    [1, 7, 3, 8, 5, 6],
    [1, 8, 4, 3, 6, 5],
    [1, 9, 5, 6, 7, 4],
    [1, 10, 66, 5, 8, 3],
    [1, 11, 4, 4, 9, 62],
    [1, 12, 8, 12, 10, 1]
])

Yhat = X @ w

print(Yhat)

[[107]
 [119]
 [103]
 [119]
 [299]
 [470]
 [155]]


## Probability Density Function (Normal Distribution)


In [30]:
Y = np.array([
    [100],
    [100],
    [100],
    [100],
    [300],
    [500],
    [100]
])


$$
\mathcal{N}(x; \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
$$

$$
y = w^Tx + \epsilon, \epsilon \sim \mathcal{N}(0, \sigma^2)
$$

$$
\mathcal{N}(y^i | x^i; w, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y^i - w^Tx^i)^2}{2\sigma^2}\right)
$$

In [31]:
from math import sqrt, exp, pi

# probability density function of a normal distribution
def pdf(yi: np.array, xi: np.array, w: np.array, sigma_2: float) -> float:
    return 1 / (sqrt(2 * pi) * sigma_2) * exp(-1 / (2 * sigma_2 ** 2) * (yi - xi @ w) ** 2)

In [32]:
pdf(Y[0], X[0], w, 1)

9.134720408364594e-12

### Maximum likelihood

$$
\mathcal{N}(y | X; w, \sigma^2) = \prod_{i=1}^n \mathcal{N}(y^i | x^i; w, \sigma^2)
$$


In [35]:
def likelihood(y: np.array, x: np.array, w: np.array, sigma_2: float) -> float:
    return np.sum([pdf(y[i], x[i], w, sigma_2) for i in range(len(y))])

In [43]:
likelihood(Y, X, w, 1)

0.24640257294021609

### Negative log-likelihood

$$
-\log \mathcal{N}(y | X; w, \sigma^2) 
= \sum_{i=1}^n \log \mathcal{N}(y^i | x^i; w, \sigma^2) 
= \sum_{i=1}^n \begin{bmatrix}\frac{1}{2}\log{2\pi\sigma^2} + \frac{1}{2\sigma^2}(y^i - w^Tx^i)^2\end{bmatrix}
$$

In [40]:
from math import log

def log_pdf(yi: np.array, xi: np.array, w: np.array, sigma_2: float) -> float:
    return 1 / 2 * log(2 * pi * sigma_2) + 1 / (2 * sigma_2 ** 2) * (yi - xi @ w) ** 2

In [41]:
def log_likelihood(y: np.array, x: np.array, w: np.array, sigma_2: float) -> float:
    return np.sum([log_pdf(y[i], x[i], w, sigma_2) for i in range(len(y))])

In [47]:
log_likelihood(Y, X, w, 1)

2359.432569732433

### Ignoring sigma

$$
\sum_{i=1}^n \frac{1}{2}(y^i - w^Tx^i)^2 = \sum_{i=1}^n (y^i - \hat{y}^i)^2
$$

In [54]:
def squared_error(y: np.array, x: np.array, w: np.array) -> float:
    return ((y - x @ w) ** 2) / 2

In [57]:
squared_error(Y[0], X[0], w)

array([24.5])

In [52]:
def loss(y: np.array, x: np.array, w: np.array) -> float:
    return np.sum(squared_error(y, x, w))

In [58]:
loss(Y, X, w)

2353.0

### Mean Squared Error (MSE)

$$
\mathcal{L}(w) = \frac{1}{2n} \sum_{i=1}^n (y^i - \hat{y}^i)^2
$$

In [59]:
def mse(y: np.array, x: np.array, w: np.array) -> float:
    return loss(y, x, w) / len(y)

In [60]:
mse(Y, X, w)

336.14285714285717