## Probabilistic modelling for linear regression

\begin{equation}
\left[\begin{array}{c} y_1 \\ y_2 \\ \vdots \\ y_m \end{array} \right] = \left[\begin{array}{cccc} 
x_{11} & x_{12} & \cdots & x_{1n} \\ 
x_{21} & x_{22} & \cdots & x_{2n} \\ 
\vdots & \vdots & \ddots & \vdots \\ 
x_{m1} & x_{m2} & \cdots & x_{mn}
\end{array} \right ] \; \left[\begin{array}{c} \theta_1 \\ \theta_2 \\ \vdots \\ \theta_n \end{array} \right]
\end{equation}

- **Dataset:** $D = \{y_{i},x_{i}\}_{i=1}^m$
- **Inputs (features):** $x_{i} \in \mathbb{R}^n, \;i = 1, \ldots ,m$
- **Outputs:** $y_{i} \in \mathcal{Y}, \;i = 1, \ldots ,m$
- **Parameters:** $\theta \in \mathbb{R}^n$
- **Hypothesis:** $h_\theta(x) = \theta^Tx$
- **Linear model:** $y_{i} \approx \theta^Tx_{i}$
\begin{equation}
y_{i} = \theta^Tx_{i} + \epsilon_{i}
\end{equation}
\begin{equation}
\epsilon_{i} \sim \mathcal{N}(0 , \sigma^{2})
\end{equation}
$\epsilon_{i} \leftarrow$ independent, identically distributed random variables
\begin{equation}
p(\epsilon_{i}) = \frac{1}{\sqrt{2\pi}\sigma}\; \exp\left(-\frac{\epsilon_{i}^{2}}{2\sigma^{2}}\right)
\end{equation}
\begin{equation}
p(y_{i} - \theta^Tx_{i}) = \frac{1}{\sqrt{2\pi}\sigma} \; \exp\left(-\frac{(y_{i} - \theta^Tx_{i})^2}{2\sigma^{2}}\right)
\end{equation}
\begin{equation}
p(y_{i} \mid x_{i} , \theta) = \frac{1}{\sqrt{2\pi}\sigma} \; \exp\left(-\frac{(y_{i} - \theta^Tx_{i})^2}{2\sigma^{2}}\right)
\end{equation}

- **Maximum Likelihood Estimation (MLE) for $\theta$:**

\begin{equation}
\begin{split}
\theta^* & = \arg\max_\theta \; L(\theta \mid D) \\ &
= \arg\max_\theta \; p(D \mid \theta) \\ &
= \arg\max_\theta \; p(y_1,x_1,y_2,x_2, \ldots ,y_m,x_m \mid \theta) \\ &
= \arg\max_\theta \; p(y_1,x_1 \mid \theta) \; p(y_2,x_2 \mid \theta) \ldots p(y_m,x_m \mid \theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i,x_i \mid \theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \; p(x_i \mid \theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \; p(x_i) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \\ &
= \arg\max_\theta \; \sum_{i=1}^m \log p(y_i \mid x_i,\theta) \\ &
= \arg\max_\theta \; \sum_{i=1}^m \left\{\log \left(\frac{1}{\sqrt{2\pi}\sigma}\right) + \log\left[\exp\left(-\frac{(y_i-\theta^Tx_i)^2}{2\sigma^2}\right)\right]\right\} \\ &
= \arg\max_\theta \; \sum_{i=1}^m -\frac{1}{2\sigma^2}\left(y_i-\theta^Tx_i\right)^2 \\ &
= \arg\min_\theta \; \frac{1}{m}\sum_{i=1}^m \left(y_i-\theta^Tx_i\right)^2
\end{split}
\end{equation}

- **Cost function:** $E(\theta) = \frac{1}{m}\sum_{i=1}^m \left(y_i-\theta^Tx_i\right)^2$ 
- **Loss function:** $\ell(h_\theta(x), y) = (h_\theta(x) - y)^2$

- **Estimation with MAP for $\theta$:**

\begin{equation}
p(\theta) = \frac{1}{(2\pi)^{n/2}r^{n}}\; \exp\left(-\frac{\theta^T\theta}{2r^2}\right)
\end{equation}

\begin{equation}
\begin{split}
\theta^* & = \arg\max_\theta \; p(\theta \mid D) \\ &
= \arg\max_\theta \; p(D \mid \theta) \; p(\theta) \\ &
= \arg\max_\theta \; p(y_1,x_1,y_2,x_2, \ldots ,y_m,x_m \mid \theta) \; p(\theta) \\ &
= \arg\max_\theta \; p(y_1,x_1 \mid \theta) \; p(y_2,x_2 \mid \theta) \ldots p(y_m,x_m \mid \theta) \; p(\theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i,x_i \mid \theta) \; p(\theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \; p(x_i \mid \theta) \; p(\theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \; p(x_i) \; p(\theta) \\ &
= \arg\max_\theta \; \prod_{i=1}^mp(y_i \mid x_i,\theta) \; p(\theta) \\ &
= \arg\max_\theta \; \sum_{i=1}^m \log p(y_i \mid x_i,\theta) + \log p(\theta) \\ &
= \arg\max_\theta \; \sum_{i=1}^m \left\{\log \left(\frac{1}{\sqrt{2\pi}\sigma}\right) + \log\left[\exp\left(-\frac{(y_i-\theta^Tx_i)^2}{2\sigma^2}\right)\right]\right\} + \log \left(\frac{1}{(2\pi)^{n/2}r^{n}}\right) + \log\left[\exp\left(-\frac{\theta^T\theta}{2r^2}\right)\right] \\ &
= \arg\max_\theta \; \sum_{i=1}^m -\frac{1}{2\sigma^2}\left(y_i-\theta^Tx_i\right)^2 - \frac{\theta^T\theta}{2r^2} \\ &
= \arg\min_\theta \; \frac{1}{m}\sum_{i=1}^m \left(y_i-\theta^Tx_i\right)^2 + \lambda\|\theta\|_2^2
\end{split}
\end{equation}

where $\lambda = \frac{\sigma^2}{m r^2}$, obtained by multiplying the negated objective by $\frac{2\sigma^2}{m}$.

- **Cost function:** $E(\theta) = \frac{1}{m}\sum_{i=1}^m \left(y_i-\theta^Tx_i\right)^2 + \lambda\|\theta\|_2^2$
- **Loss function:** $\ell(h_\theta(x), y) = (h_\theta(x) - y)^2 + \lambda \|\theta\|_2^2$