In [1]:
import torch

torch.set_default_dtype(torch.float64)

# Dataset

$$
\begin{align}
&p\left ( y \mid \boldsymbol{x}^T \boldsymbol{\theta} \right )
= \mathcal{N} \left ( y \mid \boldsymbol{x}^T \boldsymbol{\theta}, \sigma^2 \right ) \\
\Leftrightarrow &y = \boldsymbol{x}^T \boldsymbol{\theta} + \epsilon,
\quad \epsilon \sim \mathcal{N}\left ( 0, \sigma^2 \right )
\end{align}
$$
we assume that the noise variance $\sigma^2$ is know and focus on learning the model parameters $\boldsymbol{\theta}$.
'https://mml-book.com.'

In [17]:
M, N = 100, 3
X = torch.randint(-10, 11, (M, N), dtype=torch.float64)

TrueTheta = torch.randint(-8, 9, (N, 1), dtype=torch.float64)
noise_variance = 0.01

Y = X @ TrueTheta + torch.normal(0, noise_variance, (M, 1))

print(X.shape)
print(Y.shape)
print(TrueTheta.shape)

torch.Size([100, 3])
torch.Size([100, 1])
torch.Size([3, 1])


Give a traing set $\left \{ \left ( \boldsymbol{x}_1, y_1 \right ), \cdots ,\left ( \boldsymbol{x}_M, y_M \right ) \right \}$. 
$$
\begin{align}
p(\mathcal Y | \mathcal X, \boldsymbol\theta) &= p\left ( y_1, \cdots, y_N \mid \boldsymbol{x}_1, \cdots, \boldsymbol{x}_N, \boldsymbol{\theta} \right ) \\
&= \prod_{n=1}^N p(y_n | \boldsymbol x_n, \boldsymbol\theta)
= \prod_{n=1}^N \mathcal{N}(y_n | \boldsymbol x_n^T \boldsymbol\theta, \sigma^2)
\end{align}
$$
Where we defined $\mathcal X = \{\boldsymbol x_1, \ldots, \boldsymbol x_N\}$ and $\mathcal Y = \{y_1, \ldots, y_N\}$.

## Maximun likelihood estimation

Problem to solve:
$$
\boldsymbol\theta_{ML} = \arg \max_{\boldsymbol\theta} p(\mathcal X \mid \mathcal Y, \boldsymbol\theta)
$$
To find the optimal parameters, we minimize the negative log-likelihood
$$
-\log p(\mathcal Y \mid \mathcal X, \boldsymbol\theta) =
-\log \prod_{m=1}^{M} p(y_m \mid \boldsymbol x_M, \boldsymbol\theta) =
-\sum_{m=1}^{M} \log  p(y_m \mid \boldsymbol x_M, \boldsymbol\theta)
$$
where we assume that factorizes over the number of data points due to our independence assumption on training set.
$$
\log p(y_m \mid \boldsymbol x_M, \boldsymbol\theta) =
-\frac{1}{2\sigma^2}\left ( y_m - \boldsymbol x_m^T \boldsymbol\theta \right )^2 + \text{const}
$$

we include all probabilities with the sum on each data point, and we define a function to do this:
$$
\begin{align}
\boldsymbol L(\boldsymbol\theta) :&= \frac{1}{2\sigma^2} \sum_{m=1}^{M} \left ( y_m - \boldsymbol x_m^T \boldsymbol\theta \right )^2 \\
&= \frac{1}{2\sigma^2} \left ( \boldsymbol y - \boldsymbol X \boldsymbol\theta \right )^T \left ( \boldsymbol y - \boldsymbol X \boldsymbol\theta \right )
\end{align}
$$
we compude the gradient $L$ with respect to the parameters as:
$$
\begin{align}
\frac{\mathrm{d} \boldsymbol L}{\mathrm{d} \boldsymbol\theta} &= 
-\frac{1}{\sigma^2} \boldsymbol X^T \left ( \boldsymbol y -  \boldsymbol X \boldsymbol\theta \right ) \\
&= -\frac{1}{\sigma^2} (\boldsymbol X^T \boldsymbol y - \boldsymbol X^T \boldsymbol X \boldsymbol\theta)
\end{align}
$$
now we need to solve $\frac{\mathrm{d} \boldsymbol L}{\mathrm{d} \boldsymbol\theta} = \boldsymbol 0$
$$
\begin{align}
\boldsymbol X^T \boldsymbol X \boldsymbol\theta &= \boldsymbol X^T \boldsymbol y \\
\boldsymbol\theta &= (\boldsymbol X^T \boldsymbol X)^{-1} \boldsymbol X^T \boldsymbol y
\end{align}

In [24]:
# maximun likelihood estimation
MLE = torch.inverse(X.T @ X) @ X.T @ Y
print(MLE)

tensor([[ 7.0000],
        [-3.9999],
        [-5.0001]])
