In [1]:
import numpy as np
from sklearn.model_selection import LeaveOneOut, KFold

# 6.S077 Problem Set 4

## Problem 4-3

Parameters for generated data.

In [2]:
# Seed the global NumPy RNG that every generator below draws from, so that
# a fresh "Restart Kernel -> Run All" yields the same numbers each time.
seed = 0
np.random.seed(seed)

num_trials = 1000  # Monte Carlo repetitions behind each reported mean / std
n = 100            # samples per generated data set
p = 50             # features per sample

Latent $\beta$ parameter.

In [3]:
# True coefficients: p evenly spaced values from 0.01 to 0.50 as a (p, 1) column.
beta = np.linspace(0.01, 0.50, p)[:, np.newaxis]

Covariate matrix $\mathbb{X}$.

In [4]:
def gen_XX(n, p):
    r"""Draw a random covariate matrix with i.i.d. standard normal entries.

    Args:
        n (int): Number of samples (rows) to generate.
        p (int): Number of features (columns) per sample.

    Returns:
        ndarray: An $n$ by $p$ matrix whose entries are independent draws
            from the standard normal distribution.

    """
    matrix_shape = (n, p)
    return np.random.randn(*matrix_shape)

Observations $y$ using the covariate matrix $\mathbb{X}$ and latent $\beta$.

In [5]:
def gen_y(XX, beta):
    r"""Generate noisy linear targets $y = \mathbb{X} \beta + \epsilon$.

    The noise $\epsilon$ is Gaussian with variance
    $\lVert \beta \rVert_2^2 / 2$.

    Args:
        XX (ndarray): An $n$ by $p$ covariate matrix of $x$ values.
        beta (ndarray): A $p$ by $1$ vector of $\beta$ parameters.

    Returns:
        ndarray: An $n$ by $1$ vector of noisy targets.

    """
    num_samples, _ = XX.shape

    signal = XX @ beta
    # Standard deviation of the noise, kept as a 1 by 1 array so it broadcasts.
    noise_scale = np.sqrt(beta.T @ beta / 2)

    return signal + noise_scale * np.random.randn(num_samples, 1)

### Part A

Learn the $\hat{\beta}$ associated with the generated data using linear regression.

$$ \hat{\beta} = \left(\mathbb{X}^T \mathbb{X}\right)^{-1} \mathbb{X}^T y $$

In [6]:
def fit_beta_hat(XX, y):
    r"""Fit ordinary least squares by solving the normal equations
    $\mathbb{X}^T \mathbb{X} \hat{\beta} = \mathbb{X}^T y$.

    Args:
        XX (ndarray): An $n$ by $p$ matrix of observations.
        y (ndarray): An $n$ by $1$ vector of target values.

    Returns:
        ndarray: A $p$ by $1$ vector representing $\hat{\beta}$.

    """
    gram = XX.T @ XX
    moment = XX.T @ y

    return np.linalg.solve(gram, moment)

Training error for the given $\hat{\beta}$.

$$ \frac{1}{n}\sum_{i = 1}^n\left(y_i - x_i^T \hat{\beta}\right)^2 = \frac{1}{n} \left(y - \mathbb{X} \hat{\beta}\right)^T \left(y - \mathbb{X} \hat{\beta}\right) $$

In [7]:
def get_train_err(beta_hat, XX, y):
    r"""Computes the training error (mean squared residual) for the
    specified $\hat{\beta}$ on the provided data.

    Args:
        beta_hat (ndarray): A $p$ by $1$ vector of a $\hat{\beta}$ to
            compute the error.
        XX (ndarray): An $n$ by $p$ matrix of observations.
        y (ndarray): An $n$ by $1$ vector of target values.

    Returns:
        float: The error associated with the $\hat{\beta}$ for the
            provided data.

    """
    n, _ = y.shape

    residual = y - XX @ beta_hat

    # ndarray.item() replaces np.asscalar, which no longer exists in NumPy.
    return (residual.T @ residual).item() / n

Run for several trials.

$$ \mathbb{E}\left[\frac{1}{n}\sum_{i = 1}^n\left(y_i - x_i^T \hat{\beta}\right)^2 \right] = \mathbb{E}\left[\frac{1}{n} \left(y - \mathbb{X} \hat{\beta}\right)^T \left(y - \mathbb{X} \hat{\beta}\right)\right]$$

In [8]:
def avg_train_err(gen_XX, gen_y, num_trials):
    r"""Monte Carlo estimate of the training error of least squares.

    Every trial draws a fresh data set, fits $\hat{\beta}$ on it and
    records the training error of that fit on the same data.

    Args:
        gen_XX (func): A function with no parameters returning a set of
            observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of all training errors across trials.
        float: Standard deviation of training errors across trials.

    """
    def run_trial():
        # Covariates are drawn before targets, once per trial.
        covariates = gen_XX()
        targets = gen_y(covariates)
        fitted = fit_beta_hat(covariates, targets)
        return get_train_err(fitted, covariates, targets)

    errors = np.array([run_trial() for _ in range(num_trials)], dtype=float)
    errors = errors.reshape(num_trials, 1)

    return errors.mean(), errors.std()

In [9]:
# (mean, std) of the training error over num_trials simulated data sets.
avg_train_err(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    num_trials=num_trials,
)

(1.0663375694690895, 0.21246183638826555)

### Part B

Generate a single new sample $\tilde{x} \in \mathbb{R}^p$.

$$ \tilde{x}_i \sim \mathcal{N}\left(0, 1\right) $$

In [10]:
def gen_x_tilde(p):
    r"""Draw one new sample with $p$ standard normal features.

    Args:
        p (int): Number of features for the sample drawn.

    Returns:
        ndarray: A $p$ by $1$ column vector in $\mathbb{R}^p$ with each
            component drawn from the standard normal distribution.

    """
    column_shape = (p, 1)
    return np.random.randn(*column_shape)

Testing error for a single sample $\tilde{x}$.

$$ \mathbb{E}\left[\left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right)^2\right] + \sigma^2 = \left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right)^T \left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right) + \frac{\beta^T \beta}{2} $$

In [11]:
def get_test_err(x_tilde, beta, beta_hat):
    r"""Calculate the actual test error for a new sample given the
    $\hat{\beta}$ and the actual $\beta$.

    The result is the squared gap between the noiseless prediction under
    $\beta$ and the prediction under $\hat{\beta}$, plus the noise
    variance $\beta^T \beta / 2$.

    Args:
        x_tilde (ndarray): A $p$ by $1$ vector to compute the error for.
        beta (ndarray): A $p$ by $1$ vector of actual $\beta$.
        beta_hat (ndarray): A $p$ by $1$ vector of estimated $\hat{\beta}$.

    Returns:
        float: Actual error for new test point on estimated $\hat{\beta}$
            given the actual $\beta$.

    """
    # x^T beta - x^T beta_hat, computed once as a 1 by 1 array.
    prediction_gap = x_tilde.T @ (beta - beta_hat)
    noise_variance = beta.T @ beta / 2

    # ndarray.item() replaces np.asscalar, which no longer exists in NumPy.
    return (prediction_gap.T @ prediction_gap + noise_variance).item()

Run for several trials.

$$ \mathbb{E}\left[\mathbb{E}\left[\left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right)^2\right] + \sigma^2\right] = \mathbb{E}\left[\left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right)^T \left(\tilde{x}^T \beta - \tilde{x}^T \hat{\beta}\right) + \frac{\beta^T \beta}{2}\right]$$

In [12]:
def avg_test_err(gen_XX, gen_y, gen_x_tilde, beta, num_trials):
    r"""Monte Carlo estimate of the actual test error at a fresh point.

    Every trial draws a training set, fits $\hat{\beta}$ on it, then
    draws one new sample and evaluates the test error there.

    Args:
        gen_XX (func): A function with no parameters returning a matrix
            of observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        gen_x_tilde (func): A function with no parameters returning a
            new sample point to test error of.
        beta (ndarray): A $p$ by $1$ vector of actual $\beta$.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of all actual test errors across all trials.
        float: Standard deviation of actual test errors across all trials.

    """
    errors = np.zeros((num_trials, 1))

    for trial in range(num_trials):
        covariates = gen_XX()
        targets = gen_y(covariates)
        fitted = fit_beta_hat(covariates, targets)

        # The new point is drawn only after the training data.
        new_point = gen_x_tilde()
        errors[trial, 0] = get_test_err(new_point, beta, fitted)

    mean_err = errors.mean()
    std_err = errors.std()
    return mean_err, std_err

In [13]:
# (mean, std) of the test error at one freshly drawn point per trial.
avg_test_err(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    gen_x_tilde=lambda: gen_x_tilde(p),
    beta=beta,
    num_trials=num_trials,
)

(4.311799299124214, 3.2253003882322964)

Using the relation specified in the problem, test error is

$$ \lVert \beta - \hat{\beta} \rVert_2^2 + \sigma^2 = \left(\beta - \hat{\beta}\right)^T \left(\beta - \hat{\beta}\right) + \frac{\beta^T \beta}{2} $$

In [14]:
def get_closed_form_test_err(beta, beta_hat):
    r"""Computes the actual test error between estimated
    $\hat{\beta}$ and actual $\beta$ without a new sample
    drawn, as $\lVert \beta - \hat{\beta} \rVert_2^2 + \beta^T \beta / 2$.

    Args:
        beta (ndarray): A $p$ by $1$ vector of actual $\beta$.
        beta_hat (ndarray): A $p$ by $1$ vector of estimated $\hat{\beta}$.

    Returns:
        float: Actual error of estimated $\hat{\beta}$ given the actual $\beta$.

    """
    coef_gap = beta - beta_hat
    noise_variance = beta.T @ beta / 2

    # ndarray.item() replaces np.asscalar, which no longer exists in NumPy.
    return (coef_gap.T @ coef_gap + noise_variance).item()

Run for several trials.

$$ \mathbb{E}\left[\lVert \beta - \hat{\beta} \rVert_2^2 + \sigma^2\right] = \mathbb{E}\left[\left(\beta - \hat{\beta}\right)^T \left(\beta - \hat{\beta}\right) + \frac{\beta^T \beta}{2}\right] $$

In [15]:
def avg_closed_form_test_err(gen_XX, gen_y, beta, num_trials):
    r"""Monte Carlo estimate of the actual test error via its closed form.

    Every trial draws a training set, fits $\hat{\beta}$ on it and
    evaluates the closed form test error, so no new point has to be
    drawn from the distribution.

    Args:
        gen_XX (func): A function with no parameters returning a matrix
            of observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        beta (ndarray): A $p$ by $1$ vector of actual $\beta$.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of all actual test errors across all trials.
        float: Standard deviation of actual test errors across all trials.

    """
    def run_trial():
        covariates = gen_XX()
        targets = gen_y(covariates)
        return get_closed_form_test_err(beta, fit_beta_hat(covariates, targets))

    errors = np.array([run_trial() for _ in range(num_trials)], dtype=float)
    errors = errors.reshape(num_trials, 1)

    return errors.mean(), errors.std()

In [16]:
# (mean, std) of the closed form test error over num_trials simulated data sets.
avg_closed_form_test_err(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    beta=beta,
    num_trials=num_trials,
)

(4.316051607216384, 0.6362305558502975)

### Part C

First, we can compute the leave one out cross validation naively fitting several $\hat{\beta}$.

$$ \frac{1}{n} \sum_{i = 1}^n \left(y_i - \hat{\hat{y}}_i\right)^2 = \frac{1}{n} \left(y - \hat{\hat{y}}\right)^T \left(y - \hat{\hat{y}}\right) $$

Where $\hat{\hat{y}}_i$ is the prediction at $x_i$ using the $\hat{\beta}'$ fitted without $x_i$.

In [17]:
def get_loocv_n(XX, y):
    r"""Naively computes the leave one out cross validation error
    by repeatedly training a linear regression model and testing
    on the left out point.

    Args:
        XX (ndarray): An $n$ by $p$ matrix of observations to
            run cross validation on.
        y (ndarray): An $n$ by $1$ vector of targets to consider.

    Returns:
        float: Leave one out cross validation error for the provided
            data computed naively.

    """
    n, _ = XX.shape

    y_hat_hat = np.zeros((n, 1))

    for train_ind, test_ind in LeaveOneOut().split(XX):

        beta_hat_prime = fit_beta_hat(XX[train_ind], y[train_ind])

        # Store the prediction at the held out row itself rather than
        # relying on the splitter's iteration order.
        y_hat_hat[test_ind] = XX[test_ind] @ beta_hat_prime

    residual = y - y_hat_hat

    # ndarray.item() replaces np.asscalar, which no longer exists in NumPy.
    return (residual.T @ residual).item() / n

Repeated for several trials.

$$ \mathbb{E}\left[\frac{1}{n} \sum_{i = 1}^n \left(y_i - \hat{\hat{y}}_i\right)^2\right] = \mathbb{E}\left[\frac{1}{n} \left(y - \hat{\hat{y}}\right)^T \left(y - \hat{\hat{y}}\right)\right] $$

In [18]:
def avg_loocv_n(gen_XX, gen_y, num_trials):
    r"""Monte Carlo estimate of the naively computed LOOCV error.

    Every trial draws a fresh data set and runs the naive leave one out
    cross validation on it.

    Args:
        gen_XX (func): A function with no parameters returning a matrix
            of observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of LOOCV errors across all trials.
        float: Standard deviation of LOOCV across all trials.

    """
    scores = np.zeros((num_trials, 1))

    for trial in range(num_trials):
        covariates = gen_XX()
        targets = gen_y(covariates)
        scores[trial, 0] = get_loocv_n(covariates, targets)

    mean_score = scores.mean()
    std_score = scores.std()
    return mean_score, std_score

In [19]:
# (mean, std) of the naive LOOCV error over num_trials simulated data sets.
avg_loocv_n(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    num_trials=num_trials,
)

(4.336055794101643, 0.8821864752999772)

We can compute this more quickly using the closed form definition of leave one out cross validation provided in the problem set,

$$ \mathrm{LOOCV}_n = \frac{1}{n} \sum_{i = 1}^n \left(\frac{y_i - \hat{y}_i}{h_i}\right)^2 = \frac{1}{n} \left(h^{-1}\left(y - \mathbb{X} \hat{\beta}\right)\right)^T \left(h^{-1}\left(y - \mathbb{X} \hat{\beta}\right)\right) $$

Where $h$ is defined as,
$$ h_i = 1 - x_i \left(\mathbb{X}^T \mathbb{X}\right)^{-1} x_i^T $$
$$ h = I \odot \left(I - \mathbb{X} \left(\mathbb{X}^T \mathbb{X}\right)^{-1} \mathbb{X}^T\right) = \mathrm{diag}\left(h_1, \dots, h_n\right) $$

In [20]:
def get_closed_form_loocv_n(XX, y, beta_hat):
    r"""Uses the known closed form of LOOCV for linear
    regression by reusing an existing $\hat{\beta}$.

    Each residual $y_i - x_i \hat{\beta}$ is divided by
    $h_i = 1 - x_i (\mathbb{X}^T \mathbb{X})^{-1} x_i^T$ and the
    squared results are averaged.

    Args:
        XX (ndarray): An $n$ by $p$ matrix of observations to
            run cross validation on.
        y (ndarray): An $n$ by $1$ vector of targets to consider.
        beta_hat (ndarray): A $p$ by $1$ vector representing
            $\hat{\beta}$ calculated from the provided data.

    Returns:
        float: LOOCV error of data and $\hat{\beta}$ using closed
            form.

    """
    n, _ = XX.shape

    gram_inv = np.linalg.inv(XX.T @ XX)

    # Only the diagonal x_i (X^T X)^{-1} x_i^T is needed, so take row-wise
    # dot products instead of forming and inverting an n by n matrix.
    leverage = np.sum((XX @ gram_inv) * XX, axis=1, keepdims=True)
    h = 1 - leverage

    loo_residual = (y - XX @ beta_hat) / h

    # ndarray.item() replaces np.asscalar, which no longer exists in NumPy.
    return (loo_residual.T @ loo_residual).item() / n

Repeated for several trials.

$$ \mathbb{E}\left[\frac{1}{n} \sum_{i = 1}^n \left(\frac{y_i - \hat{y}_i}{h_i}\right)^2\right] = \mathbb{E}\left[\frac{1}{n} \left(h^{-1}\left(y - \mathbb{X} \hat{\beta}\right)\right)^T \left(h^{-1}\left(y - \mathbb{X} \hat{\beta}\right)\right)\right] $$

In [21]:
def avg_closed_form_loocv_n(gen_XX, gen_y, num_trials):
    r"""Monte Carlo estimate of the LOOCV error via its closed form.

    Every trial draws a fresh data set, fits $\hat{\beta}$ on all of it
    and evaluates the closed form LOOCV error with that fit.

    Args:
        gen_XX (func): A function with no parameters returning a matrix
            of observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of LOOCV errors across all trials.
        float: Standard deviation of LOOCV across all trials.

    """
    def run_trial():
        covariates = gen_XX()
        targets = gen_y(covariates)
        fitted = fit_beta_hat(covariates, targets)
        return get_closed_form_loocv_n(covariates, targets, fitted)

    scores = np.array([run_trial() for _ in range(num_trials)], dtype=float)
    scores = scores.reshape(num_trials, 1)

    return scores.mean(), scores.std()

In [22]:
# (mean, std) of the closed form LOOCV error over num_trials simulated data sets.
avg_closed_form_loocv_n(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    num_trials=num_trials,
)

(4.348439837327746, 0.8622992847454757)

### Part D

Since there is no reasonable closed form for $k$-fold cross validation, we implement it naively.

In [23]:
def get_k_fold_cv(XX, y, k):
    r"""Compute the $k$-fold cross validation error of least squares.

    For each fold a model is fitted on the remaining folds and its mean
    squared error on the held out fold is recorded; the result is the
    sum of those per-fold errors divided by $k$.

    Args:
        XX (ndarray): An $n$ by $p$ matrix of observations to
            run cross validation on.
        y (ndarray): An $n$ by $1$ vector of targets to consider.
        k (int): Number of splits to make in the data.

    Returns:
        float: k-fold cross validation error over the provided data.

    """
    fold_errors = []

    for train_ind, test_ind in KFold(n_splits=k).split(XX):
        beta_hat_prime = fit_beta_hat(XX[train_ind], y[train_ind])
        held_out_residual = y[test_ind] - XX[test_ind] @ beta_hat_prime
        fold_errors.append(np.mean(held_out_residual ** 2))

    return sum(fold_errors) / k

Repeating several times.

In [24]:
def avg_k_fold_cv(gen_XX, gen_y, k, num_trials):
    r"""Monte Carlo estimate of the $k$-fold cross validation error.

    Every trial draws a fresh data set and runs $k$-fold cross
    validation on it.

    Args:
        gen_XX (func): A function with no parameters returning a matrix
            of observations to train on.
        gen_y (func): A function with parameter XX which determines the
            matching targets for the generated XX samples.
        k (int): Number of splits to make in the data.
        num_trials (int): Number of times to repeat evaluation.

    Returns:
        float: Mean of k-fold cross validation errors across all trials.
        float: Standard deviation of k-fold cross validation errors
            across all trials.

    """
    scores = np.zeros((num_trials, 1))

    for trial in range(num_trials):
        covariates = gen_XX()
        targets = gen_y(covariates)
        scores[trial, 0] = get_k_fold_cv(covariates, targets, k)

    mean_score = scores.mean()
    std_score = scores.std()
    return mean_score, std_score

In [25]:
# (mean, std) of the 10-fold cross validation error over num_trials simulated data sets.
avg_k_fold_cv(
    gen_XX=lambda: gen_XX(n, p),
    gen_y=lambda covariates: gen_y(covariates, beta),
    k=10,
    num_trials=num_trials,
)

(4.905681067377833, 1.0904601109970298)