In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Cross-Validation and Bias-Variance decomposition
## Cross-Validation
Implementing 4-fold cross-validation below:

In [None]:
from helpers import load_data

# Load the dataset once. `x` and `y` are module-level names: cross_validation_demo()
# further down reads them directly instead of taking them as arguments.
x, y = load_data()

In [None]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of `y` and cut them into `k_fold` equal folds.

    Args:
        y: array-like exposing `.shape[0]`; only its length is used.
        k_fold: number of folds to build.
        seed: value passed to `np.random.seed` before shuffling.

    Returns:
        An array with one row per fold, each row holding
        `int(len(y) / k_fold)` shuffled row indices.

    Notes:
        * When `len(y)` is not a multiple of `k_fold`, the trailing
          `len(y) % k_fold` shuffled indices are not assigned to any fold.
        * Calling this reseeds NumPy's global random state.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [None]:
# Sanity check of the fold bookkeeping on a toy index range:
# fold `k` is held out, every other fold is used for training.
k_fold = 4
k = 1
size = 10

k_indices = build_k_indices(np.arange(size), k_fold, 1)
print(k_indices)
# indices of all folds except the held-out one
print(np.delete(k_indices, k, axis=0).flatten())
# indices of the held-out fold
print(k_indices[k].flatten())

In [None]:
from costs import compute_mse
from ridge_regression import ridge_regression
from build_polynomial import build_poly

def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on all folds but one and score it on both parts.

    Args:
        y: target values, indexable by an integer index array.
        x: input values, indexable by an integer index array.
        k_indices: 2-D array with one row of sample indices per fold
            (as produced by `build_k_indices`).
        k: row of `k_indices` to hold out as the test fold.
        lambda_: regularisation strength forwarded to `ridge_regression`.
        degree: polynomial degree forwarded to `build_poly`.

    Returns:
        `(loss_tr, loss_te)`: the values `compute_mse` returns for the
        training folds and for the held-out fold, using the same weights.
    """
    # The fold count comes from k_indices itself, so the function does not
    # depend on a module-level `k_fold` happening to exist and to match.
    n_folds = len(k_indices)

    # k'th subgroup is the test set, all other subgroups form the training set
    idx_tr = k_indices[np.arange(n_folds) != k].flatten()
    idx_te = k_indices[k].flatten()

    x_tr = x[idx_tr]
    y_tr = y[idx_tr]

    x_te = x[idx_te]
    y_te = y[idx_te]

    # form data with polynomial degree
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    # ridge regression, fitted on the training folds only
    w = ridge_regression(y_tr, tx_tr, lambda_)

    # calculate the loss for train and test data
    loss_tr = compute_mse(y_tr, tx_tr, w)
    loss_te = compute_mse(y_te, tx_te, w)

    return loss_tr, loss_te

In [None]:
from plots import cross_validation_visualization

def cross_validation_demo():
    """Sweep the ridge penalty with 4-fold cross-validation and plot the RMSE.

    Uses the module-level `x` and `y`. For each of 30 log-spaced lambdas in
    [1e-4, 1], the train and test losses returned by `cross_validation`
    (degree 7) are averaged over the folds and square-rooted; the two curves
    are handed to `cross_validation_visualization`.
    """
    seed = 1
    degree = 7
    k_fold = 4
    lambdas = np.logspace(-4, 0, 30)

    # one fixed split, reused for every lambda
    k_indices = build_k_indices(y, k_fold, seed)

    rmse_tr = []
    rmse_te = []
    for lambda_ in lambdas:
        # (train loss, test loss) for each held-out fold, in fold order
        fold_losses = [
            cross_validation(y, x, k_indices, fold, lambda_, degree)
            for fold in range(k_fold)
        ]
        total_tr = sum(tr for tr, _ in fold_losses)
        total_te = sum(te for _, te in fold_losses)
        rmse_tr.append(np.sqrt(total_tr / k_fold))
        rmse_te.append(np.sqrt(total_te / k_fold))

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)

# Run the lambda sweep (4 folds, degree 7) and draw the train/test RMSE plot.
cross_validation_demo()

## Bias-Variance Decomposition
Visualize the bias-variance trade-off by implementing the function `bias_variance_demo()` below:

In [None]:
from least_squares import least_squares
from split_data import split_data
from plots import bias_variance_decomposition_visualization

def bias_variance_demo():
    """Collect train/test RMSE of least-squares polynomial fits over many seeds.

    For each seed in 0..99 a noisy sine sample is generated
    (`sin(x) + 0.3 * randn` on 10000 evenly spaced points in [0.1, 2*pi]),
    split with `split_data` (ratio 0.005 -- presumably the training share,
    confirm against `split_data`), and fitted with `least_squares` for
    polynomial degrees 1..9. The square roots of the train and test MSE are
    stored in two (n_seeds, n_degrees) tables, which are passed to
    `bias_variance_decomposition_visualization`.
    """
    # experiment parameters
    seeds = range(100)
    degrees = range(1, 10)
    num_data = 10000
    ratio_train = 0.005

    # one row per seed, one column per degree
    table_shape = (len(seeds), len(degrees))
    rmse_tr = np.empty(table_shape)
    rmse_te = np.empty(table_shape)

    for row, seed in enumerate(seeds):
        # fresh noisy sample for this seed
        np.random.seed(seed)
        x = np.linspace(0.1, 2 * np.pi, num_data)
        noise = 0.3 * np.random.randn(num_data)
        y = np.sin(x) + noise

        # split data with the same seed
        x_tr, x_te, y_tr, y_te = split_data(x, y, ratio_train, seed)

        for col, degree in enumerate(degrees):
            phi_tr = build_poly(x_tr, degree)
            phi_te = build_poly(x_te, degree)
            # least_squares is unpacked as (training mse, weights)
            mse_tr, w = least_squares(y_tr, phi_tr)
            rmse_tr[row, col] = np.sqrt(mse_tr)
            rmse_te[row, col] = np.sqrt(compute_mse(y_te, phi_te, w))

    bias_variance_decomposition_visualization(degrees, rmse_tr, rmse_te)

# Run the experiment (100 seeds, polynomial degrees 1-9) and draw the RMSE plot.
bias_variance_demo()