In [17]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Cross-Validation and Bias-Variance decomposition
## Cross-Validation
Implementing 4-fold cross-validation below:

In [2]:
from helpers import load_data

# load dataset
x, y = load_data()

In [3]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

In [15]:
from costs import compute_mse
from ridge_regression import ridge_regression
from build_polynomial import build_poly
from split_data import split_data

def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train:
    x_test = x[k_indices[k]]
    y_test = x[k_indices[k]]
    tr_indices = k_indices[:].pop(k_indices[k])
    print(tr_indices)
    x_tr = x[k_indices[:k]] + x[k_indices[(k + 1):]]
    y_tr = y[k_indices[:k]] + y[k_indices[(k + 1):]]
    
    # form train and test data with polynomial basis function
    poly_x_tr = build_poly(x_tr, degree)
    poly_x_test = build_poly(x_test, degree)
    
    # calcualte weight through least square.
    weight_tr, mse_tr = ridge_regression(y_tr, poly_x_tr, lambda_)
    weight_te, mse_test = ridge_regression(y_test, poly_x_test, lambda_)
    
    # calculate loss for train and test data,
    # and store them in rmse_tr and rmse_te respectively
    loss_tr = np.sqrt(2*mse_tr)
    loss_te = np.sqrt(2*mse_test)
    
    return loss_tr, loss_te

In [16]:
from plots import cross_validation_visualization

def cross_validation_demo():
    seed = 1
    degree = 7
    k_fold = 4
    lambdas = np.logspace(-4, 2, 30)
    
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    
    # cross validation:
    for lamb in (lambdas):
        temp_rmse_tr = []
        temp_rmse_te = []
        for k in range(k_fold):
            tp_rmse_tr, tp_rmse_te = cross_validation(y, x, k_indices, k, lamb, degree)
            temp_rmse_tr.append(tp_rmse_tr)
            temp_rmse_te.append(tp_rmse_te)
        rmse_tr.append( np.mean(temp_rmse_tr) )
        rmse_te.append( np.mean(temp_rmse_te) )

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)

cross_validation_demo()

AttributeError: 'numpy.ndarray' object has no attribute 'pop'

## Bias-Variance Decomposition
Visualize bias-variance trade-off by implementing the function `bias_variance_demo()` below:

In [None]:
from least_squares import least_squares
from split_data import split_data
from plots import bias_variance_decomposition_visualization

def bias_variance_demo():
    """The entry."""
    # define parameters
    seeds = range(100)
    num_data = 10000
    ratio_train = 0.005
    degrees = range(1, 10)
    
    # define list to store the variable
    rmse_tr = np.empty((len(seeds), len(degrees)))
    rmse_te = np.empty((len(seeds), len(degrees)))
    
    for index_seed, seed in enumerate(seeds):
        np.random.seed(seed)
        x = np.linspace(0.1, 2 * np.pi, num_data)
        y = np.sin(x) + 0.3 * np.random.randn(num_data).T
        # split data with a specific seed: TODO
        x_tr, x_test, y_tr, y_test = split_data(x, y, seed)
        
        raise NotImplementedError
        # ***************************************************
        # INSERT YOUR CODE HERE
        # bias_variance_decomposition: TODO
        # ***************************************************
        raise NotImplementedError

    bias_variance_decomposition_visualization(degrees, rmse_tr, rmse_te)

bias_variance_demo()