In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# load_csv_data is not defined in this notebook; it is pulled in by this star import.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Per the section heading: y = class labels, tX = feature matrix, ids = event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Standardize training data

In [3]:
# standardize is not defined in this notebook; presumably it comes from this
# star import of the project-local helpers module — TODO confirm.
from helpers import *

# standardize returns three values: the transformed feature matrix and two more,
# presumably the mean and standard deviation used for the scaling (the names
# suggest so — verify against helpers.py).
tX_norm, mean_, std_ = standardize(tX)

## Implementation of ML methods

### Linear regression - gradient descent

In [4]:
from costs import *

def compute_gradient(y, tx, w):
    """Gradient of the mean-squared-error loss of a linear model.

    For L(w) = 1/(2N) * ||y - tx.w||^2 the gradient with respect to w is
    -(1/N) * tx^T (y - tx.w).

    Parameters
    ----------
    y : ndarray
        Targets, shape (N,) or (N, 1).
    tx : ndarray
        Feature matrix, shape (N, D).
    w : ndarray
        Weights, shape (D,) when y is (N,), or (D, 1) when y is (N, 1).
        Mixing a 1-D y with a (D, 1) w makes numpy broadcast the error
        to (N, N), so keep the two consistent.

    Returns
    -------
    ndarray
        Gradient with the same shape as w.
    """
    # error (residual) of the current linear prediction
    e = y - tx.dot(w)
    # gradient: project the residual back onto the D features with tx.T and
    # average over the N samples. (tx.dot(e) is dimensionally wrong, and
    # taking np.mean of it would collapse the gradient to a single scalar.)
    gradient = -tx.T.dot(e) / len(e)

    return gradient


def least_squares_GD(y, tx, gamma, max_iters):
    """Linear regression fitted by (full-batch) gradient descent.

    Parameters
    ----------
    y : ndarray
        Targets, shape (N,) or (N, 1).
    tx : ndarray
        Feature matrix, shape (N, D).
    gamma : float
        Step size.
    max_iters : int
        Number of gradient steps to take.

    Returns
    -------
    losses : list
        One entry per iteration: the value of compute_loss at the weights
        *before* that iteration's update.
    ws : list of ndarray
        The random initial weights followed by the weights after each
        update (max_iters + 1 entries).
    """
    # Random initial weights. The trailing shape follows y, giving (D,) for a
    # 1-D y and (D, 1) for a column y, so that `y - tx.dot(w)` subtracts
    # element-wise instead of broadcasting (N,) against (N, 1) into (N, N).
    w_temp = np.random.rand(tx.shape[1], *np.shape(y)[1:])
    ws = [w_temp]
    losses = []

    for _ in range(max_iters):
        # compute gradient and loss at the current weights
        grad = compute_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp = w_temp - gamma * grad

        # store w and loss (the original copied an undefined name `w` here)
        ws.append(np.copy(w_temp))
        losses.append(loss)

    return losses, ws

### Linear regression - stochastic gradient descent

In [5]:
def compute_stoch_gradient(y, tx, w):
    """Estimate the gradient from mini-batches of size 35.

    Iterates over the (minibatch_y, minibatch_tx) pairs yielded by
    batch_iter(y, tx, B), evaluates compute_gradient on each pair, and
    returns the average of those per-batch gradients.

    Parameters
    ----------
    y : targets passed through to batch_iter.
    tx : feature matrix passed through to batch_iter.
    w : weights at which the gradient is evaluated.

    Returns
    -------
    The mean of the per-batch gradients, or 0 if batch_iter yields nothing.
    """
    B = 35  # size of the batch
    grad_sum = 0  # renamed from `sum` to stop shadowing the builtin
    n_batches = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
        grad_sum = grad_sum + compute_gradient(minibatch_y, minibatch_tx, w)
        n_batches += 1

    if n_batches == 0:
        # No batch was produced; keep the original's zero result.
        return grad_sum

    # Average over the number of batches actually seen. Dividing by the batch
    # size B (as before) shrank the gradient by a factor unrelated to how
    # many gradients were summed.
    return grad_sum / n_batches


def least_squares_SGD(y, tx, gamma, max_iters):
    """Linear regression fitted by stochastic (mini-batch) gradient descent.

    Parameters
    ----------
    y : ndarray
        Targets, shape (N,) or (N, 1).
    tx : ndarray
        Feature matrix, shape (N, D).
    gamma : float
        Step size.
    max_iters : int
        Number of stochastic gradient steps to take.

    Returns
    -------
    losses : list
        One entry per iteration: compute_loss on the full data at the
        weights *before* that iteration's update.
    ws : list of ndarray
        The random initial weights followed by the weights after each
        update (max_iters + 1 entries).
    """
    # Random initial weights. The trailing shape follows y, giving (D,) for a
    # 1-D y and (D, 1) for a column y, so that `y - tx.dot(w)` subtracts
    # element-wise instead of broadcasting (N,) against (N, 1) into (N, N).
    w_temp = np.random.rand(tx.shape[1], *np.shape(y)[1:])
    ws = [w_temp]
    losses = []

    for _ in range(max_iters):
        # stochastic gradient estimate and full-data loss at the current weights
        grad = compute_stoch_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp = w_temp - gamma * grad

        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)

    return losses, ws

### Least squares

In [6]:
def least_squares(y, tx):
    """Closed-form least squares via the normal equations.

    Solves (tx^T tx) w = tx^T y for w.

    Parameters
    ----------
    y : ndarray
        Targets, shape (N,) or (N, 1).
    tx : ndarray
        Feature matrix, shape (N, D).

    Returns
    -------
    loss
        compute_loss(y, tx, w_opt).
    w_opt : ndarray
        Optimal weights, shape (D,) or (D, 1) following y.

    Raises
    ------
    numpy.linalg.LinAlgError
        If tx^T tx is singular.
    """
    # Solve the linear system directly rather than forming the explicit
    # inverse: same solution, but cheaper and numerically better behaved.
    gram = tx.T.dot(tx)
    w_opt = np.linalg.solve(gram, tx.T.dot(y))

    # Compute loss
    loss = compute_loss(y, tx, w_opt)

    return loss, w_opt  # returns loss, and optimal weights

### Ridge regression

In [7]:
def ridge_regression(y, tx, lambda_):
    """Closed-form ridge regression.

    Solves (tx^T tx + 2 N lambda_ I) w = tx^T y, which is the stationary
    point of 1/(2N) * ||y - tx.w||^2 + lambda_ * ||w||^2.

    Parameters
    ----------
    y : ndarray
        Targets, shape (N,) or (N, 1).
    tx : ndarray
        Feature matrix, shape (N, D).
    lambda_ : float
        Regularisation strength (before the 2N scaling).

    Returns
    -------
    loss
        compute_loss(y, tx, w_opt), i.e. the data loss without the penalty term.
    w_opt : ndarray
        Optimal weights, shape (D,) or (D, 1) following y.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised system matrix is singular.
    """
    # Scaled penalty. The original computed this value but then added the
    # unscaled lambda_ to the diagonal, leaving lamb_ unused.
    lamb_ = 2 * len(y) * lambda_

    # Compute optimum weight by solving the regularised normal equations
    # (solve instead of an explicit inverse for numerical stability).
    system = tx.T.dot(tx) + lamb_ * np.eye(tx.shape[1])
    w_opt = np.linalg.solve(system, tx.T.dot(y))

    # Compute loss
    loss = compute_loss(y, tx, w_opt)

    return loss, w_opt  # returns loss, and optimal weights

### Logistic regression

### Regularized logistic regression

## Cross-validation

In [8]:
def build_k_indices(y, k_fold, seed):
    """Split shuffled row indices into k_fold equally sized folds.

    Seeds numpy's global random generator with `seed`, permutes the indices
    0..len(y)-1 and cuts the permutation into k_fold consecutive slices of
    int(len(y) / k_fold) indices each; any leftover indices are dropped.

    Returns an array with one row of indices per fold.
    """
    n_samples = len(y)
    fold_size = int(n_samples / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        stop = (fold + 1) * fold_size
        folds.append(shuffled[start:stop])

    return np.array(folds)


def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one fold of k-fold cross-validation for polynomial ridge regression.

    Fold `k` of `k_indices` is held out as the test set; the rows listed in
    all other folds form the training set. Both sets are expanded with
    build_poly(., degree), ridge regression is fitted on the training set,
    and the fitted weights are scored on both sets.

    Parameters
    ----------
    y : ndarray
        Targets, indexable by integer index arrays.
    x : ndarray
        Inputs, indexable by integer index arrays along axis 0.
    k_indices : array-like, shape (k_fold, fold_size)
        Row indices per fold, e.g. from build_k_indices.
    k : int
        Index of the fold to hold out.
    lambda_ : float
        Ridge regularisation strength.
    degree : int
        Degree forwarded to build_poly.

    Returns
    -------
    loss_tr
        Loss of the trained weights on the training set, as returned by
        ridge_regression.
    loss_te
        compute_loss of the trained weights on the held-out fold.
    weight_tr
        Weights fitted on the training set.
    weight_te
        Weights of a separate ridge fit on the held-out fold only. Kept so
        the four-element return tuple is unchanged; it plays no part in
        loss_te.
    """
    k_indices = np.asarray(k_indices)

    # get k'th subgroup in test, others in train
    te_indices = k_indices[k]
    tr_indices = np.delete(k_indices, k, axis=0).reshape(-1)

    x_test = x[te_indices]
    y_test = y[te_indices]  # was sliced from x by mistake
    # Select every row of the remaining folds (the original removed only the
    # single row at position k from x and y).
    x_tr = x[tr_indices]
    y_tr = y[tr_indices]

    # form train and test data with polynomial basis function
    poly_x_tr = build_poly(x_tr, degree)
    poly_x_test = build_poly(x_test, degree)

    # calculate weights and training loss through ridge regression
    loss_tr, weight_tr = ridge_regression(y_tr, poly_x_tr, lambda_)

    # held-out loss: score the *trained* weights on the unseen fold
    loss_te = compute_loss(y_test, poly_x_test, weight_tr)

    # separate fit on the held-out fold, only to keep the return tuple's shape
    _, weight_te = ridge_regression(y_test, poly_x_test, lambda_)

    return loss_tr, loss_te, weight_tr, weight_te

## Training

In [9]:
# NOTE(review): every training call in this cell is disabled. The whole cell is
# one triple-quoted string, so running it only echoes the text back as the cell
# output and defines none of the loss_* / w_opt_* / weights_* variables below.
'''# Linear regression - gradient descent
max_iters_GD = 10
gamma_GD = 2
loss_GD, w_opt_GD = least_squares_GD(y, tX_norm, gamma_GD, max_iters_GD)

# Linear regression - stochastic gradient descent
max_iters_SGD = 10
gamma_SGD = 2
loss_SGD, w_opt_SGD = least_squares_SGD(y, tX_norm, gamma_SGD, max_iters_SGD)

# Least squares
loss_leastSquares, weights_leastSquares = least_squares(y, tX_norm)

# Ridge regression
lambda_ = 2
loss_ridgeRegression, weights_ridgeRegression = ridge_regression(y, tX_norm, lambda_)

# Logistic regression


# Regularized logistic regression'''

'# Linear regression - gradient descent\nmax_iters_GD = 10\ngamma_GD = 2\nloss_GD, w_opt_GD = least_squares_GD(y, tX_norm, gamma_GD, max_iters_GD)\n\n# Linear regression - stochastic gradient descent\nmax_iters_SGD = 10\ngamma_SGD = 2\nloss_SGD, w_opt_SGD = least_squares_SGD(y, tX_norm, gamma_SGD, max_iters_SGD)\n\n# Least squares\nloss_leastSquares, weights_leastSquares = least_squares(y, tX_norm)\n\n# Ridge regression\nlambda_ = 2\nloss_ridgeRegression, weights_ridgeRegression = ridge_regression(y, tX_norm, lambda_)\n\n# Logistic regression\n\n\n# Regularized logistic regression'

## Generate predictions and save output in CSV format for submission:

In [10]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# The first value returned by load_csv_data is discarded here; only the
# feature matrix and the ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '../data/dataSubmission.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned in any cell of this notebook (the only
# weights_* names live inside the disabled training string), so this cell fails on
# a fresh kernel — assign the chosen model's weights to `weights` before running.
# NOTE(review): tX_test is passed as loaded, whereas the training calls use the
# standardized tX_norm; presumably the test features need the same scaling
# (mean_, std_) before prediction — TODO confirm.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)