In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Standardize training data

In [3]:
from helpers import *

tX_norm, mean_, std_ = standardize(tX)

## Implementation of ML methods

### Linear regression - gradient descent

In [4]:
from costs import *

def compute_gradient(y, tx, w):
    # error
    e = y - tx.dot(w)
    
    # gradient
    gradient = - np.mean(np.transpose(tx).dot(e))
    
    return gradient

In [5]:
def least_squares_GD(y, tx, gamma, max_iters):
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []
    
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)
        
        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)

    return losses, ws

### Linear regression - stochastic gradient descent

In [6]:
def compute_stoch_gradient(y, tx, w):
    B = 35 # size of the batch
    sum = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
        sum += compute_gradient(minibatch_y, minibatch_tx, w)

    return sum / B

In [7]:
def least_squares_SGD(y, tx, gamma, max_iters):    
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []

    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_stoch_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
    return losses, ws

### Least squares

In [8]:
def least_squares(y, tx):
    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = tx_transpose.dot(tx)
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
        
    return loss, w_opt # returns loss, and optimal weights

### Ridge regression

In [9]:
def ridge_regression(y, tx, lambda_):    
    # Initiation variables
    lamb_ = 2*len(y)*lambda_ 

    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = np.dot(tx_transpose,tx) + lamb_*np.eye(tx.shape[1])
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
    
    return loss, w_opt # returns mse, and optimal weights

### Logistic regression

In [10]:
def sigmoid(t):
    return 1/(1+np.exp(-t))

In [11]:
def learning_by_gradient_descent(y, tx, w, gamma):
    # compute the loss
    N = tx.shape[0]
    l1 = tx.dot(w) + np.log(np.ones((N))+np.exp(-tx.dot(w)))
    l2 = y*(tx.dot(w))
    loss = (np.ones((1,N)).dot(l1-l2))[0]
    
    # compute the gradient
    grad = np.transpose(tx).dot(sigmoid(tx.dot(w))-y)
    
    # update w
    w = w - gamma*grad

    return loss, w

In [12]:
def logistic_regression(y, tx, gamma, max_iters):
    print("\n\n\nlogistic_regression(y, tx, gamma, max_iters):\n")

    # init parameters
    threshold = 1e-8
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = [8000]

    # start the logistic regression
    for iter in range(max_iters):        
        # get loss and update w.
        loss, w_temp = learning_by_gradient_descent(y, tx, w_temp, gamma)
        
        # log info
        if iter % 1000 == 0:
            print("Current iteration={i}:\nLoss={l}\nw={w}\n".format(i=iter, l=loss, w=ws[-1]))
            
       
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return losses, ws

### Regularized logistic regression

### Cross-validation

In [13]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = len(y)
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    
    return np.array(k_indices)


def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train:
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    
    tr_indices = np.delete(k_indices, k, axis=0)
    x_tr = np.delete(x, k, axis=0)
    y_tr = np.delete(y, k, axis=0)
    
    # form train and test data with polynomial basis function
    poly_x_tr = build_poly(x_tr, degree)
    poly_x_test = build_poly(x_test, degree)
    
    # calcualte weight and loss through least square.
    loss_tr, weight_tr = ridge_regression(y_tr, poly_x_tr, lambda_)
    loss_test, weight_te = ridge_regression(y_test, poly_x_test, lambda_)
    
    return loss_tr, loss_te, weight_tr, weight_te

### Bias-Variance Decomposition

# Training

In [14]:
# Linear regression - gradient descent
max_iters_GD = 100
gamma_GD = 1.0e-4
loss_GD, weights_GD = least_squares_GD(y, tX_norm, gamma_GD, max_iters_GD)
print("loss_GD={loss}\n\n".format(loss=loss_GD))
print("weights_GD={w}".format(w=weights_GD[-1]))

loss_GD=[0.5, 0.49509329415905884, 0.49309042431073274, 0.49227287221786792, 0.49193915536397137, 0.491802935373164, 0.49174733169094376, 0.49172463480586753, 0.4917153701569883, 0.491711588416958, 0.49171004474726876, 0.49170941463623768, 0.49170915743101157, 0.49170905244232693, 0.49170900958696168, 0.49170899209381624, 0.49170898495328358, 0.49170898203858726, 0.49170898084883641, 0.49170898036319149, 0.49170898016495607, 0.49170898008403829, 0.49170898005100838, 0.49170898003752589, 0.49170898003202246, 0.49170898002977603, 0.49170898002885916, 0.49170898002848484, 0.49170898002833197, 0.49170898002826963, 0.49170898002824409, 0.49170898002823377, 0.49170898002822955, 0.49170898002822777, 0.49170898002822711, 0.49170898002822683, 0.49170898002822666, 0.49170898002822661, 0.49170898002822661, 0.49170898002822661, 0.49170898002822661, 0.49170898002822661, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822661, 0.49

In [15]:
# Linear regression - stochastic gradient descent
max_iters_SGD = 100
gamma_SGD = 1.0e-2
loss_SGD, weights_SGD = least_squares_SGD(y, tX_norm, gamma_SGD, max_iters_SGD)
print("loss_SGD={loss}\n".format(loss=loss_SGD))
print("weights_SGD={w}".format(w=weights_SGD[-1]))

loss_SGD=[0.5, 0.49171732249588501, 0.49170898842246119, 0.4917089800366729, 0.4917089800282351, 0.49170898002822661, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822661, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 0.49170898002822655, 

In [16]:
# Least squares
loss_LeastS, weights_LeastS = least_squares(y, tX_norm)
print("loss_LeastS={loss}\n".format(loss=loss_LeastS))
print("weights_LeastS={w}".format(w=weights_LeastS))

loss_leastSquares=0.315094589377539

w_leastSquares=[ -3.18637275e-01   4.42477967e-02  -2.93071150e-01  -2.19876743e-01
  -1.95048072e-01  -5.19968012e+01   6.94484146e-01   3.07557459e+00
   1.72252298e-01   1.87348945e-02  -2.06635171e+02  -2.70339176e-01
   1.59392549e-01   9.31526205e+01   4.49106633e+01  -1.55934806e-02
   1.29465979e-02   3.33733454e+01  -8.90097096e-02  -2.05452753e-02
   1.36509627e-01  -3.58709087e-02  -2.13404098e-01  -6.04613938e-01
  -2.68316421e+00   3.78781375e+00  -9.83142262e-01  -8.04545106e+00
  -4.39035145e+00  -3.24591992e+01   1.72466761e+02]


In [17]:
# Ridge regression
lambda_RR = 2
loss_RR, weights_RR = ridge_regression(y, tX_norm, lambda_RR)
print("loss_RR={loss}\n".format(loss=loss_RR))
print("weights_RR={w}".format(w=weights_RR))

loss_RR=0.4250442648890115

weights_RR=[ -6.37274549e-02   3.19514911e-02  -5.80616912e-02  -1.23637621e-03
   2.23939842e-02   1.11413313e-04   7.96656825e-03  -1.30638278e-05
  -2.88271041e-05  -7.47104152e-03   1.64345604e-02  -2.99202754e-02
   3.76612647e-02   7.88161002e-05   2.87391375e-02  -9.33629671e-03
   4.22149589e-03   8.07857456e-03  -1.80461236e-02  -9.87411785e-03
  -1.70482114e-03  -4.86383042e-03   9.14691084e-03   2.03746702e-03
   1.42223114e-02   1.25308371e-02   1.25158912e-02  -2.93414009e-04
   7.29915718e-05   3.39296602e-05   1.07390080e-02]


In [26]:
# Logistic regression
max_iters_LogR = 1000
gamma_LogR = 1.0e-8
loss_LogR, weights_LogR = logistic_regression(y, tX_norm, gamma_LogR, max_iters_LogR)
print("loss_LogR={loss}\n".format(loss=loss_LogR))
print("weights_LogR={w}".format(w=weights_LogR[-1]))




logistic_regression(y, tx, gamma, max_iters):

Current iteration=0:
Loss=345.8804430994126
w=[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

Current iteration=1000:
Loss=342.43580370291346
w=[ -4.08245559e-03   1.10803538e-03  -1.82579302e-03   3.37991356e-05
   1.12279640e-03   5.11829854e-04   6.77697058e-04   5.09122550e-04
  -2.52378885e-04   1.04114418e-04   1.02183898e-03  -9.09029274e-04
   1.42319717e-03   5.11151981e-04   1.12357679e-03  -2.23077243e-04
   6.86488330e-05   2.26556094e-04  -4.22967613e-04  -3.13484077e-04
   1.48236305e-04  -1.09222225e-04   8.59449830e-04   6.68570953e-04
   9.19413214e-04   8.52775278e-04   8.52603790e-04   5.09788203e-04
   5.10939796e-04   5.10320963e-04   8.93710160e-04]

Current iteration=2000:
Loss=339.0316645037215
w=[ -8.15982163e-03   2.21067482e-03  -3.64355568e-03   6.76259165e-05
   2.23376921e-03   1.01169531e-03   1.34348443e-03   1.00628290e-03
  

In [27]:
## Regularized logistic regression

## Generate predictions and save ouput in csv format for submission:

In [20]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [21]:
OUTPUT_PATH = '../data/dataSubmission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

NameError: name 'weights' is not defined