In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
tX.shape

(250000, 30)

### Standardize training data

In [4]:
# Don't use it! It deacrease the performance 
from helpers import *

tX_norm, mean_, std_ = standardize(tX)

### Cleaning data set

In [22]:
def clean_data(tx):
    nbrRows = tx.shape[0]
    nbrColunms = tx.shape[1]
    tx_cleaned = np.zeros((nbrRows,nbrColunms))
    
    for columnID in range(nbrColunms):
        currentColumn = tx[:,columnID]

        # replace -999 values with mean of the rest
        nanIndices = np.where(currentColumn == -999)
        tempColumm = np.delete(currentColumn, nanIndices, axis=0)
        mean = np.mean(tempColumm)
        currentColumn[nanIndices] = mean

        # normalization of the data set
        #norm=np.linalg.norm(currentColumn)
        #tx_cleaned[:,columnID] = currentColumn/norm
        
    return tx_cleaned

In [23]:
tX_cleaned = clean_data(tX)

## Implementation of ML methods

### Linear regression - gradient descent

In [7]:
from costs import *

def compute_gradient(y, tx, w):
    # error
    e = y - tx.dot(w)
    
    # gradient
    gradient = - np.mean(np.transpose(tx).dot(e))
    
    return gradient

In [8]:
def least_squares_GD(y, tx, gamma, max_iters):
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []
    
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)
        
        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)

    return losses, ws

### Linear regression - stochastic gradient descent

In [9]:
def compute_stoch_gradient(y, tx, w):
    B = 35 # size of the batch
    sum = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
        sum += compute_gradient(minibatch_y, minibatch_tx, w)

    return sum / B

In [10]:
def least_squares_SGD(y, tx, gamma, max_iters):    
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []

    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_stoch_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
    return losses, ws

### Least squares

In [11]:
def least_squares(y, tx):
    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = tx_transpose.dot(tx)
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
        
    return loss, w_opt # returns loss, and optimal weights

### Ridge regression

In [12]:
def ridge_regression(y, tx, lambda_):    
    # Initiation variables
    lamb_ = 2*len(y)*lambda_ 

    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = np.dot(tx_transpose,tx) + lamb_*np.eye(tx.shape[1])
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
    
    return loss, w_opt # returns mse, and optimal weights

### Logistic regression

In [13]:
def sigmoid(t):
    return 1/(1+np.exp(-t))

In [14]:
def learning_by_gradient_descent(y, tx, w, gamma):
    # compute the loss
    N = tx.shape[0]
    l1 = tx.dot(w) + np.log(np.ones((N))+np.exp(-tx.dot(w)))
    l2 = y*(tx.dot(w))
    loss = (np.ones((1,N)).dot(l1-l2))[0]
    
    # compute the gradient
    grad = np.transpose(tx).dot(sigmoid(tx.dot(w))-y)
    
    # update w
    w = w - gamma*grad

    return loss, w

In [15]:
def logistic_regression(y, tx, gamma, max_iters):
    print("\n\n\nlogistic_regression(y, tx, gamma, max_iters):\n")

    # init parameters
    threshold = 1e-8
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = [8000]

    # start the logistic regression
    for iter in range(max_iters):        
        # get loss and update w.
        loss, w_temp = learning_by_gradient_descent(y, tx, w_temp, gamma)
        
        # log info
        if iter % 1000 == 0:
            print("Current iteration={i}:\nLoss={l}\nw={w}\n".format(i=iter, l=loss, w=ws[-1]))
            
       
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return losses, ws

### Regularized logistic regression

### Cross-validation

In [16]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = len(y)
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    
    return np.array(k_indices)


def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train:
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    
    tr_indices = np.delete(k_indices, k, axis=0)
    x_tr = np.delete(x, k, axis=0)
    y_tr = np.delete(y, k, axis=0)
    
    # form train and test data with polynomial basis function
    poly_x_tr = build_poly(x_tr, degree)
    poly_x_test = build_poly(x_test, degree)
    
    # calcualte weight and loss through least square.
    loss_tr, weight_tr = ridge_regression(y_tr, poly_x_tr, lambda_)
    loss_test, weight_te = ridge_regression(y_test, poly_x_test, lambda_)
    
    return loss_tr, loss_te, weight_tr, weight_te

### Bias-Variance Decomposition

# Training

In [17]:
# Linear regression - gradient descent
max_iters_GD = 100
gamma_GD = 1.0e-8
loss_GD, weights_GD = least_squares_GD(y, tX_cleaned, gamma_GD, max_iters_GD)
print("loss_GD={loss}\n\n".format(loss=loss_GD))
print("weights_GD={w}".format(w=weights_GD[-1]))

loss_GD=[0.5, 0.49999999427601749, 0.49999998855203615, 0.49999998282805563, 0.49999997710407629, 0.49999997138009794, 0.4999999656561207, 0.49999995993214458, 0.49999995420816917, 0.49999994848419493, 0.49999994276022192, 0.49999993703624973, 0.49999993131227854, 0.49999992558830864, 0.49999991986433956, 0.49999991414037154, 0.49999990841640457, 0.49999990269243855, 0.49999989696847363, 0.49999989124450978, 0.49999988552054697, 0.499999879796585, 0.49999987407262447, 0.49999986834866461, 0.49999986262470586, 0.49999985690074816, 0.49999985117679147, 0.49999984545283588, 0.49999983972888118, 0.4999998340049277, 0.49999982828097517, 0.4999998225570238, 0.49999981683307332, 0.49999981110912384, 0.49999980538517524, 0.49999979966122798, 0.49999979393728167, 0.49999978821333624, 0.49999978248939198, 0.49999977676544882, 0.49999977104150656, 0.49999976531756535, 0.49999975959362525, 0.49999975386968604, 0.49999974814574799, 0.49999974242181089, 0.4999997366978749, 0.4999997309739399, 0.4999

In [18]:
# Linear regression - stochastic gradient descent
max_iters_SGD = 100
gamma_SGD = 1.0e-8
loss_SGD, weights_SGD = least_squares_SGD(y, tX_cleaned, gamma_SGD, max_iters_SGD)
print("loss_SGD={loss}\n".format(loss=loss_SGD))
print("weights_SGD={w}".format(w=weights_SGD[-1]))

loss_SGD=[0.5, 0.49999999983645765, 0.49999999967291525, 0.49999999950937302, 0.4999999993458305, 0.49999999918228827, 0.49999999901874581, 0.49999999885520358, 0.49999999869166112, 0.49999999852811883, 0.49999999836457648, 0.49999999820103391, 0.49999999803749173, 0.49999999787394933, 0.49999999771040704, 0.49999999754686464, 0.49999999738332229, 0.49999999721977983, 0.49999999705623766, 0.49999999689269531, 0.49999999672915285, 0.49999999656561062, 0.49999999640206821, 0.49999999623852587, 0.49999999607498347, 0.49999999591144123, 0.49999999574789877, 0.49999999558435654, 0.49999999542081414, 0.49999999525727173, 0.4999999950937295, 0.49999999493018704, 0.49999999476664492, 0.49999999460310246, 0.49999999443956006, 0.49999999427601782, 0.49999999411247537, 0.49999999394893313, 0.49999999378539067, 0.49999999362184855, 0.49999999345830609, 0.49999999329476391, 0.49999999313122145, 0.49999999296767911, 0.49999999280413682, 0.49999999264059453, 0.49999999247705212, 0.49999999231350989, 

In [19]:
# Least squares
loss_LeastS, weights_LeastS = least_squares(y, tX_cleaned)
print("loss_LeastS={loss}\n".format(loss=loss_LeastS))
print("weights_LeastS={w}".format(w=weights_LeastS))

loss_LeastS=0.3438151967517036

weights_LeastS=[  2.73031424e+00  -2.29747827e+02  -2.54770832e+02  -2.24815706e+01
  -5.77338234e+01   1.10828828e+02  -1.61103684e+01   3.71264149e+02
  -2.03574598e+01  -2.73887296e+05  -2.07311312e+02   5.89734344e+01
   8.75575679e+01   6.25977052e+04  -2.71874704e-01  -6.71795197e-01
   7.23979718e+04  -6.94764531e-01   1.12218635e+00   8.28082429e+01
   4.98401250e-01  -6.28121902e+01  -1.97014413e+02  -2.57065452e+02
   3.23642173e-01   2.59443567e-01  -2.93845037e+02   8.23643682e-01
  -1.05994110e+00   1.70976515e+05]


In [20]:
# Ridge regression
lambda_RR = 2
loss_RR, weights_RR = ridge_regression(y, tX_cleaned, lambda_RR)
print("loss_RR={loss}\n".format(loss=loss_RR))
print("weights_RR={w}".format(w=weights_RR))

loss_RR=0.49999873083700275

weights_RR=[ -1.42310341e-04  -2.25070869e-04  -1.43552430e-04  -3.82624819e-05
  -1.14573968e-04  -9.39628425e-05  -1.09352936e-05  -1.47588356e-04
  -1.07377301e-04  -8.41617898e-05  -1.82627085e-04   1.45050432e-04
  -1.07448053e-04  -8.02112838e-05   9.74267298e-07  -1.38182658e-06
  -1.48710861e-04   3.14544453e-06  -1.81264161e-06  -1.16941295e-04
   4.42624474e-06  -1.01521866e-04  -6.65704885e-05  -1.17474574e-04
   4.89414315e-07   1.30381096e-06  -1.52481609e-04   2.02551398e-06
  -1.43140608e-06  -4.29280668e-05]


In [21]:
# Logistic regression
max_iters_LogR = 1000
gamma_LogR = 1.0e-8
loss_LogR, weights_LogR = logistic_regression(y, tX_cleaned, gamma_LogR, max_iters_LogR)
print("loss_LogR={loss}\n".format(loss=loss_LogR))
print("weights_LogR={w}".format(w=weights_LogR[-1]))




logistic_regression(y, tx, gamma, max_iters):

Current iteration=0:
Loss=173286.79513997794
w=[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

loss_LogR=[8000, 173286.79513997794, 173286.77399203292, 173286.75284408103, 173286.73169613071, 173286.71054818176, 173286.68940023438, 173286.66825228845, 173286.64710434401, 173286.62595640111, 173286.60480845967, 173286.58366051968, 173286.56251258121, 173286.54136464425, 173286.52021670877, 173286.49906877487, 173286.47792084233, 173286.45677291133, 173286.43562498182, 173286.41447705386, 173286.39332912728, 173286.37218120223, 173286.35103327874, 173286.32988535665, 173286.30873743611, 173286.2875895171, 173286.26644159947, 173286.24529368334, 173286.22414576885, 173286.20299785567, 173286.18184994417, 173286.16070203402, 173286.13955412543, 173286.11840621824, 173286.09725831266, 173286.07611040852, 173286.05496250585, 173286.03381460466, 173286.01266670501, 173

In [27]:
## Regularized logistic regression

## Generate predictions and save ouput in csv format for submission:

In [19]:
DATA_TEST_PATH = '../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [43]:
OUTPUT_PATH = '../data/dataSubmission_RR.csv' 
y_pred = predict_labels(weights_RR, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [40]:
OUTPUT_PATH = '../data/dataSubmission_LogR.csv' 
y_pred = predict_labels(weights_LogR[1000], tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)