In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [21]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

(250000,)

In [22]:
tX.shape

(250000, 30)

## Standardize training data

In [3]:
# Don't use it! It deacrease the performance 
from helpers import *

tX_norm, mean_, std_ = standardize(tX)

## Implementation of ML methods

### Linear regression - gradient descent

In [4]:
from costs import *

def compute_gradient(y, tx, w):
    # error
    e = y - tx.dot(w)
    
    # gradient
    gradient = - np.mean(np.transpose(tx).dot(e))
    
    return gradient

In [5]:
def least_squares_GD(y, tx, gamma, max_iters):
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []
    
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)
        
        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)

    return losses, ws

### Linear regression - stochastic gradient descent

In [6]:
def compute_stoch_gradient(y, tx, w):
    B = 35 # size of the batch
    sum = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
        sum += compute_gradient(minibatch_y, minibatch_tx, w)

    return sum / B

In [7]:
def least_squares_SGD(y, tx, gamma, max_iters):    
    # Define parameters to store w and loss
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = []

    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_stoch_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp = w_temp - gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
    return losses, ws

### Least squares

In [8]:
def least_squares(y, tx):
    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = tx_transpose.dot(tx)
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
        
    return loss, w_opt # returns loss, and optimal weights

### Ridge regression

In [9]:
def ridge_regression(y, tx, lambda_):    
    # Initiation variables
    lamb_ = 2*len(y)*lambda_ 

    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = np.dot(tx_transpose,tx) + lamb_*np.eye(tx.shape[1])
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
    
    return loss, w_opt # returns mse, and optimal weights

### Logistic regression

In [10]:
def sigmoid(t):
    return 1/(1+np.exp(-t))

In [11]:
def learning_by_gradient_descent(y, tx, w, gamma):
    # compute the loss
    N = tx.shape[0]
    l1 = tx.dot(w) + np.log(np.ones((N))+np.exp(-tx.dot(w)))
    l2 = y*(tx.dot(w))
    loss = (np.ones((1,N)).dot(l1-l2))[0]
    
    # compute the gradient
    grad = np.transpose(tx).dot(sigmoid(tx.dot(w))-y)
    
    # update w
    w = w - gamma*grad

    return loss, w

In [12]:
def logistic_regression(y, tx, gamma, max_iters):
    print("\n\n\nlogistic_regression(y, tx, gamma, max_iters):\n")

    # init parameters
    threshold = 1e-8
    w_temp = np.zeros(tx.shape[1]) # initialization of the weight
    ws = [w_temp]
    losses = [8000]

    # start the logistic regression
    for iter in range(max_iters):        
        # get loss and update w.
        loss, w_temp = learning_by_gradient_descent(y, tx, w_temp, gamma)
        
        # log info
        if iter % 1000 == 0:
            print("Current iteration={i}:\nLoss={l}\nw={w}\n".format(i=iter, l=loss, w=ws[-1]))
            
       
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return losses, ws

### Regularized logistic regression

### Cross-validation

In [13]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = len(y)
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    
    return np.array(k_indices)


def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train:
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    
    tr_indices = np.delete(k_indices, k, axis=0)
    x_tr = np.delete(x, k, axis=0)
    y_tr = np.delete(y, k, axis=0)
    
    # form train and test data with polynomial basis function
    poly_x_tr = build_poly(x_tr, degree)
    poly_x_test = build_poly(x_test, degree)
    
    # calcualte weight and loss through least square.
    loss_tr, weight_tr = ridge_regression(y_tr, poly_x_tr, lambda_)
    loss_test, weight_te = ridge_regression(y_test, poly_x_test, lambda_)
    
    return loss_tr, loss_te, weight_tr, weight_te

### Bias-Variance Decomposition

# Training

In [14]:
# Linear regression - gradient descent
max_iters_GD = 100
gamma_GD = 1.0e-8
loss_GD, weights_GD = least_squares_GD(y, tX_norm, gamma_GD, max_iters_GD)
print("loss_GD={loss}\n\n".format(loss=loss_GD))
print("weights_GD={w}".format(w=weights_GD[-1]))

loss_GD=[0.5, 0.49968396075696725, 0.49937944508339982, 0.49908603280156344, 0.4988033190544382, 0.49853091374708974, 0.49826844100840534, 0.49801553867245951, 0.49777185777878874, 0.49753706209088738, 0.49731082763226198, 0.49709284223939948, 0.49688280513103777, 0.49668042649314109, 0.49648542707900722, 0.49629753782395647, 0.49611649947407044, 0.49594206222846771, 0.49577398539462209, 0.49561203705624979, 0.4954559937533064, 0.49530564017365136, 0.495160768855955, 0.49502117990343891, 0.49488668070805397, 0.49475708568471499, 0.49463221601522583, 0.494511899401543, 0.4943959698280338, 0.49428426733240455, 0.494176637784981, 0.49407293267603669, 0.49397300891087537, 0.49387672861238613, 0.49378395893079735, 0.49369457186036747, 0.49360844406275917, 0.49352545669685527, 0.49344549525477843, 0.49336844940389013, 0.49329421283455294, 0.49322268311343936, 0.49315376154219398, 0.49308735302124673, 0.49302336591859192, 0.49296171194335281, 0.49290230602395557, 0.4928450661907447, 0.4927899

In [15]:
# Linear regression - stochastic gradient descent
max_iters_SGD = 100
gamma_SGD = 1.0e-8
loss_SGD, weights_SGD = least_squares_SGD(y, tX, gamma_SGD, max_iters_SGD)
print("loss_SGD={loss}\n".format(loss=loss_SGD))
print("weights_SGD={w}".format(w=weights_SGD[-1]))

loss_SGD=[0.5, 0.4999908888560281, 0.49998178728952664, 0.49997269529042765, 0.49996361284867463, 0.49995453995422046, 0.49994547659702948, 0.49993642276707628, 0.49992737845434598, 0.49991834364883431, 0.49990931834054736, 0.49990030251950207, 0.49989129617572498, 0.49988229929925437, 0.49987331188013834, 0.49986433390843521, 0.49985536537421399, 0.4998464062675545, 0.49983745657854656, 0.49982851629729053, 0.49981958541389671, 0.49981066391848689, 0.49980175180119213, 0.49979284905215449, 0.49978395566152634, 0.49977507161947016, 0.49976619691615859, 0.49975733154177543, 0.49974847548651402, 0.49973962874057853, 0.49973079129418285, 0.49972196313755141, 0.49971314426091923, 0.49970433465453146, 0.49969553430864289, 0.49968674321351958, 0.49967796135943682, 0.49966918873668104, 0.49966042533554811, 0.49965167114634473, 0.49964292615938705, 0.49963419036500201, 0.49962546375352684, 0.4996167463153085, 0.49960803804070386, 0.49959933892008068, 0.49959064894381633, 0.49958196810229838, 0

In [16]:
# Least squares
loss_LeastS, weights_LeastS = least_squares(y, tX)
print("loss_LeastS={loss}\n".format(loss=loss_LeastS))
print("weights_LeastS={w}".format(w=weights_LeastS))

loss_LeastS=0.3394455984893298

weights_LeastS=[ -3.14664000e-01   2.93788270e-02  -2.52531475e-01  -2.54791124e-01
  -3.03696823e-02  -1.40144743e+00   2.95701642e-01  -1.07889472e+01
   2.67880862e-01  -2.44934976e-03  -3.28818528e+02  -1.82647888e-01
   1.14039627e-01   2.05045963e+01   6.38835904e+01  -3.18961906e-04
  -1.80884296e-03   6.29927695e+01  -4.48641463e-04   1.54379289e-03
   1.21462701e-01   3.95268822e-04  -6.33223472e-02  -2.06747093e-01
  -1.16655769e-01   9.86256395e-02   1.67907714e-01  -3.35146266e-02
  -2.98358689e+00  -5.36388093e+00   2.78471292e+02]


In [41]:
# Ridge regression
lambda_RR = 2
loss_RR, weights_RR = ridge_regression(y, tX, lambda_RR)
print("loss_RR={loss}\n".format(loss=loss_RR))
print("weights_RR={w}".format(w=weights_RR))

loss_RR=0.3549991006182304

weights_RR=[  2.68150720e-04  -9.13263786e-03  -1.92944215e-03  -1.91986925e-03
   5.95207215e-04   5.84831367e-04  -5.45852749e-03   7.83188539e-03
   8.66797629e-05   2.89451390e-03  -7.08966760e-03   1.69641158e-02
   3.09674014e-03   6.39685801e-03  -1.62492159e-04  -7.91331334e-04
   1.90485502e-03  -2.09337083e-04   5.51201056e-04   4.76505676e-03
   2.47039308e-04  -7.62871230e-04  -6.41114885e-03   1.48939737e-03
  -7.43123705e-04  -5.96195117e-04   2.93411957e-04   8.26006211e-04
   2.05161529e-04  -5.40700797e-03]


In [18]:
# Logistic regression
max_iters_LogR = 1000
gamma_LogR = 1.0e-8
loss_LogR, weights_LogR = logistic_regression(y, tX, gamma_LogR, max_iters_LogR)
print("loss_LogR={loss}\n".format(loss=loss_LogR))
print("weights_LogR={w}".format(w=weights_LogR[-1]))




logistic_regression(y, tx, gamma, max_iters):

Current iteration=0:
Loss=173286.79513998912
w=[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

loss_LogR=[8000, 173286.79513998912, 172483.7577171626, 171685.20036042391, 170891.0665574873, 170101.30049701728, 169315.84706532262, 168534.65184280818, 167757.66110019016, 166984.82179449184, 166216.08156482669, 165451.38872798573, 164690.69227383647, 163933.9418605449, 163181.08780963591, 162432.08110089423, 161686.8733671243, 160945.41688877461, 160207.66458843544, 159473.57002522162, 158743.08738904889, 158016.17149481032, 157292.77777646296, 156572.86228103261, 155856.38166254456, 155143.29317588598, 154433.55467060988, 153727.12458468525, 153023.96193820136, 152324.02632703132, 151627.27791646254, 150933.67743479955, 150243.18616694189, 149555.76594794809, 148871.37915658497, 148189.98870887063, 147511.55805161555, 146836.05115596426, 146163.43251094344, 14

In [27]:
## Regularized logistic regression

## Generate predictions and save ouput in csv format for submission:

In [19]:
DATA_TEST_PATH = '../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [43]:
OUTPUT_PATH = '../data/dataSubmission_RR.csv' 
y_pred = predict_labels(weights_RR, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [40]:
OUTPUT_PATH = '../data/dataSubmission_LogR.csv' 
y_pred = predict_labels(weights_LogR[1000], tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)