In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
tX.shape

(250000, 30)

## Cleaning data set

In [4]:
from helpers import *

def clean_data(tx):
    nbrRows = tx.shape[0]
    nbrColunms = tx.shape[1]
    tx_temp = np.zeros((nbrRows,nbrColunms))
    
    for columnID in range(nbrColunms):
        currentColumn = tx[:,columnID]

        # extract indices with -999 values
        nanIndices = np.where(currentColumn == -999)
        tempColumm = np.delete(currentColumn, nanIndices, axis=0)

        # replace -999 values with median
        median = np.median(tempColumm)
        currentColumn[nanIndices] = median
        
        tx_temp[:,columnID] = currentColumn
        
    return tx_temp

In [5]:
tX_cleaned = clean_data(tX)

# normalized data set
tx_stand, mean_training, std_training = standardize(tX_cleaned)

tx_stand.shape

(250000, 31)

# Implementation of ML methods

## Linear regression - gradient descent

In [12]:
from costs import *

def compute_gradient(y, tx, w):
    # error
    e = y - tx.dot(w)
    
    # gradient 
    N=y.shape[0]
    gradient = - np.transpose(tx).dot(e)/N
    
    return gradient

In [7]:
def least_squares_GD(y, tx, w_init, max_iters, gamma):
    # init parameters
    threshold = 1e-8
    ws = [w_init]
    w_temp = w_init
    losses = []
    
    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)
        
        # update w by gradient
        w_temp -= gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return ws[-1], losses[-1]

### Training

In [8]:
w_init_GD = np.zeros(tx.shape[1])
max_iters_GD = 5000
gamma_GD = 1.0e-4

weights_GD, loss_GD = least_squares_GD(y, tx_stand, w_init_GD, max_iters_GD, gamma_GD)

## Linear regression - stochastic gradient descent

In [9]:
def compute_stoch_gradient(y, tx, w):
    B = 2500 # size of the batch
    sum = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
        sum += compute_gradient(minibatch_y, minibatch_tx, w)

    return sum / B

In [10]:
def least_squares_SGD(y, tx, w_init, max_iters, gamma):    
    # init parameters
    threshold = 1e-8
    ws = [w_init]
    w_temp = w_init
    losses = []

    for n_iter in range(max_iters):
        # compute gradient and loss
        grad = compute_stoch_gradient(y, tx, w_temp)
        loss = compute_loss(y, tx, w_temp)

        # update w by gradient
        w_temp -= gamma*grad
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
    # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
        
    return ws[-1], losses[-1]

### Training

In [11]:
w_init_SGD = np.zeros(tx.shape[1])
max_iters_SGD = 1000
gamma_SGD = 1.0e-4

weights_SGD, loss_SGD = least_squares_SGD(y, tx_stand, w_init_SGD, max_iters_SGD, gamma_SGD)
print("\nweights_SGD:\n",weights_SGD,"\n")


weights_SGD:
 [ -1.25614455e-03   9.97150823e-05  -1.32815288e-03  -5.27928916e-05
   7.22658605e-04   7.66857221e-04   7.99408456e-04  -6.91391788e-04
   5.06184197e-05  -6.05347686e-05   5.73463931e-04  -7.38693028e-04
   1.02477617e-03   6.60875202e-04   8.87155547e-04  -3.60393141e-06
  -1.67007998e-05  -1.21742592e-04   5.65540330e-06   1.56312950e-05
   8.12748175e-05   2.82100202e-05   5.06601596e-04   5.00013130e-04
   4.31305971e-04   2.69747806e-07   3.38680916e-06   8.25647673e-05
   2.18994847e-06  -1.32808328e-05   5.01517082e-04] 



## Least squares

In [12]:
def least_squares(y, tx):
    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = tx_transpose.dot(tx)
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
        
    return w_opt, loss # returns loss, and optimal weights

### Training

In [13]:
weights_LeastS, loss_LeastS = least_squares(y, tx_stand)

print("\nweights_LeastS:\n",weights_LeastS,"\n")


weights_LeastS:
 [ -3.14664000e-01   1.22272262e-02  -2.53066586e-01  -2.63456197e-01
   1.29243443e-02   1.92805216e-02   1.04114629e-01   7.14391075e-03
   2.80223055e-01  -2.77613512e-02  -3.20624220e+02  -1.87675501e-01
   1.20153160e-01   7.45926202e-02   6.22887642e+01  -8.00284516e-04
  -8.11540969e-04   6.14298446e+01  -6.52146078e-04   2.55788779e-03
   1.00389486e-01   9.43326693e-04  -4.78438667e-02   5.46733536e-02
  -3.73249556e-02   5.84626551e-04   2.49121190e-04  -1.82119811e-02
   1.51617614e-03  -1.61520956e-03   2.71558175e+02] 



## Ridge regression

In [91]:
def build_poly(x, degree):
    x_poly = np.ones(np.shape(x))
    for i in range(1,degree+1):
        x_poly = np.c_[x_poly,np.power(x,i)]

    return x_poly;

"""def build_poly(x, degree):
    # Initialization variables
    poly = np.zeros(shape=(len(x),degree+1))
    
    # for set of date
    for m in range(0,len(x)):
        for j in range(0,degree+1):
            poly[m,j] = np.power(x[m],j)
    
    return poly"""

def powerize(x, degree):
    """Returns x concatenated with x ** 2, ..., x ** degree"""
    return x if degree == 1 else np.append(powerize(x, degree - 1), x ** degree, axis = 1)


In [97]:
test_poly1 = [[1,2,3],[1,2,3]]
degree = 1
t1 = build_poly(test_poly1,degree)
print(t1)
print(t1.shape)
print(t1[1])

[[ 1.  1.  1.  1.  2.  3.]
 [ 1.  1.  1.  1.  2.  3.]]
(2, 6)
[ 1.  1.  1.  1.  2.  3.]


In [99]:
def ridge_regression(y, tx, lambda_):    
    # Initiation variables
    lamb_ = 2*len(y)*lambda_
    degree = 1
    tx = build_poly(tx, degree)
    y = build_poly(y, degree)
    # Compute optimum weight
    tx_transpose = np.transpose(tx)
    A = np.dot(tx_transpose,tx) + lamb_*np.eye(tx.shape[1])
    b = tx_transpose.dot(y)
    w_opt = np.linalg.solve(A,b)
    
    print(w_opt.shape)
    # Compute loss
    loss = compute_loss(y, tx, w_opt)
    
    return w_opt, loss # returns mse, and optimal weights

### Training

In [100]:
lambda_RR = 5

weights_RR, loss_RR = ridge_regression(y, tx_stand, lambda_RR)
print("\nweights_RR:\n",weights_RR,"\n")

(62, 2)

weights_RR:
 [[  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-02  -7.49200000e-03]
 [  2.38095238e-

## Logistic regression

In [7]:
def sigmoid(t):
    temp = 1+np.exp(-t)
    return 1/(temp)

In [8]:
def learning_by_gradient_descent(y, tx, w, gamma, lambda_):
    # compute the loss
    N = tx.shape[0]
    l1 = tx.dot(w) + np.log(np.ones((N)) + np.exp(-tx.dot(w)))
    l2 = y*(tx.dot(w))
    penalization = lambda_*np.sum(np.power(w,2))
    loss = np.sum(l1-l2) + penalization
    
    # compute the gradient
    grad = np.transpose(tx).dot(sigmoid(tx.dot(w))-y) + 2*lambda_*w
    
    # update w
    w -= gamma*grad

    return loss, w

In [9]:
def logistic_regression(y, tx, w_init, max_iters, gamma):
    # init parameters
    threshold = 1e-8
    w_temp = w_init
    ws = [w_temp]
    losses = []
    
    B = 2500 # size of the batch
    for iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, B):
            # get loss and update w.
            loss, w_temp = learning_by_gradient_descent(minibatch_y, minibatch_tx, w_temp, gamma, 0)
        
            # store w and loss
            ws.append(np.copy(w_temp))
            losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return ws[-1], losses[-1] 

### Training

In [10]:
w_init_LogR = np.zeros(tx.shape[1])
max_iters_LogR = 500
gamma_LogR = 1.0e-6
method = 5

weights_LogR, loss_LogR = logistic_regression(y, tx_stand, w_init_LogR, max_iters_LogR, gamma_LogR)
print("\nweights_LogR:\n",weights_LogR,"\n")


weights_LogR:
 [ -5.71203383e+01   3.60089409e-01  -2.61930983e+01  -1.52607609e+00
   7.05848950e+00   4.49822622e+00   4.92873370e+00  -3.48625645e+00
   9.27213274e+00  -2.59156871e+00   2.51904510e+00  -1.20089815e+01
   1.55182076e+01   5.55970340e+00   9.97406999e+00  -1.06822036e-01
  -3.08794326e-01  -1.62030238e+00   3.21303217e-02   1.60721047e-01
  -3.76264205e+00   2.93163991e-01   1.65820287e+00   2.05389627e+00
   1.01793765e+00  -2.71988340e-03   5.84716972e-02  -2.34544664e+00
   7.16842755e-02  -1.06352458e-01   1.05780219e+00] 



## Regularized logistic regression

In [11]:
def reg_logistic_regression(y, tx, lambda_, w_init, max_iters, gamma):
    # init parameters
    threshold = 1e-8
    w_temp = w_init
    ws = [w_temp]
    losses = []

    # start the logistic regression
    for iter in range(max_iters):        
        # get loss and update w.
        loss, w_temp = learning_by_gradient_descent(y, tx, w_temp, gamma, lambda_)
        
        # store w and loss
        ws.append(np.copy(w_temp))
        losses.append(loss)
        
        # converge criteria
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return ws[-1], losses[-1]

### Training

In [12]:
w_init_RLogR = np.zeros(tx.shape[1])
lambda_RLogR = 2
max_iters_RLogR = 500
gamma_RLogR = 1.0e-8

weights_RLogR, loss_RLogR = reg_logistic_regression(y, tx_stand, lambda_RLogR, w_init_RLogR, max_iters_RLogR, gamma_RLogR)
print("\nweights_RLogR:\n",weights_RLogR,"\n")


weights_RLogR:
 [ -8.87363844e-01   2.33289146e-02  -3.23104890e-01  -1.08212377e-02
   1.24384373e-01   1.54545187e-01   1.51458461e-01  -1.32593818e-01
   6.21212881e-02  -4.28607043e-02   7.96062498e-02  -1.78485966e-01
   2.23050317e-01   1.42699610e-01   1.96549318e-01  -1.35876693e-03
  -4.65627158e-03  -3.41985829e-02   5.75460347e-04   4.12388662e-03
  -1.71403322e-02   6.26476528e-03   6.36197412e-02   6.83834733e-02
   5.23010331e-02   8.47420324e-06   5.95999971e-04  -2.01911815e-02
   9.29715304e-04  -2.74421555e-03   5.67301324e-02] 



# Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
tx_test_stand, mean_test, std_test = standardize(tX_test, mean_training, std_training)

In [None]:
print(tx_test_stand.shape)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_GD.csv' 
y_pred_GD = predict_labels(weights_GD, tx_test_stand)
create_csv_submission(ids_test, y_pred_GD, OUTPUT_PATH)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_SGD.csv' 
y_pred_SGD = predict_labels(weights_SGD, tx_test_stand)
create_csv_submission(ids_test, y_pred_SGD, OUTPUT_PATH)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_LS.csv' 
y_pred_LS = predict_labels(weights_LeastS, tx_test_stand)
create_csv_submission(ids_test, y_pred_LS, OUTPUT_PATH)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_RR.csv' 
y_pred_RR = predict_labels(weights_RR, tx_test_stand)
create_csv_submission(ids_test, y_pred_RR, OUTPUT_PATH)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_LogR.csv' 
y_pred_LogR = predict_labels(weights_LogR, tx_test_stand)
create_csv_submission(ids_test, y_pred_LogR, OUTPUT_PATH)

In [None]:
OUTPUT_PATH = '../data/dataSubmission_RLogR.csv' 
y_pred_RLogR = predict_labels(weights_RLogR_RLogR, tx_test_stand)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)