In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implementations import least_squares_GD, least_squares_SGD, least_squares, ridge_regression, batch_iter, ridge_GD, ridge_SGD
from preprocessing import standardize_train, standardize_test, add_bias
from losses_gradients import compute_loss_ls, compute_gradient_least_squares, compute_loss_ridge, compute_gradient_ridge
from plots import plot_train_test
import math

## Load data

In [9]:
y_train, input_train, ids_train = load_csv_data('train.csv', sub_sample=False)
y_test, input_test, ids_test = load_csv_data('test.csv', sub_sample=False)

## Pre-processing

In [10]:
# IDEE: on pourrait ausi tenter build_polynomial comme dans les séries si on est motivés

In [11]:
# Standardise data
# Careful to standardize the x_test with the mean and std of x_train
x_train, mean, std = standardize_train(input_train)
x_train = add_bias(x_train)
x_test = standardize_test(input_test, mean, std)
x_test = add_bias(x_test)

## Methods to implement 

In [12]:
# Define the parameters of the algorithm. (probably change afterwards)
max_iters = 200                                          
gamma = 0.1
batch_size = 1

# Initialization
w_initial = np.random.rand(x_train.shape[1])

In [37]:
# For grid search of hyperparameters
num_intervals = 25
gammas = np.linspace(0, 0.90, num_intervals)
lambdas = np.logspace(-4, -0.05, 15)

## Least square 

In [34]:
def ls_gd_hyperparam(gammas):
    loss_test = []
    loss_train = []
    for gamma in gammas:
        w, loss_tr = least_squares_GD(y_train, x_train, w_initial, max_iters, gamma)
        loss_train.append(loss_tr)
        loss_test = compute_loss_ls(y_test, x_test, w)
    return loss_test, loss_train 

In [35]:
def ls_sgd_hyperparam(gammas):
    loss_test = []
    loss_train = []
    for gamma in gammas:
        w, loss_tr = least_squares_SGD(y_train, x_train, w_initial, max_iters, gamma)
        loss_train.append(loss_tr)
        loss_test = compute_loss_ls(y_test, x_test, w)
    return loss_test, loss_train 

In [None]:
loss_test_gd, loss_train_gd = ls_gd_hyperparam(gammas)

In [None]:
plot_train_test(loss_train_gd, loss_test_gd, gammas, "Least squares GD")

In [None]:
# Minimum values for ls_gd
idx = np.argmin(loss_test_gd)
learning_rate = gammas[idx]
ls_gd_loss = min(loss_test_gd)

In [None]:
loss_test_sgd, loss_train_sgd = ls_sgd_hyperparam(gammas)

In [None]:
plot_train_test(loss_train_sgd, loss_test_sgd, gammas, "Least squares SGD")

In [None]:
# Minimum values for ls_sgd
idx = np.argmin(loss_test_sgd)
learning_rate = gammas[idx]
ls_sgd_loss = min(loss_test_sgd)

## Ridge regression

In [30]:
def ridge_sgd_hyperparam(gammas, lambdas):
    loss_train = np.zeros((len(gammas), len(lamdas)))
    loss_test = np.zeros((len(gammas), len(lamdas)))
    for i, gamma in enumerate(gammas):
        for j, lambda_ in enumerate(lambdas):
            w, loss_gamma = ridge_SGD(y_train, x_train, w_initial, max_iters, gamma, lambda_)
            loss_train[i, j] = loss_gamma
            loss_test[i, j] = compute_loss_ridge(y_test, x_test, w, lambda_)
    #min_gamma, min_lambda = np.unravel_index(np.argmin(loss_lamb), loss_lamb.shape)
    return loss_train, loss_test #loss_lamb[min_gamma, min_lambda], gammas[min_gamma], lambdas[min_lambda]

In [31]:
def ridge_gd_hyperparam(gammas, lambdas):
    loss_train = np.zeros((len(gammas), len(lamdas)))
    loss_test = np.zeros((len(gammas), len(lamdas)))
    for i, gamma in enumerate(gammas):
        for j, lambda_ in enumerate(lambdas):
            w, loss_gamma = ridge_GD(y_train, x_train, w_initial, max_iters, gamma, lambda_)
            loss_train[i, j] = loss_gamma
            loss_test[i, j] = compute_loss_ridge(y_test, x_test, w, lambda_)
    return loss_train, loss_test

In [None]:
loss_test_gd, loss_train_gd = ridge_gd_hyperparam(gammas, lambdas)

In [None]:
#TODO plot 2D

In [None]:
# Minimum values for ridge_gd
#(i,j) = np.argmin(loss_test_gd)
#learning_rate = gammas[i]
#best_lambda = lambdas[j]
ridge_gd_loss = min(loss_test_gd)

In [None]:
loss_test_sgd, loss_train_sgd = ridge_sgd_hyperparam(gammas, lambdas)

## Logistic regression
Debugged but not sure from here: Il y aura probablement une correction du labo 5 pour améliorer / vérifier les fonctions de logistic regression

### Il y a un NaN dans la loss à cause d'un outlier. Je sais pas si on choisis d'enlever ou comment traiter. On verra.

In [228]:
def sigmoid(tx, w):
    """Compute sigmoid function"""
    z = np.array(np.exp(-tx.dot(w)))
    return 1./(1 + z)

value = sigmoid(x_train, w_initial)
print(value)

[0.99981039 0.90820056 0.92786083 ... 0.28142199 0.00709906 0.04907172]


In [226]:
def compute_logreg_loss(y, tx, w):  #np.log parce que math.log fonctionne pas.. J'ai toujours pas compris pourquoi.
    """Compute error and gradient of logistic regression"""
    sig = sigmoid(tx, w)
    loss = np.sum((-y * np.log(sig) - (1-y) * np.log(1-sig)), axis = -1)/len(y)
    print(max(sig))
    #a = (-y * np.log(sig) - (1-y) * np.log(1-sig))
    #print(max(a))
    return loss

In [227]:
loss = compute_logreg_loss(y_train, x_train, w_initial)
print("loss is {}".format(loss))

[0.51459227 0.10073464 0.53488351 0.42669814 0.02003378 0.140064
 0.2264286  0.072293   0.49416143 0.6004996  0.14155126 0.00350222
 0.81835964 0.2093738  0.7436304  0.88667924 0.88977018 0.11193385
 0.70287969 0.70028424 0.34716633 0.66417114 0.94782936 0.73171242
 0.5700489  0.53273724 0.82729083 0.43374625 0.09599914 0.22535177
 0.38780099]
1.0
loss is nan


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [119]:
def compute_logreg_grad(y, tx, w):
    """Compute error and gradient of logistic regression""" 
    sig = sigmoid(tx, w)
    err  = sig - y
    grad = tx.T.dot(err)/len(y)
    return grad

In [None]:
grad_lr = compute_logreg_grad(y_train, x_train, w_initial)
print("grad is {} \n of shape {}".format(grad_lr, grad_lr.shape))

In [None]:
gamma = 0.01
def logistic_regression(y, tx, initial_w, max_iters, gamma): #SGD  (GD easy to implement from here)
    """Stochastic Gradient Descent algorithm with logistic regression."""
    """Required by project description"""
    w = initial_w

    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=batch_size, num_batches=1):
            # compute a stochastic gradient
            grad = compute_logreg_grad(y_batch, tx_batch, w)
            # update w through the stochastic gradient update
            w = w - gamma * grad
            # compute a stochastic loss
            compute_logreg_loss(y_batch, tx_batch, w)
    return w, loss

In [None]:
w_lr, loss_lr = logistic_regression(y_train, x_train, w_initial, max_iters, gamma)
print("w is {} \n of shape {}".format(w_lr, w_lr.shape))
print("\n loss of ridge_SGD is {}".format(loss_lr))

## Regularized Logistic Regression

In [None]:
def compute_logreg_reg_loss(y, tx, w, lambda_):
    """Compute error and gradient of logistic regression"""
    reg = ( lambda_/(2*len(y)) ) * sum(w**2)
    loss = compute_logreg_loss(y, tx, w) + reg
    return loss

In [None]:
loss_lrr = compute_logreg_reg_loss(y_train, x_train, w_initial, lambda_)
print("loss is {}".format(loss_lrr))

In [None]:
def compute_logreg_reg_grad(y, tx, w, lambda_):
    """Compute error and gradient of logistic regression"""     
    grad = compute_logreg_grad(y, tx, w) 
    reg = (lambda_/len(y)) * w[1:]
    grad[1:] = grad[1:] + reg            
    return grad

In [None]:
grad_lrr = compute_logreg_reg_grad(y_train, x_train, w_initial, lambda_)
print("grad is {} \n of shape {}".format(grad_lrr, grad_lrr.shape))

In [None]:
###########
# La loss de ridge SGD me semble particulièrement haute. A vérifier avec les plots (et/ou code).
gamma = 0.01
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Stochastic Gradient Descent algorithm with REGULARIZED logistic regression."""
    """Required by project description"""
    w = initial_w

    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=batch_size, num_batches=1):
            # compute a stochastic gradient
            grad = compute_logreg_reg_grad(y_batch, tx_batch, w, lambda_)
            # update w through the stochastic gradient update
            w = w - gamma * grad
            # compute a stochastic loss
            loss = compute_logreg_reg_loss(y_batch, tx_batch, w, lambda_)
    return w_rlr_sgd, loss_rlr_sgd

In [None]:
w_rlr_sgd, loss_rlr_sgd = reg_logistic_regression(y_train, x_train, lambda_, w_initial, max_iters, gamma)
print("w is {} \n of shape {}".format(w_rlr_sgd, w_rlr_sgd.shape))
print("\n loss of ridge_SGD is {}".format(loss_rlr_sgd))

## Choisir méthode:
(D'après moi)
- Pour chaque méthode: faire un plot de test_error (=loss) pour un grand nombre d'iteration (assez pour que on voit que l'erreur commence à remonter si possible).  Il faudrait idealement voir la courbe descendre et monter dans le graphe (underfit à overfit)
- Garder la valeur de n iter ou la test error était minimale (au cas où: risque d'être très long pour ridge si lambda_ trop grand)
- Comparer les loss de chaque méthode à leur n_iter optimale
- La meilleure est celle ou c'est le plus faible

En plus comme ça on aura des beau plots et des arguments pour le rapport.

# Prediction de chaque méthode

In [None]:
#y_pred = predict_labels(weights, data)