# Finding hyperparametres

### load data

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
y.shape

(250000,)

## 1) Preprocessing step


In [3]:
from data_preprocessing import *
y_tr, x_tr, y_te, x_te = split_data(tX, y, 0.8, seed=20)

In [4]:
undefined_features = [[4, 5, 6, 12, 22, 23, 24, 25, 26,
                       27, 28, 29], [4, 5, 6, 12, 22, 26, 27, 28], [22], [22]]
PRI_jet_num = 22

In [5]:
jet_train, y_jet_train, index = get_jets(x_tr, y_tr, PRI_jet_num, undefined_features, list_ = True)

jet tot size: 200000  y tot size:  200000


In [6]:
jet_test, y_jet_test, index_te = get_jets(x_te, y_te, PRI_jet_num, undefined_features, list_ = True)

jet tot size: 50000  y tot size:  50000


## Standardization / Normalization / removal of correlated features

aller voir dans data_preprocessing
la function **preprocessing_data peut**:
    - normaliser (mettre en 0 et 1 -> ce qui évite la divergence de logistic regression),
    - standardiser (zero mean zero variance)
    - enlever les colonnes corrélées
    -> le faire sur chaque jet et utiliser jet_str_tr and jet_str_te a la place

In [7]:
from data_preprocessing import preprocessing_data

In [8]:
jet_std_tr = []
jet_std_te = []
for jet in jet_train:
    jet_std_tr.append(preprocessing_data(jet, True, False, False))
    
for jet in jet_test:
    jet_std_te.append(preprocessing_data(jet, True, False, False))
    

## Ordre des méthodes

- 1: least squares
- 2: least squqres GD
- 3: LS SGD
- 4: ridge
- 5: log
- 6: regularized log

## 2) Test logistic regression reussi

A tester sur différent degrés et gammas, avec différents types de data preprocessing
degree -> select best degree
w_opt -> utiliser best_w

**Test juste pour voir si ça marche**

In [23]:
from implementations import logistic_regression
from utils import build_poly

loss_tot_te = []
w_list = []

for (jet, y_jet) in zip(jet_std_tr, y_jet_train):
    jet_augm = build_poly(jet, 5)
    w_init = np.zeros((jet_augm.shape[1],))
    w, loss = logistic_regression(y_jet, jet_augm, w_init, 1000, 0.001)
    w_list.append(w)
    loss_tot_te.append(loss)
   
    

**Trouver l'hyperparamètre optimal**

In [13]:
from cross_validation import select_best_gamma
from run import best_w
from utils import build_poly

best_degs = []
best_gammas = []
rmse_tot_te = []
w_list = []

for (jet, y_jet) in zip(jet_std_tr, y_jet_train):
    best_degree, rmse_te, best_gamma, rmse_plot = select_best_gamma(y_jet, jet, 5, k_fold = 5, seed = 200, degrees = np.arange(5,10,1), gammas = np.logspace(-20,-5,1) )
    best_degs.append(best_degree)
    best_gammas.append(best_gamma)
    rmse_tot_te.append(rmse_te)
    #rmse_plot_list.append(rmse_plot)
    
    x_augm = build_poly(jet, best_degree)
    w = best_w(y_jet , jet , 5, 0 , best_degree, best_gamma)
    w_list.append(w)

Best degree =5, loss for k-folds cross validation=0.6931471805599383, best gamma=1e-20
Best degree =5, loss for k-folds cross validation=0.6931471805599427, best gamma=1e-20
Best degree =5, loss for k-folds cross validation=0.6931471805599461, best gamma=1e-20
Best degree =5, loss for k-folds cross validation=0.6931471805599438, best gamma=1e-20


**Compute predictions**

In [18]:
y_pred_list = []
for(jet, w, deg) in zip(jet_std_te, w_list, best_degs):
    x_augm_te = build_poly(jet, deg)
    y_pred = x_augm_te.dot(w)
    y_pred_list.append(y_pred)

**Compute accuracy**

In [27]:
from utils import accuracy_2
from proj1_helpers import combine_jets
y_predict = combine_jets(y_pred_list, index_te)
print(y_predict.shape)
print(accuracy_2(y_te, y_predict))

(50000,)
0.65798


## 3) Test penalized logistic regression

In [None]:
from implementations import reg_logistic_regression
from utils import build_poly, sigmoid

lambda_ = 10
gamma = 0.0001
max_iters = 600
loss_tot_te = []
w_list = []

for (jet, y) in zip(jet_normalized_tr, y_jet_train):
    jetaugm = build_poly(jet, 1)
    w_init = np.zeros(jetaugm.shape[1],)
    w, loss = reg_logistic_regression(y, jetaugm, w_init, gamma, max_iters, lambda_)
    loss_tot_te.append(loss)
    w_list.append(w)


In [None]:
y_pred_list = []
for(jet, w) in zip(jet_normalized_te, w_list):
    x_augm_te = build_poly(jet, 1)
    y_pred = x_augm_te.dot(w)
    y_pred_list.append(y_pred)

In [None]:
from utils import accuracy_2
y_predict = combine_jets(y_pred_list, index_te)
print(y_predict.shape)
print(accuracy_2(y_te, y_predict))

## 4) Ridge regression

- optimize over lambda for degree 1
- select best_degree over all lambdas

Grid search: Optimize for degree** raw data, degrees = np.arange(1,5,1), lambdas = np.logspace(-10,0,1) -> accuracy = 0.79264. Le lancer avec ces paramètres

In [None]:
from cross_validation_R_LSGD import select_best_degree_ridge
from run import best_w
from utils import build_poly

best_degs = []
best_lbds = []
rmse_tot_te = []
w_list = []
y_pred_list = []
rmse_plot_list=[]

for (jet, y_jet) in zip(jet_std_tr, y_jet_train):
    best_degree, rmse_te, best_lambda, rmse_plot = select_best_degree_ridge(y_jet, jet, seed = 200, k_fold = 10)
    best_degs.append(best_degree)
    best_lbds.append(best_lambda)
    rmse_tot_te.append(rmse_te)
    rmse_plot_list.append(rmse_plot)
    
    x_augm = build_poly(jet, best_degree)
    w = best_w(y_jet , jet , 4, best_lambda , best_degree)
    w_list.append(w)

In [None]:
y_pred_list = []
for(jet, w, deg) in zip(jet_std_te, w_list, best_degs):
    x_augm_te = build_poly(jet, deg)
    y_pred = x_augm_te.dot(w)
    y_pred_list.append(y_pred)

In [None]:
from utils import accuracy_2
from proj1_helpers import combine_jets
y_predict = combine_jets(y_pred_list, index_te)
print(accuracy_2(y_te, y_predict))