In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [12]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
y.shape

(250000,)

# Data preprocessing

There are 30 features, 250'000 samples, column 22 is categorical

## Split data set into training/validation set

In [13]:
from data_preprocessing import *
y_tr, x_tr, y_te, x_te = split_data(tX, y, 0.8, seed=20)

In [14]:
len(y_tr)

200000

**Categorical handling**

In [15]:
undefined_features = [[4, 5, 6, 12, 22, 23, 24, 25, 26,
                       27, 28, 29], [4, 5, 6, 12, 22, 26, 27, 28], [22], [22]]
PRI_jet_num = 22

Get jet sets for training data

In [16]:
jet_train, y_jet_train, index = get_jets(x_tr, y_tr, PRI_jet_num, undefined_features, list_ = True)

jet tot size: 200000  y tot size:  200000


Get jet sets for validation set

In [17]:
jet_test, y_jet_test, index_te = get_jets(x_te, y_te, PRI_jet_num, undefined_features, list_ = True)

jet tot size: 50000  y tot size:  50000


http://opendata.cern.ch/record/328, https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

**Removing other -999 (non defined) entries, replacing them and standardizing data**

Deal with nan !!
https://towardsdatascience.com/working-with-missing-data-in-machine-learning-9c0a430df4ce

In [18]:
x_tr = standard(x_tr)
x_tr.shape

(200000, 30)

In [19]:
check1 = 0
check2 = 0

for jet in jet_train:
    jet_ = standard(jet)
    jet = jet_
    check1 += len(jet_)
for jet in jet_test:
    jet_ = standard(jet)
    jet = jet_
    check2 += len(jet_)

print(check1, check2, check1 + check2)


79792
62185
40362
17661
20121
15359
10017
4503
200000 50000 250000


In [23]:
np.argwhere(np.isnan(jet_test[2]))

array([], shape=(0, 2), dtype=int64)

## Different splitting are possible
- simple splitting
- **k-folds** for each method. k = 10 ?
- train / test / validation set ? -> Do we keep a test set
- train with entire set ?

![How to proceed](train_test_valid_split_process_matrix.png)


The process can be summarised as follows:

1) Separate out from the data a final holdout testing set (perhaps something like ~10% if we have a good amount of data).

2) Shuffle the remaining data randomly.

3) Split this data into k equally sized sets/folds.

4) For each unique fold:
- 4.1 Use this fold as the validation fold
- 4.2 Combine the other k-1 folds as the training data
- 4.3 Fit the model with the training data
- 4.4 Evaluate the model with the validation fold
- 4.5 Keep the evaluation scores, discard the model and begin again at 4.1 with a new validation fold

5) Evaluate your model against the whole set of k validation scores, and if you are unhappy make adjustments and repeat from 1.

When you are finally happy, combine all k folds into one complete training data set, train again, and perform a final test on the holdout testing set.

https://algotrading101.com/learn/train-test-split-2/

**Questions**:

- Do I have to split first my data into test / training set and perform cross validation on training test...?
- Do I have to optimize for gamma ? NO
- Which k to choose for k folds cross validation?
-- Common value is k = 10. For statistical reasons (https://machinelearningmastery.com/k-fold-cross-validation/#:~:text=When%20a%20specific%20value%20for,learning%20model%20on%20unseen%20data.)

## Do your thing crazy machine learning thing here :) ...

### 1) Basic least_squares Gradient descent:
- don't forget to optimize
- for gamma: The most commonly used rates are : 0.001, 0.003, 0.01, 0.03, 0.1, 0.3. for degree 1 (https://towardsdatascience.com/gradient-descent-algorithm-and-its-variants-10f652806a3)
- across k-folds -> 10-folds (for both cases)
- for degrees (to test) across different gammas. Does it make any sense...?
- across seeds...?

### For entire standardized set

Bad idea

## For all jets

**Perform least_squares GD for degree 1**

In [24]:
from implementations import least_squares_GD
from utils import build_poly

y_pred_list = []
w_list = []
rmse_list = []
degree = 1

for (jet, y_jet) in zip(jet_train, y_jet_train):
    
    x_augm = build_poly(jet,1)
    initial_w = np.zeros((x_augm.shape[1],))
    rmse, w = least_squares_GD(y_jet, x_augm, initial_w, 5000, 0.000001)
    print(rmse)
    w_list.append(w)
    rmse_list.append(rmse)

for (jet_te, w) in zip(jet_test, w_list):
    x_augm = build_poly(jet_te, degree)
    y_pred = x_augm.dot(w)
    
    y_pred_list.append(y_pred)

0.7734318646156526
0.8940274323040908
0.8776259621297081
0.8703956788129524


**Compute accuracy on test set**

In [25]:
from proj1_helpers import combine_jets
from utils import accuracy_2

y_predict = combine_jets(y_pred_list, index_te)
accuracy_2(y_te, y_predict)

0.74302

final accuracy = 0.74302


### Submission file

#### Load data

In [104]:
DATA_TEST_PATH = 'test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

**Load jets**

In [105]:
from proj1_helpers import get_jets_final
jet_final, ind_list = get_jets_final(tX_test, 22, undefined_features)

**Compute predictions**

In [106]:
final_jet = []

for (jet, w) in zip(jet_final, w_list):
    x_augm = build_poly(jet, 1)
    y_pred = x_augm.dot(w)
    final_jet.append(y_pred)

**Compute final y**

In [107]:
y_final = combine_jets(final_jet, ind_list)

In [109]:
print(len(y_final), tX_test.shape[0])
Y = predict_labels_2(y_final)
final_check = np.abs(Y)
print(np.sum(final_check), len(y_final))

568238 568238
568238.0 568238


**Create submission file**

In [110]:
from utils import predict_labels_2
Y = predict_labels_2(y_final)
OUTPUT_PATH = 'GD_deg1.csv'
create_csv_submission(ids_test, Y, OUTPUT_PATH)

VErdict: accuracy = 0.743, F1 score = 	0.566
SO test set is quite accurate :)

### 2) Ridge regression

- optimize over lambda for degree 1
- select best_degree over all lambdas

**2.1) Grid search: Optimize for degree** across lambda (few) range, seed = 2020

In [None]:
from cross_validation_phi import select_best_degree_ridge
from implementations import ridge_regression

best_degs = []
best_lbds = []
rmse_tot_te = []
w_list = []
y_pred_list = []

for (jet, y_jet) in zip(jet_train, y_jet_train):
    best_degree, rmse_te, best_lambda = select_best_degree_ridge(y_jet, jet, seed = 200, k_fold = 10)
    best_degs.append(best_degree)
    best_lbds.append(best_lambda)
    rmse_tot_te.append(rmse_te)
    
    x_augm = build_poly(jet, best_degree)
    rmse_tr, w = ridge_regression(y_jet, x_augm, best_lambda)
    w_list.append(w)
    
for(jet, w, deg) in zip(jet_test, w_list, best_degs):
    x_augm_te = build_poly(jet, deg)
    y_pred = x_augm_te.dot(w)
    y_pred_list.append(y_pred)

Best degree =1, loss for k-folds cross validation=0.7391695016746421, best lambda=1.0


In [130]:
y_predict = combine_jets(y_pred_list, index_te)
print(accuracy_2(y_te, y_predict))

0.7903


accuracy = 0.7903

## Generate submission file:

### Load test data for final submission

In [None]:
DATA_TEST_PATH = 'test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

### Split test set into jets and get prediction

In [121]:
from proj1_helpers import get_jets_final
jet_final, ind_list = get_jets_final(tX_test, 22, undefined_features)

In [123]:
final_jet = []

for (jet, w, deg) in zip(jet_final, w_list, best_degs):
    x_augm = build_poly(jet, deg )
    y_pred = x_augm.dot(w)
    final_jet.append(y_pred)

**Compute predictions**

In [124]:
y_final = combine_jets(final_jet, ind_list)
print(len(y_final), tX_test.shape[0])
Y = predict_labels_2(y_final)
final_check = np.abs(Y)
print(np.sum(final_check), len(y_final))

568238 568238
568238.0 568238


**Create submission file**

In [126]:
OUTPUT_PATH = 'Ridge.csv'
create_csv_submission(ids_test, Y, OUTPUT_PATH)

accuracy = 0.790, Fscore = 0.662