# PROJECT 1: Higgs Boson

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Library only used for observing the data in a more visual way.
import pandas as pd

In [60]:
# Imports our methods
import methods as md

from helpers import *

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [62]:
tX_cleaned, medians, means, stds = md.clean_data(tX)
tX_cleaned.shape

ValueError: too many values to unpack (expected 4)

In [8]:
# normalized data set
tX_stand, mean_training, std_training = standardize(tX_cleaned)
tX_stand.shape

(250000, 31)

# Prediction

In [10]:
def split_data(y, x, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    
    # set mask
    ratio = 0.7
    msk = np.random.rand(len(y)) < ratio
    
    # training data set
    x_tr = x[msk]
    y_tr = y[msk]
    
    # test data set
    x_test = x[~msk]
    y_test = y[~msk]
    
    return x_tr, x_test, y_tr, y_test

In [14]:
def prediction(y, tX, gamma, lambda_, max_iters, method):
    # split data
    x_tr, x_test, y_tr, y_test = split_data(y, tX)
    
    # training
    loss = 0
    weights = []
    if method == 1:
        loss, weights = md.least_squares_GD(y_tr, x_tr, gamma, max_iters)
    elif method == 2:
        loss, weights = md.least_squares_SGD(y_tr, x_tr, gamma, max_iters)
    elif method == 3:
        loss, weights = md.least_squares(y_tr, x_tr)
    elif method == 4:
        loss, weights = md.ridge_regression(y_tr, x_tr, lambda_)
    elif method == 5:
        loss, weights = md.logistic_regression(y_tr, x_tr, gamma, max_iters)
    else:
        loss, weights = md.reg_logistic_regression(y_tr, x_tr, lambda_, gamma, max_iters)
        
    # compute prediction
    y_pred = predict_labels(weights, x_test)    
    
    # accuracy of the prediction
    N = y_test.shape[0]
    pred = np.sum(y_pred == y_test)/N
        
    return pred

# Training and testing in local

In [9]:
N = y.shape[0]

y1 = y[:N/2].copy()
y2 = y[N/2:].copy()

tX1 = tX_cleaned[:N/2, :].copy()
tX2 = tX_cleaned[N/2:, :].copy()

w0 = np.ones([1,N/2])
tX1 = np.insert(tX1, 0, w0, axis=1)
tX2 = np.insert(tX2, 0, w0, axis=1)

ids1 = ids[:N/2].copy()
ids2 = ids[N/2:].copy()

  app.launch_new_instance()
  a = empty(shape, dtype, order)


### Linear regression - gradient descent

In [15]:
max_iters_GD_test = 2000
gamma_GD_test = 1.0e-8
method = 1

loss_GD_test, weights_GD_test = md.least_squares_GD(y1, tX1, gamma_GD_test, max_iters_GD_test)
pred_GD_test = prediction(y1, tX1, gamma_GD_test, 0, max_iters_GD_test, method)

print("\nweights_GD:\n", weights_GD_test,"\n")
print("pred_GD = ", pred_GD_test)


weights_GD:
 [ -3.54470631e-06  -3.77995796e-04  -4.07834465e-04  -2.84403949e-04
   7.29105520e-05  -4.05523964e-06   3.90119510e-05  -5.58156576e-06
  -8.41863412e-06  -6.39807247e-05  -1.19121662e-04  -8.12715948e-06
   6.99826358e-06  -8.97777865e-07  -2.55889995e-05  -2.22441277e-08
  -9.61758493e-08  -1.67152057e-04   1.53747843e-07  -8.08538501e-08
  -1.16166926e-04   3.03126011e-07  -3.03064005e-04  -4.06487739e-07
  -1.32458131e-04  -7.73419375e-08   9.22489231e-08  -1.61670240e-04
   6.52819501e-08  -8.90173351e-08   7.36194363e-05] 

pred_GD =  0.657165668663


In [16]:
y_pred_GD_test = predict_labels(weights_GD_test, tX2)
n_correct_GD = np.count_nonzero((y_pred_GD_test - y2) == 0)
perc_correct_GD = n_correct_GD/(N/2) * 100
perc_correct_GD

65.6112

### Linear regression - stochastic gradient descent

In [17]:
max_iters_SGD_test = 500
gamma_SGD_test = 1.0e-8
method = 2

loss_SGD_test, weights_SGD_test = md.least_squares_SGD(y1, tX1, gamma_SGD_test, max_iters_SGD_test)
print("\nweights_SGD:\n", weights_SGD_test,"\n")

pred_SGD_test = prediction(y1, tX1, gamma_SGD_test, 0, max_iters_SGD_test, method)
print("pred_SGD = ", pred_SGD_test)


weights_SGD:
 [ -2.67098353e-05  -4.31735155e-04  -4.64078907e-03  -6.23668959e-04
   1.60196097e-03  -1.78633798e-05   6.00538469e-04  -6.07985820e-05
  -2.96660350e-05  -6.34888970e-04   5.29729827e-04  -9.51717238e-05
   1.07362097e-04  -8.24166309e-07   1.17033232e-03  -1.48473365e-06
  -4.31060619e-06  -1.02153542e-03   1.70310500e-06   9.65104093e-07
  -9.09251607e-04   4.43525146e-06  -6.00991490e-04  -9.96426647e-07
  -9.30440833e-04  -6.11963086e-07   4.41518864e-07  -1.47041694e-03
   5.33445006e-07  -1.70304414e-06   3.80933875e-04] 

pred_SGD =  0.688063872255


In [19]:
y_pred_SGD_test = predict_labels(weights_SGD_test, tX2)
n_correct_SGD = np.count_nonzero((y_pred_SGD_test - y2) == 0)
perc_correct_SGD = n_correct_SGD/(N/2) * 100
perc_correct_SGD

68.976

### Least squares

In [20]:
method = 3

loss_LeastS_test, weights_LeastS_test = md.least_squares(y1, tX1)
print("\nweights_LeastS:\n", weights_LeastS_test,"\n")

pred_LeastS_test = prediction(y1, tX1, 0, 0, 0, method)
print("pred_LeastS = ", pred_LeastS_test)


weights_LeastS:
 [ -1.16074033e+00   1.44296347e-04  -7.13890244e-03  -6.35632610e-03
   1.18983305e-04   1.66537567e-02   4.56475343e-04   2.70567148e-03
   3.54887909e-01  -1.19904231e-03  -4.22515836e+00  -2.21232683e-01
   1.02368577e-01   3.52579962e-01   4.23358426e+00  -3.27685587e-03
  -1.63291643e-03   4.23825620e+00   1.63545099e-03   2.70568868e-04
   3.04571909e-03  -3.81839093e-04  -3.77901455e-04   4.71385100e-02
  -8.23199214e-04   1.15058943e-03  -2.02148685e-03  -1.23809562e-03
  -3.37838817e-04  -3.15646992e-03   4.22484938e+00] 

pred_LeastS =  0.747624750499


In [21]:
y_pred_LS_test = predict_labels(weights_LeastS_test, tX2)
n_correct_LS = np.count_nonzero((y_pred_LS_test - y2) == 0)
perc_correct_LS = n_correct_LS/(N/2) * 100
perc_correct_LS

74.4256

### Ridge regression

In [25]:
degree_RR_test = 12

tX1_poly = md.build_poly(tX1, degree_RR_test)
tX2_poly = md.build_poly(tX2, degree_RR_test)

lambda_RR_test = 5
method = 4

loss_RR_test, weights_RR_test = md.ridge_regression(y1, tX1_poly, lambda_RR_test)
print("\nweights_RR:\n", weights_RR_test,"\n")

pred_RR_test = prediction(y1, tX1_poly, 0, lambda_RR_test, 0, method)
print("pred_RR for degree ", str(degree_RR_test), " = ", pred_RR_test)


weights_RR:
 [ -3.86969953e+08   1.50486309e+10  -1.30113548e+08   2.35420823e+09
  -1.41596648e+10  -3.01106458e+09   1.45020186e+09  -1.16522803e+09
  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05
  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05
  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05
  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05  -1.56390486e-05
  -1.56395411e-05  -1.56395411e-05  -1.56395411e-05  -1.56395411e-05
  -1.56395411e-05  -1.56395411e-05  -1.56395411e-05  -1.56395411e-05
  -2.43006108e-05  -1.01446704e-03  -8.71629450e-04  -2.72787078e-03
  -6.37547391e-05  -1.90323208e-03  -5.98123167e-04   3.98496044e-04
   2.69121805e-04  -3.58214457e-04  -6.39801866e-05   8.66204293e-04
   7.44524620e-04  -5.81688437e-04  -1.52705165e-04  -1.11487588e-04
  -3.12430511e-04   5.23437497e-05   6.86315054e-05  -3.27494985e-03
   6.49369349e-05  -7.92239809e-04  -4.60714929e-05  -1.19120665e-04
  -9.97277553e-05   

In [26]:
y_pred_RR_test = predict_labels(weights_RR_test, tX2_poly)
n_correct_RR = np.count_nonzero((y_pred_RR_test - y2) == 0)
perc_correct_RR = n_correct_RR/(N/2) * 100
perc_correct_RR

81.444

### Logistic regression

In [28]:
max_iters_LogR_test = 2000
gamma_LogR_test = 1.0e-10
method = 5

loss_LogR_test, weights_LogR_test = md.logistic_regression(y1, tX1, gamma_LogR_test, max_iters_LogR_test)
print("\nweights_LogR:\n", weights_LogR_test,"\n")

pred_LogR_test = prediction(y1, tX1, gamma_LogR_test, 0, max_iters_LogR_test, method)
print("pred_LogR = ", pred_LogR_test)


weights_LogR:
 [ -7.93971123e-03  -9.25427112e-01  -6.87095681e-01  -6.61222434e-01
  -1.67225941e-01  -1.29103044e-02  -1.00695125e+00  -5.13309675e-03
  -1.86704410e-02  -1.57261061e-01  -8.31681534e-01  -1.53710195e-02
   8.75789549e-03  -2.72906766e-03  -1.81113730e-01   7.07021183e-06
  -9.30049190e-05  -3.87601218e-01   2.48637077e-04  -2.40084245e-04
  -3.13451695e-01   4.10680696e-04  -1.25211521e+00  -4.67453228e-03
  -4.78044443e-01  -7.96808619e-05   1.75176394e-04  -3.92081664e-01
   1.14565931e-04  -9.08258862e-05  -2.62966530e-01] 

pred_LogR =  0.657165668663


In [None]:
y_pred_LogR_test = predict_labels(weights_LogR_test, tX2)
n_correct_LogR = np.count_nonzero((y_pred_LogR_test - y2) == 0)
perc_correct_LogR = n_correct_LogR/(N/2) * 100
perc_correct_LogR

### Regularized logistic regression

In [None]:
max_iters_RLogR_test = 1000
lambda_RLogR_test = 1.0e-8
gamma_RLogR_test = 1.0e-10
method = 5

loss_RLogR_test, weights_RLogR_test = md.reg_logistic_regression(y1, tX1, gamma_RLogR_test, lambda_RLogR_test, max_iters_RLogR_test)
print("\nweights_RLogR:\n", weights_RLogR_test,"\n")

pred_RLogR_test = prediction(y1, tX1, gamma_RLogR_test, lambda_RLogR_test, max_iters_RLogR_test, 6)
print("pred_RLogR = ", pred_RLogR_test)

In [None]:
y_pred_RLogR_test = predict_labels(weights_RLogR_test, tX2)
n_correct_RLogR = np.count_nonzero((y_pred_RLogR_test - y2) == 0)
perc_correct_RLogR = n_correct_RLogR/(N/2) * 100
perc_correct_RLogR

# Training on the entire data set, separating data by JET values

### Dividing by JET

In [32]:
tX_jet0, indexes_jet0, tX_jet1, indexes_jet1, tX_jet2, indexes_jet2, tX_jet3, indexes_jet3 = md.separating_by_jet(tX)
y_jet0 = y[indexes_jet0]
y_jet1 = y[indexes_jet1]
y_jet2 = y[indexes_jet2]
y_jet3 = y[indexes_jet3]
tX_cleaned_jet0 = md.clean_data(tX_jet0)
tX_cleaned_jet1 = md.clean_data(tX_jet1)
tX_cleaned_jet2 = md.clean_data(tX_jet2)
tX_cleaned_jet3 = md.clean_data(tX_jet3)



In [33]:
tX_cleaned_jet0.shape

(99913, 20)

In [34]:
tX_cleaned_jet1.shape

(77544, 23)

In [35]:
tX_cleaned_jet2.shape

(50379, 30)

In [36]:
tX_cleaned_jet3.shape

(22164, 30)

### Training

### JET == 0.0

In [39]:
lambda_jet0 = 6e-10
method = 4

degree_jet0 = 3

tX_reformed_jet0 = tX_cleaned_jet0.copy()
w0 = np.ones([1,tX_reformed_jet0.shape[0]])
tX_reformed_jet0 = np.insert(tX_reformed_jet0, 0, w0, axis=1)

tX_poly_jet0 = md.build_poly(tX_reformed_jet0, degree_jet0)

loss_jet0, weights_jet0 = md.ridge_regression(y_jet0, tX_poly_jet0, lambda_jet0)
# print("\nweights_RR:\n", weights_jet0,"\n")

pred_jet0 = prediction(y_jet0, tX_poly_jet0, 0, lambda_jet0, 0, method)
print("pred_RR = ", pred_jet0)

pred_RR =  0.830314776449


### JET == 1.0

In [43]:
lambda_jet1 = 3e-10
method = 4

degree_jet1 = 9

tX_reformed_jet1 = tX_cleaned_jet1.copy()
w0 = np.ones([1,tX_reformed_jet1.shape[0]])
tX_reformed_jet1 = np.insert(tX_reformed_jet1, 0, w0, axis=1)

tX_poly_jet1 = md.build_poly(tX_reformed_jet1, degree_jet1)

loss_jet1, weights_jet1 = md.ridge_regression(y_jet1, tX_poly_jet1, lambda_jet1)
# print("\nweights_RR:\n", weights_jet1,"\n")

pred_jet1 = prediction(y_jet1, tX_poly_jet1, 0, lambda_jet1, 0, method)
print("pred_RR = ", pred_jet1)

pred_RR =  0.800585878603


### JET == 2.0

In [42]:
lambda_jet2 = 8e-10
method = 4

degree_jet2 = 9

tX_reformed_jet2 = tX_cleaned_jet2.copy()
w0 = np.ones([1,tX_reformed_jet2.shape[0]])
tX_reformed_jet2 = np.insert(tX_reformed_jet2, 0, w0, axis=1)

tX_poly_jet2 = md.build_poly(tX_reformed_jet2, degree_jet2)

loss_jet2, weights_jet2 = md.ridge_regression(y_jet2, tX_poly_jet2, lambda_jet2)
# print("\nweights_RR:\n", weights_jet2,"\n")

pred_jet2 = prediction(y_jet2, tX_poly_jet2, 0, lambda_jet2, 0, method)

print("pred_RR = ", pred_jet2)

pred_RR =  0.830735557472


### JET == 3.0

In [45]:
lambda_jet3 = 1e-7
method = 4

degree_jet3 = 10

tX_reformed_jet3 = tX_cleaned_jet3.copy()
w0 = np.ones([1,tX_reformed_jet3.shape[0]])
tX_reformed_jet3 = np.insert(tX_reformed_jet3, 0, w0, axis=1)

tX_poly_jet3 = md.build_poly(tX_reformed_jet3, degree_jet3)

loss_jet3, weights_jet3 = md.ridge_regression(y_jet3, tX_poly_jet3, lambda_jet3)
# print("\nweights_RR:\n", weights_jet3,"\n")

pred_jet3 = prediction(y_jet3, tX_poly_jet3, 0, lambda_jet3, 0, method)
print("pred_RR = ", pred_jet3)

pred_RR =  0.830571727123


## Making the submission file

In [46]:
DATA_TEST_PATH = '../data/test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
tX_test.shape

(568238, 30)

In [48]:
tX_test_jet0, indexes_test_jet0, tX_test_jet1, indexes_test_jet1, tX_test_jet2, indexes_test_jet2, tX_test_jet3, indexes_test_jet3 = md.separating_by_jet(tX_test)

tX_test_cleaned_jet0 = md.clean_data(tX_test_jet0)
tX_test_cleaned_jet1 = md.clean_data(tX_test_jet1)
tX_test_cleaned_jet2 = md.clean_data(tX_test_jet2)
tX_test_cleaned_jet3 = md.clean_data(tX_test_jet3)



In [49]:
tX_test_reformed_jet0 = tX_test_cleaned_jet0.copy()
w0 = np.ones([1,tX_test_reformed_jet0.shape[0]])
tX_test_reformed_jet0 = np.insert(tX_test_reformed_jet0, 0, w0, axis=1)

tX_test_reformed_jet1 = tX_test_cleaned_jet1.copy()
w0 = np.ones([1,tX_test_reformed_jet1.shape[0]])
tX_test_reformed_jet1 = np.insert(tX_test_reformed_jet1, 0, w0, axis=1)

tX_test_reformed_jet2 = tX_test_cleaned_jet2.copy()
w0 = np.ones([1,tX_test_reformed_jet2.shape[0]])
tX_test_reformed_jet2 = np.insert(tX_test_reformed_jet2, 0, w0, axis=1)

tX_test_reformed_jet3 = tX_test_cleaned_jet3.copy()
w0 = np.ones([1,tX_test_reformed_jet3.shape[0]])
tX_test_reformed_jet3 = np.insert(tX_test_reformed_jet3, 0, w0, axis=1)

In [50]:
tX_test_poly_jet0 = md.build_poly(tX_test_reformed_jet0, degree_jet0)
tX_test_poly_jet1 = md.build_poly(tX_test_reformed_jet1, degree_jet1)
tX_test_poly_jet2 = md.build_poly(tX_test_reformed_jet2, degree_jet2)
tX_test_poly_jet3 = md.build_poly(tX_test_reformed_jet3, degree_jet3)

In [51]:
y_pred_jet0 = predict_labels(weights_jet0, tX_test_poly_jet0)
y_pred_jet1 = predict_labels(weights_jet1, tX_test_poly_jet1)
y_pred_jet2 = predict_labels(weights_jet2, tX_test_poly_jet2)
y_pred_jet3 = predict_labels(weights_jet3, tX_test_poly_jet3)

In [52]:
y_pred_final = np.ones((tX_test.shape[0], 1))
a = 0
b = 0
c = 0
d = 0
for i in range(0, y_pred_final.shape[0]):
    if indexes_test_jet0[i] == True:
        y_pred_final[i] = y_pred_jet0[a]
        a = a + 1
    if indexes_test_jet1[i] == True:
        y_pred_final[i] = y_pred_jet1[b]
        b = b + 1
    if indexes_test_jet2[i] == True:
        y_pred_final[i] = y_pred_jet2[c]
        c = c + 1
    if indexes_test_jet3[i] == True:
        y_pred_final[i] = y_pred_jet3[d]
        d = d + 1

In [53]:
y_pred_final.shape

(568238, 1)

In [54]:
OUTPUT_PATH = '../data/dataSubmission_JET_RR.csv' 
create_csv_submission(ids_test, y_pred_final, OUTPUT_PATH)