In [16]:
%matplotlib inline
import numpy as np
import collections 
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from proj1_helpers import *
from data_helpers import *
from cross_validation import cross_validation
from implementations import ridge_regression
from cost import compute_loss
from plots import plot_cross_validation

## Load and prepare the data

### Load train and test data

In [18]:
# Data paths
# Relative paths: they are resolved against the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
DATA_TEST_PATH = 'data/test.csv'

# Load data
# load_csv_data returns three values per file, unpacked here as
# (labels, features, ids). The same loader is used for both files.
# NOTE(review): y_test presumably holds placeholder labels, since the test
# labels are what this notebook predicts — confirm against load_csv_data.
y_train, x_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
y_test, x_test, ids_test = load_csv_data(DATA_TEST_PATH)

### Split the data into three samples according to the jet value

In [19]:
# Group the rows of each data set. The results are indexed by group number
# (0, 1, 2) further down, and each entry is then used to index the rows of
# x_train / y_train (resp. x_test / y_test).
# NOTE(review): presumably each entry is a boolean mask or an index array
# selecting the rows of one jet group — confirm in get_jet_samples.
jet_train_samples = get_jet_samples(x_train)
jet_test_samples = get_jet_samples(x_test)

### Define parameters

In [20]:
# Per-group hyper-parameters: entry i of each list is used for jet group i.
# degrees[i] is passed to build_poly_all_features (polynomial expansion degree).
degrees = [11, 12, 13]
# lambdas[i] is passed to ridge_regression as its regularisation argument.
lambdas = [1e-4, 1e-3, 1e-3]

## Ridge regression and predictions

In [21]:
# Predictions for the whole test set, filled in one jet group at a time.
y_prediction_test = np.zeros(y_test.shape)

# Training metrics of every group, in group order. Previously only the values
# of the last group survived the loop because `accuracy` / `f1_score` were
# overwritten on each iteration; the scalar names are still assigned below so
# that later cells reading them keep working.
accuracies = []
f1_scores = []

# One (degree, lambda) pair is required per jet group.
assert len(degrees) == len(lambdas), "degrees and lambdas must have the same length"
n_groups = len(degrees)

# Fit one ridge-regression model per jet group and predict that group's test rows.
for i in range(n_groups):
    # Hyper-parameters of this group
    degree = degrees[i]
    lambda_ = lambdas[i]

    # Select the rows of this group in the train and test sets
    train_index = jet_train_samples[i]
    test_index = jet_test_samples[i]
    x_tr, y_tr = x_train[train_index], y_train[train_index]
    x_te, y_te = x_test[test_index], y_test[test_index]

    # Clean data.
    # NOTE(review): train and test are cleaned independently and the second
    # return value of clean_data is discarded. If that value carries anything
    # fitted on the data (e.g. statistics or selected columns), the test set
    # should probably reuse the training one — confirm in clean_data.
    x_tr, _ = clean_data(x_tr)
    x_te, _ = clean_data(x_te)

    # Build the model inputs (note the (y, x) return order)
    y_tr, x_tr = build_model_data(x_tr, y_tr)
    y_te, x_te = build_model_data(x_te, y_te)

    # Polynomial feature expansion with this group's degree
    x_tr = build_poly_all_features(x_tr, degree)
    x_te = build_poly_all_features(x_te, degree)

    # Ridge regression on the training rows of this group
    weights, loss = ridge_regression(y_tr, x_tr, lambda_)

    # Metrics are computed on the training data (not on held-out data)
    accuracy = predict_accuracy(y_tr, x_tr, weights)
    f1_score = compute_f1_score(y_tr, x_tr, weights)
    accuracies.append(accuracy)
    f1_scores.append(f1_score)

    # Write this group's predictions back at their original test positions
    y_prediction_test[test_index] = predict_labels(weights, x_te)

0.583235946000403
0.7168091597303302
0.8128517533551859


In [22]:
# `accuracy` is reassigned on every iteration of the loop above, so this shows
# only the value computed for the last group. It is measured on that group's
# training data, not on held-out data.
print(accuracy)

83.31472368112705


## Save output for submission

In [23]:
# Save the predictions computed above for submission.
# create_csv_submission receives the test ids, the predicted labels and the
# destination path (relative to the notebook's working directory).
# NOTE(review): presumably it writes one (id, prediction) row per test sample
# to OUTPUT_PATH — confirm in create_csv_submission.
OUTPUT_PATH = 'data/submission.csv'
create_csv_submission(ids_test, y_prediction_test, OUTPUT_PATH)