In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load helpers and training data

In [2]:
from proj1_helpers import *
from data_helpers import *
from implementations import *
from cost import compute_loss


### Load the training data

In [3]:
# Data paths
# Training CSV, given relative to the notebook's working directory.
DATA_TRAIN_PATH = "../data/train.csv"

# Load data
# load_csv_data comes from one of the star-imported project modules. Its three
# return values are unpacked here as Y, X and id_train; later cells index X and
# Y with the same sample indices.
# NOTE(review): presumably Y = labels, X = feature matrix, id_train = event ids;
# confirm against the helper's definition.
Y, X, id_train = load_csv_data(DATA_TRAIN_PATH)

### Get the sample indices of the three jet groups

In [24]:
# Split the training samples by jet group; entry 0 is used further down to
# index both X and Y.
# NOTE(review): get_jet_samples is a project helper; presumably it returns one
# index array (or boolean mask) per jet group. Confirm against its definition.
jet_train_samples = get_jet_samples(X)

### Define parameters

In [25]:
degrees = range(1,20)  # candidate polynomial degrees: 1..19
lambda_test = 1e-3  # fixed regularisation value used while searching over degrees
lambdas = np.logspace(-8,-2,7)  # candidate lambdas: 1e-8 ... 1e-2, one per decade
k_fold = 4  # number of cross-validation folds
seed = 1  # forwarded to cross_validation on every call
degrees, lambdas  # last expression of the cell: display both search grids

(range(1, 20), array([1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02]))

## Group 0

In [26]:
# Select the training samples belonging to jet group 0 (only training data is
# handled in this notebook; no test set is loaded).
train_index = jet_train_samples[0]
x, y = X[train_index], Y[train_index]

### Best degree

In [None]:
# Cross-validated search over the polynomial degree for jet group 0.
# The model is regularised logistic regression with the regularisation value
# held fixed at lambda_test; only the degree varies.
gamma = 1e-3      # forwarded unchanged to reg_logistic_regression
max_iters = 100   # forwarded unchanged to reg_logistic_regression

acc_training = []    # mean training accuracy, one entry per degree
acc_validation = []  # mean validation accuracy, one entry per degree

for degree in degrees:
    fold_acc_tr = []
    fold_acc_val = []

    for fold in range(k_fold):
        # Split into train/validation for this fold, clean both parts together
        # and convert the labels with the project's classify helper.
        x_train, y_train, x_val, y_val = cross_validation(y, x, fold, k_fold, seed)
        x_tr_clean, x_val_clean = clean_data(x_train, x_val)
        y_train = classify(y_train)
        y_val = classify(y_val)

        # Polynomial feature expansion up to the current degree.
        x_train_aug, y_train = augment_data(x_tr_clean, y_train, degree)
        x_val_aug, y_val = augment_data(x_val_clean, y_val, degree)

        # Start from an all-zero column vector with one weight per feature.
        n_features = x_train_aug.shape[1]
        w_initial = np.zeros((n_features, 1))

        w, _ = reg_logistic_regression(
            y_train, x_train_aug, lambda_test, w_initial, max_iters, gamma
        )

        fold_acc_tr.append(predict_accuracy(y_train, x_train_aug, w, 'logistic'))
        fold_acc_val.append(predict_accuracy(y_val, x_val_aug, w, 'logistic'))

    # Average over the folds, report, and store the result for this degree.
    mean_acc_tr = np.mean(fold_acc_tr)
    mean_acc_val = np.mean(fold_acc_val)
    print(degree, mean_acc_tr, mean_acc_val)

    acc_training.append(mean_acc_tr)
    acc_validation.append(mean_acc_val)
        

  keepdims=keepdims)


1 74.48621453545788 74.48654816238289
2 74.4865481623829 74.48654816238289
3 72.19653294899511 72.24957963007446
4 73.60110230336029 73.67983825766674
5 51.65111965196039 51.52934582432541
6 72.05807777510876 71.93630394747379
7 53.05135185630021 53.149771799183284
8 70.25582512611098 70.12571062535031
9 49.330077134545064 49.35643366162223
10 69.08446099233993 69.19288974297382


In [6]:
# Pick the degree with the highest mean validation accuracy and plot the
# training / validation curves produced by the degree search above.
# Requires acc_training and acc_validation from the cross-validation cell.
max_accuracy = np.amax(acc_validation)
best_degree = degrees[np.argmax(acc_validation)]

plt.plot(degrees, acc_training, label='Training', marker="o", markersize=2, color="red")
plt.plot(degrees, acc_validation, label='Validation', marker="o", markersize=2, color="green")

# Dashed guide lines from the axes to the optimum.
# - '--' is a line style, not a marker, so it is passed as linestyle.
# - plt.plot is used instead of semilogx: the degree axis is linear, and
#   semilogx would switch the whole axis to log scale.
# - The vertical guide needs two scalar y-values; it runs from the lowest
#   plotted accuracy up to the maximum validation accuracy.
lowest_accuracy = min(np.amin(acc_training), np.amin(acc_validation))
plt.plot([degrees[0], best_degree], [max_accuracy, max_accuracy], linestyle='--', color='blue')
plt.plot([best_degree, best_degree], [lowest_accuracy, max_accuracy], linestyle='--', color='blue')

plt.plot(best_degree, max_accuracy, label="Maximum accuracy", marker='*', markersize=7, color="blue")
plt.xticks(degrees)
plt.xlabel("Polynomial degrees")
plt.ylabel("Accuracy")
#plt.title("Cross validation for Group 0")
plt.legend()
plt.savefig("degree_group0.eps")
plt.show()

NameError: name 'acc_validation' is not defined

### Best lambda

In [36]:
# Cross-validated search over the ridge regularisation value for jet group 0,
# with the polynomial degree fixed to best_degree (set by the degree-plot cell,
# which therefore has to run successfully first).
#
# NOTE(review): clean_data is called here once per split and its second return
# value is discarded, whereas the degree search calls clean_data(x_train, x_val)
# and uses both results. Confirm which calling convention matches the current
# helper; the labels are also not passed through classify in this cell.
acc_training = []    # mean training accuracy, one entry per lambda
acc_validation = []  # mean validation accuracy, one entry per lambda

for lambda_ in lambdas:
    fold_acc_tr = []
    fold_acc_val = []

    for fold in range(k_fold):
        x_train, y_train, x_val, y_val = cross_validation(y, x, fold, k_fold, seed)

        # Clean each split on its own, keeping only the first returned value.
        x_tr_clean, _ = clean_data(x_train)
        x_val_clean, _ = clean_data(x_val)

        # Polynomial feature expansion at the previously selected degree.
        x_train_aug, y_train = augment_data(x_tr_clean, y_train, best_degree)
        x_val_aug, y_val = augment_data(x_val_clean, y_val, best_degree)

        w, _ = ridge_regression(y_train, x_train_aug, lambda_)

        fold_acc_tr.append(predict_accuracy(y_train, x_train_aug, w))
        fold_acc_val.append(predict_accuracy(y_val, x_val_aug, w))

    # Average over the folds, report, and store the result for this lambda.
    mean_acc_tr = np.mean(fold_acc_tr)
    mean_acc_val = np.mean(fold_acc_val)
    print(lambda_, mean_acc_tr, mean_acc_val)

    acc_training.append(mean_acc_tr)
    acc_validation.append(mean_acc_val)

1e-08 76.11765019884164 76.74753783329331
1e-07 84.30685670056316 84.17207142285211
1e-06 84.40627752422132 84.34422291616622
1e-05 84.43096591667333 84.35423172391705
0.0001 84.4329676782235 84.3672431739931
0.001 84.40327488189607 84.35122908159181
0.01 84.18641738062828 84.10100888782128


In [1]:
# Pick the lambda with the highest mean validation accuracy and plot the
# training / validation curves produced by the lambda search above.
# Requires np/plt from the import cell and acc_training / acc_validation from
# the cross-validation cell, so it fails on a fresh kernel unless those ran.
max_accuracy = np.amax(acc_validation)
best_lambda = lambdas[np.argmax(acc_validation)]

plt.semilogx(lambdas, acc_training, label='Training', marker="o", markersize=2, color="red")
plt.semilogx(lambdas, acc_validation, label='Validation', marker="o", markersize=2, color="green")

# Dashed guide lines from the axes to the optimum.
# - '--' is a line style, not a marker, so it is passed as linestyle.
# - The vertical guide needs two scalar y-values; it runs from the lowest
#   plotted accuracy up to the maximum validation accuracy.
lowest_accuracy = min(np.amin(acc_training), np.amin(acc_validation))
plt.semilogx([lambdas[0], best_lambda], [max_accuracy, max_accuracy], linestyle='--', color='black')
plt.semilogx([best_lambda, best_lambda], [lowest_accuracy, max_accuracy], linestyle='--', color='black')

plt.semilogx(best_lambda, max_accuracy, label="Maximum accuracy", marker='*', markersize=7, color="blue")
# Raw string: "\l" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"$\lambda$")
plt.ylabel("Accuracy (%)")
#plt.title("Cross validation for Group 0")
plt.legend()
plt.savefig("lambda_group0.eps")
plt.show()

NameError: name 'np' is not defined