In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from Methods.costs import * 
from Methods.least_squares import * 
from Methods.ridge import *
from Methods.cross_validation import *
from Methods.split_data import *
from Methods.scaling_standardization import *
from Methods.build_polynomial import *
from Methods.clearDataset import *

%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Load the training set from disk.
# NOTE(review): this star import sits outside the top import cell; consider
# moving it there so every dependency is visible in one place.
from Methods.proj1_helpers import *
DATA_TRAIN_PATH = 'csv/train.csv'
# load_csv_data returns three values, unpacked here as labels (y),
# feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Load the test data (used to generate predictions and save output in csv format for submission):

In [3]:
# Load the test set; the first returned value (labels) is discarded.
DATA_TEST_PATH = 'csv/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Data exploration

# Feature Processing

In [4]:
# Pre-process the train and test feature matrices with the same helpers.
# NOTE(review): averageData presumably replaces missing values with a
# per-feature average — confirm against its definition in the Methods package.
tX, tX_test = averageData(tX, tX_test)
# data_scaling is given the TRANSPOSED matrices and both names are rebound to
# its outputs. NOTE(review): later cells index tX[i] per feature and undo the
# transpose with tX.T, so the outputs are presumably features-first — confirm.
# This cell overwrites tX / tX_test, so it is not idempotent: re-run the
# loading cells before executing it a second time.
tX, tX_test = data_scaling(tX.T, tX_test.T)

# Features transformations

In [None]:
# add_feature signature: y, x, tX, method, tX_test, x_test, **kwargs
#
# Transformations applied, in this exact order, to each of the first 30 rows
# of tX (paired with the same row of tX_test). Every call rebinds tX and
# tX_test to the matrices returned by add_feature.
# Disabled experiments: cos_def, and multiply with degree=8.
feature_transforms = [
    (log_def, {}),
    (multiply, {'degree': 2}),
    (multiply, {'degree': 3}),
    (sqrt_def, {}),
    (multiply, {'degree': 4}),
    (multiply, {'degree': 5}),
    (multiply, {'degree': 6}),
    (multiply, {'degree': 7}),
]

for i in range(0, 30):
    for transform, transform_kwargs in feature_transforms:
        # tX[i] / tX_test[i] are re-read on every call because tX and tX_test
        # are replaced by the previous call's result.
        tX, tX_test = add_feature(
            y, tX[i], tX, transform, tX_test, tX_test[i], **transform_kwargs
        )

# Transpose the training matrix back; tX_test is left as is and is transposed
# at its points of use further down.
tX = tX.T

# Models testing

In [15]:

# Cross-validation settings.
seed, k_fold = 1, 10

# Partition the samples into k_fold folds.
k_indices = build_k_indices(y, k_fold, seed)
k_list = list(range(k_fold))

# Running sums of the per-fold losses, plus the best fold seen so far
# (the fold with the highest accuracy also supplies the kept weights).
tot_loss_tr = tot_loss_te = 0
best_accuracy, best_k = 0, 0
weights = np.array([])

for k in k_list:
    fold_result = cross_validation(y, tX, k_indices, k, least_squares)
    loss_tr, loss_te, accuracy_least, w = fold_result
    tot_loss_tr += loss_tr
    tot_loss_te += loss_te
    if accuracy_least > best_accuracy:
        best_accuracy, best_k, weights = accuracy_least, k, w

# RMSE derived from the fold-averaged losses (one entry per list).
rmse_tr = [np.sqrt(2/k_fold * tot_loss_tr)]
rmse_te = [np.sqrt(2/k_fold * tot_loss_te)]

print(best_accuracy, best_k, rmse_tr, sep="\n")


0.8092
1
[0.88074614831844589]


## Ridge regression

In [None]:
# Grid search over the ridge penalty with k-fold cross-validation.
seed = 1
k_fold = 4

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)
k_list = list(range(k_fold))

# Candidate regularisation strengths: 20 log-spaced values in [1e-10, 1e-2].
lambdas = np.logspace(-10, -2, 20)

# One RMSE entry per lambda, for training and held-out folds respectively.
rmse_tr = []
rmse_te = []

# Best single fold seen over the whole grid; its weights are kept.
# NOTE(review): selection is by the best individual fold, not by the
# fold-averaged accuracy per lambda — confirm this is intended.
best_accuracy = 0
best_k = 0
best_l = 0
weights = np.array([])

for lambda_ in lambdas:
    # The loss accumulators must restart for every lambda. Previously they
    # were initialised once before the loop, so each RMSE point also
    # contained the losses of all earlier lambdas.
    tot_loss_tr = 0
    tot_loss_te = 0
    for k in k_list:
        loss_tr, loss_te, accuracy_least, w = cross_validation(
            y, tX, k_indices, k, ridge_regression, lambda_=lambda_
        )
        tot_loss_tr += loss_tr
        tot_loss_te += loss_te
        if accuracy_least > best_accuracy:
            best_accuracy = accuracy_least
            best_k = k
            weights = w
            best_l = lambda_
    # RMSE from the losses averaged over the k_fold folds of THIS lambda.
    rmse_tr.append(np.sqrt(2/k_fold * tot_loss_tr))
    rmse_te.append(np.sqrt(2/k_fold * tot_loss_te))

cross_validation_visualization(lambdas, rmse_tr, rmse_te)
print(best_accuracy)
print(best_k)
print(best_l)
print(rmse_tr)

## Logistic regression

In [None]:
# Logistic regression on half of train data
y_binary = np.copy(y)
y_binary[y_binary == -1] = 0
w_log = logistic_regression_gradient_descent_demo(y_binary[:125000], tX[:125000], batch_size=3000)

In [None]:
# Evaluate the logistic model on the last n_val training rows.
# A single slice (y_val) is used for both the RMSE and the accuracy, so the
# two metrics always refer to the same rows. Previously the accuracy loop
# hard-coded rows 125000..249999, which only matched the [-125000:] slice
# when the training set had exactly 250000 rows.
n_val = 125000
y_val = y[-n_val:]

# Turn the sigmoid outputs into -1 / 1 labels in place. The second mask only
# hits entries still below 0.5, because the first assignment set the rest to 1.
vect = sigmoid(np.dot(tX[-n_val:], w_log))
vect[vect >= 0.5] = 1
vect[vect < 0.5] = -1
pred_val = vect[:, 0]  # first column of the 2-D prediction array

e_log = y_val - pred_val
# NOTE(review): the factor 2 presumably cancels a 1/2 inside calculate_mse —
# confirm against Methods.costs.
rmse_log = np.sqrt(2*calculate_mse(e_log))

# Vectorised count of correct predictions (replaces a 125k-iteration loop).
tot = int(np.count_nonzero(y_val == pred_val))
print("Accuracy: ", tot/len(y_val), "RMSE: ", rmse_log)

In [None]:
# Prepare output if logistic regression
pred_log = sigmoid(np.dot(tX_test.T, w_log))
pred_log = pred_log[:,0]
pred_log[pred_log >= 0.5] = 1
pred_log[pred_log < 0.5] = -1

## Penalized logistic regression

# Bias-variance decomposition

# Create submission

In [50]:
# Write the submission file from the linear-model weights.
OUTPUT_PATH = 'csv/sample-submission.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is whatever the most recently executed model-testing
# cell (least squares or ridge) left behind, so the submission depends on
# which of those cells ran last. tX_test is passed transposed.
y_pred = predict_labels(weights, tX_test.T)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [54]:
#create_csv_submission(ids_test, pred_log, OUTPUT_PATH)