In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import datetime
from functions import *
from helpers import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# GitHub rejects files larger than 100 MB and test.csv weighs 104 MB, so the
# repository ships a zipped copy that has to be unpacked before it can be read.
with zipfile.ZipFile("../data/test.csv.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="../data/")

In [3]:
# Load the training set into the class labels `y`, the feature matrix `x`
# and the event ids `ids`.
# NOTE(review): load_csv_data presumably comes from this star import — confirm.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' # location of the training CSV; adjust if the data lives elsewhere
y, x, ids = load_csv_data(DATA_TRAIN_PATH)

In [6]:
# Compare several fitting methods on the first 15 SVD components of the data.
#
# Replace every -999 entry by -99. Work on a copy: the previous `x_99 = x`
# only aliased `x`, so np.putmask silently rewrote the raw feature matrix and
# every later cell reading `x` depended on this cell having been run.
x_99 = x.copy()
x_99[x_99 == -999] = -99

# PCA: the left singular vectors of x_99.T are the principal directions in
# feature space; keep the first 15 and project the data onto them.
eigenvectors, eigenvalues, V = np.linalg.svd(x_99.T, full_matrices=False)
x_proj = np.dot(x_99, eigenvectors[:, 0:15])

# Hold-out split of the projected data, with labels as a column vector.
ratio = 0.1
y = np.reshape(y, (len(y), 1))
x_train, x_test, y_train, y_test = split_data(x_proj, y, ratio)

# NOTE(review): the test split is standardized with its own statistics rather
# than with the training mean/std — confirm this is intended.
tx_train_99, tr_mean_99, tr_std_99 = standardize(x_train)
tx_test_99, te_mean_99, te_std_99 = standardize(x_test)

# Least squares. Throughout this cell RMSE is reported as sqrt(2 * loss).
print("\t*** LS ****")
w_LS, loss = least_squares(y_train, tx_train_99)
test_mse = compute_loss(y_test, tx_test_99, w_LS)
train_mse = compute_loss(y_train, tx_train_99, w_LS)
rmse_te = np.sqrt(2*test_mse)
rmse_tr = np.sqrt(2*train_mse)
print(rmse_te)
print(rmse_tr)

# Gradient descent
print("\t*** GS ****")
alpha = 0.00001
max_iters = 5
loss_GD, w_GD = learning_by_gradient_descent(y_train, tx_train_99, alpha, max_iters)
test_mse_GD = compute_loss(y_test, tx_test_99, w_GD)
train_mse_GD = compute_loss(y_train, tx_train_99, w_GD)
rmse_te_GD = np.sqrt(2*test_mse_GD)
rmse_tr_GD = np.sqrt(2*train_mse_GD)
print(rmse_te_GD)
print(rmse_tr_GD)

# Newton's method (the unused `lambd = 0.5` assignment was dropped; `lambd`
# is set again right before its only use in the penalized section below).
print("\t*** N ****")
alpha = 0.00002
max_iters = 5
loss_N, w_N = learning_by_newton_method(y_train, tx_train_99, alpha, max_iters)
test_mse_N = compute_loss(y_test, tx_test_99, w_N)
train_mse_N = compute_loss(y_train, tx_train_99, w_N)
rmse_te_N = np.sqrt(2*test_mse_N)
rmse_tr_N = np.sqrt(2*train_mse_N)
print(rmse_te_N)
print(rmse_tr_N)

# 10-fold cross-validated least squares: one weight vector per fold.
print("\t*** CLS ****")
k_folds = 10
seed = 1
w_LS_folds, rmse_tr_CLS, rmse_te_CLS = cross_validation_LS(y, x_proj, k_folds, seed)

# Weights averaged over the folds, evaluated on the hold-out split above.
w_CLS = w_LS_folds.mean(axis=0)
w_CLS = np.reshape(w_CLS, (tx_train_99.shape[1], 1))
test_mse_CLS_mean = compute_loss(y_test, tx_test_99, w_CLS)
train_mse_CLS_mean = compute_loss(y_train, tx_train_99, w_CLS)
rmse_te_CLS_mean = np.sqrt(2*test_mse_CLS_mean)
rmse_tr_CLS_mean = np.sqrt(2*train_mse_CLS_mean)
print("-mean weights:")
print(rmse_te_CLS_mean)
print(rmse_tr_CLS_mean)

# Weights of the fold with the lowest cross-validation test RMSE.
w_CLS_best = w_LS_folds[np.argmin(rmse_te_CLS)]
w_CLS_best = np.reshape(w_CLS_best, (tx_train_99.shape[1], 1))
test_mse_CLS_best = compute_loss(y_test, tx_test_99, w_CLS_best)
train_mse_CLS_best = compute_loss(y_train, tx_train_99, w_CLS_best)
rmse_te_CLS_best = np.sqrt(2*test_mse_CLS_best)
rmse_tr_CLS_best = np.sqrt(2*train_mse_CLS_best)
print("-best weights:")
print(rmse_te_CLS_best)
print(rmse_tr_CLS_best)

# Penalized (regularized) logistic regression
lambd = 0.05
gamma = 0.00002
max_iters = 5
print("\t**** Penalized *******")
loss_reg, w_reg = reg_logistic_regression(y_train, tx_train_99, lambd, gamma, max_iters)
test_mse_reg = compute_loss(y_test, tx_test_99, w_reg)
train_mse_reg = compute_loss(y_train, tx_train_99, w_reg)
rmse_te_reg = np.sqrt(2*test_mse_reg)
rmse_tr_reg = np.sqrt(2*train_mse_reg)
print(rmse_te_reg)
print(rmse_tr_reg)

	*** LS ****
0.843705415289
0.846162529975
	*** GS ****
3.66057618293
3.69187552172
	*** N ****
4.40101314193
4.43857907606
	*** CLS ****
-mean weights:
0.945358319202
0.944921623947
-best weights:
0.945353938203
0.944920581668
	**** Penalized *******
4.40103100733
4.43859711086


In [None]:
# IRLS on the first 10 SVD components.
#
# Replace every -999 entry by -99 on a copy: the previous `x_99 = x` aliased
# `x`, so np.putmask also rewrote the raw feature matrix in place.
x_99 = x.copy()
x_99[x_99 == -999] = -99

# PCA to avoid correlation and a singular matrix
eigenvectors, eigenvalues, V = np.linalg.svd(x_99.T, full_matrices=False)
x_proj = np.dot(x_99, eigenvectors[:, 0:10])
y = np.reshape(y, (len(y), 1))

# Hold-out split; a leading column of ones is prepended to both parts.
ratio = 0.05
x_train_PCA, x_test_PCA, y_train_PCA, y_test_PCA = split_data(x_proj, y, ratio)
tx_train_PCA = np.c_[np.ones((y_train_PCA.shape[0], 1)), x_train_PCA]
tx_test_PCA = np.c_[np.ones((y_test_PCA.shape[0], 1)), x_test_PCA]

max_iters = 5
print("\t*** IRLS *****")
loss_IRLS, w_IRLS = learning_by_IRLS(y_train_PCA, tx_train_PCA, max_iters)
# RMSE is reported as sqrt(2 * loss), as in the other cells.
test_mse_IRLS = compute_loss(y_test_PCA, tx_test_PCA, w_IRLS)
train_mse_IRLS = compute_loss(y_train_PCA, tx_train_PCA, w_IRLS)
rmse_te_IRLS = np.sqrt(2*test_mse_IRLS)
rmse_tr_IRLS = np.sqrt(2*train_mse_IRLS)
print(rmse_te_IRLS)
print(rmse_tr_IRLS)

In [None]:
# Per-fold RMSE of the cross-validated least squares: training curve in the
# default colour, test curve in red.
fold_indices = range(k_folds)
plt.plot(fold_indices, rmse_tr_CLS)
plt.plot(fold_indices, rmse_te_CLS, color='r')

In [None]:
# Mean and spread of the least-squares weights across the cross-validation
# folds. The per-fold weights live in `w_LS_folds`; `w_LS` is the single
# weight vector of the plain least-squares fit, so averaging it over axis 0
# does not give a usable weight vector.
mean_w = np.reshape(w_LS_folds.mean(axis=0), (tx_train_99.shape[1], 1))
std_w = w_LS_folds.std(axis=0)
# RMSE is sqrt(2 * loss), as everywhere else in this notebook; previously the
# raw loss was stored under the rmse_* names.
rmse_tr = np.sqrt(2*compute_loss(y_train, tx_train_99, mean_w))
rmse_te = np.sqrt(2*compute_loss(y_test, tx_test_99, mean_w))
print(rmse_tr)
print(rmse_te)

In [None]:
# Standardize the features; the result is stored in x_s but not used below.
x_s = standardize(x)

# SVD of the transposed feature matrix, then projection of the data onto the
# first 12 left singular vectors.
svd_factors = np.linalg.svd(x.T, full_matrices=False)
eigenvectors, eigenvalues, V = svd_factors
x_proj = np.dot(x, eigenvectors[:, :12])

# Shapes of the three SVD factors and of the projected data.
for shaped in (eigenvectors, eigenvalues, V, x_proj):
    print(shaped.shape)

# Average per-column standard deviation of the projection.
sigma = x_proj.std(axis=0).mean()
print(sigma)

# Do your crazy machine learning thing here :) ...

In [None]:
# Hyper-parameters and containers for the experiments below.
max_iter = 100
threshold = 1e-8
alpha = 0.001
ratio = 0.1
lambd = 0.01
losses = []

# Labels as a column vector.
y = np.reshape(y, (len(y), 1))

# Split with the reshaped labels `y`. The previous code passed `y_vec`, which
# is only created further down the notebook, so this cell raised a NameError
# on a fresh kernel.
x_train, x_test, y_train, y_test = split_data(x, y, ratio)
# Prepend a column of ones and start from all-zero weights.
tx_train = np.c_[np.ones((y_train.shape[0], 1)), x_train]
w = np.zeros((tx_train.shape[1], 1))

In [None]:
# Split the data and display the shape of each of the four parts.
ratio = 0.1
x_train, x_test, y_train, y_test = split_data(x, y, ratio)
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

In [None]:
# Newton's method on the raw features with a leading column of ones.
# NOTE(review): `ratio` is taken from an earlier cell.
alpha = 0.001
x_train, x_test, y_train, y_test = split_data(x, y, ratio)
tx_train = np.c_[np.ones((x_train.shape[0], 1)), x_train]
tx_test = np.c_[np.ones((x_test.shape[0], 1)), x_test]

loss, w = learning_by_newton_method(y_train, tx_train, alpha, 100)
# Report an actual RMSE, sqrt(2 * loss), consistent with the other cells;
# previously the raw loss was stored and printed under the name `rmse`.
rmse = np.sqrt(2*compute_loss(y_test, tx_test, w))
print(rmse)

In [None]:
# Sweep the step size of Newton's method over a logarithmic grid and record
# the test RMSE obtained for each value.
max_iter = 10
threshold = 1e-8
lambd = 0.5
ratio = 0.1
gammas = np.logspace(-5, -1, 10)
losses = []

# Labels as a column vector, then the hold-out split.
y = np.reshape(y, (len(y), 1))
x_train, x_test, y_train, y_test = split_data(x, y, ratio)

# Raw (non-standardized) features with a leading column of ones.
tx_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))
tx_test = np.hstack((np.ones((x_test.shape[0], 1)), x_test))

for gamma in gammas:
    loss, w = learning_by_newton_method(y_train, tx_train, gamma, max_iter)
    lossREG = compute_loss(y_test, tx_test, w)
    # RMSE is sqrt(2 * loss)
    rmse = np.sqrt(2 * lossREG)
    losses += [rmse]

In [None]:
# Test RMSE for the first five step sizes of the sweep, on a log-scaled x axis.
plt.semilogx(gammas[0:5], losses[0:5], marker=".", color='b')
# The swept quantity is the step size gamma, not the penalty lambda.
plt.xlabel("gamma")
plt.ylabel("rmse")
plt.grid(True)

In [None]:
# Basic check: gradient descent on standardized features, stopped early once
# the loss no longer changes by more than `threshold`.
from plots import gradient_descent_visualization

# Hyper-parameters
max_iter = 1000
threshold = 1e-8
alpha = 0.002
lambd = 0.001
ratio = 0.1
losses = []

# Labels as a column vector, then the hold-out split.
y = np.reshape(y, (len(y), 1))
x_train, x_test, y_train, y_test = split_data(x, y, ratio)

# Each split is standardized with its own statistics.
tx_train, x_tr_mean, x_tr_std = standardize(x_train)
tx_test, x_te_mean, x_te_std = standardize(x_test)
w = np.zeros((tx_train.shape[1], 1))

start_time = datetime.datetime.now()
for iter in range(max_iter):
    # One update per iteration: returns the loss and the new weights.
    # NOTE(review): another cell calls this helper as
    # (y, tx, alpha, max_iters) — confirm which signature is current.
    loss, w = learning_by_gradient_descent(y_train, tx_train, w, alpha)
    if not iter % 20:
        print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
    losses.append(loss)
    if len(losses) < 2:
        continue
    if np.abs(losses[-1] - losses[-2]) < threshold:
        break

# Loss on the hold-out split; this evaluation is included in the timing below.
test_loss = compute_loss(y_test, tx_test, w)
print("The loss={l}".format(l=test_loss))
end_time = datetime.datetime.now()

# Wall-clock duration of the run
exection_time = (end_time - start_time).total_seconds()
print("Gradient Descent: execution time={t:.3f} seconds".format(t=exection_time))

In [None]:
# Keep a handle on the current weight vector `w`.
# NOTE(review): this binds the same object as `w` (no copy is made).
best_weights = w

In [None]:
# Keep a handle on the current weight vector `w` under a second name.
# NOTE(review): presumably meant for weights trained on standardized
# features — confirm which cell produced `w` before running this.
best_weights_std = w

In [None]:
# Standardize the complete feature matrix and check the shape of the result.
standardized = standardize(x)
try1, x_mean, std_x = standardized
print(try1.shape)

In [None]:
# Snapshot the current weights and report their RMSE on the hold-out split.
weights = w
print("RMSE: {l}".format(l=compute_RMSE(y_test, tx_test, w)))

# 10-fold cross-validation with penalty `lambd`.
k_fold = 10
seed = 1
lambd = 0.01
# Use the label vector `y`: `y_vec` is only created further down the
# notebook, so referencing it here failed on a fresh kernel.
# NOTE(review): another cell passes an extra `max_iter` argument to
# cross_validation — confirm which signature is current.
w_cv, lossTR_cv, lossTE_cv = cross_validation(y, x, k_fold, seed, lambd)

In [None]:
# Newton's method on the raw features, stopped early once the loss no longer
# changes by more than `threshold`.

# Define the parameters of the algorithm.
max_iter = 100
threshold = 1e-8
alpha = 0.002
lambd = 0.001
ratio = 0.1
losses = []

# Labels as a column vector. np.reshape accepts `y` whether it is still 1-D
# or has already been reshaped to (n, 1) by an earlier cell; the previous
# `y_vec[:,0] = y` raised a broadcasting error in the latter case.
y_vec = np.reshape(y, (len(y), 1))

x_train, x_test, y_train, y_test = split_data(x, y_vec, ratio)
# Prepend a column of ones and start from all-zero weights.
tx_train = np.c_[np.ones((y_train.shape[0], 1)), x_train]
tx_test = np.c_[np.ones((y_test.shape[0], 1)), x_test]
w = np.zeros((tx_train.shape[1], 1))

start_time = datetime.datetime.now()
# start the logistic regression
for iter in range(max_iter):
    # get loss and update w.
    # NOTE(review): other cells call this helper as
    # (y, tx, alpha, max_iters) — confirm which signature is current.
    loss, w = learning_by_newton_method(y_train, tx_train, w, alpha)
    if iter % 20 == 0:
        print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
    losses.append(loss)
    if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
        break
print("The loss={l}".format(l=compute_RMSE(y_test, tx_test, w)))
end_time = datetime.datetime.now()

# Print result. The message now names the method that actually ran (it
# previously said "Gradient Descent").
exection_time = (end_time - start_time).total_seconds()
print("Newton: execution time={t:.3f} seconds".format(t=exection_time))

# PCA

In [None]:
# Newton's method on the first 10 SVD components, stopped early once the loss
# no longer changes by more than `threshold`.

# Hyper-parameters
max_iter = 10
threshold = 1e-8
alpha = 0.001
lambd = 0.001
ratio = 0.1
losses = []

# Project the features onto the first 10 left singular vectors of x.T.
svd_factors = np.linalg.svd(x.T, full_matrices=False)
eigenvectors, eigenvalues, V = svd_factors
x_proj = np.dot(x, eigenvectors[:, :10])
y = np.reshape(y, (len(y), 1))

# Hold-out split; prepend a column of ones and start from zero weights.
x_train, x_test, y_train, y_test = split_data(x_proj, y, ratio)
tx_train = np.hstack((np.ones((y_train.shape[0], 1)), x_train))
tx_test = np.hstack((np.ones((y_test.shape[0], 1)), x_test))
w = np.zeros((tx_train.shape[1], 1))

start_time = datetime.datetime.now()
for iter in range(max_iter):
    # One update per iteration: returns the loss and the new weights.
    # NOTE(review): other cells call this helper as
    # (y, tx, alpha, max_iters) — confirm which signature is current.
    loss, w = learning_by_newton_method(y_train, tx_train, w, alpha)
    if not iter % 10:
        print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
    losses.append(loss)
    if len(losses) < 2:
        continue
    if np.abs(losses[-1] - losses[-2]) < threshold:
        break

# Test RMSE, computed as sqrt(2 * loss); included in the timing below.
mse = compute_loss(y_test, tx_test, w)
rmse = np.sqrt(2 * mse)
print("The loss={l}".format(l=rmse))
end_time = datetime.datetime.now()

# Wall-clock duration of the run
exection_time = (end_time - start_time).total_seconds()
print("Newton: execution time={t:.3f} seconds".format(t=exection_time))



# ERROR

In [None]:
# Test error of the current weights `w` on the current hold-out split,
# reported as sqrt(2 * loss).
mse = compute_loss(y_test, tx_test, w)
rmse = np.sqrt(2*mse)
print(rmse)

# K-fold

In [None]:
# 20-fold cross-validation on the raw features with penalty `lambd`.
max_iter = 20
threshold = 1e-8
alpha = 0.001
lambd = 0.001
ratio = 0.1
losses = []
k_fold = 20
seed = 1

# Labels as a column vector.
y = np.reshape(y, (len(y), 1))

w_RegPen, tr_rmse, te_rmse = cross_validation(y, x, k_fold, seed, lambd, max_iter)
# Average spread of the weights across folds. This reads the weights returned
# by the call above; the previous code read the stale `w_LS` instead.
print(w_RegPen.std(axis=0).mean())
print(tr_rmse)
print(te_rmse)

In [None]:
# Keep the results of the 20-fold cross-validation under descriptive names.
# The weights come from `w_RegPen`, the value returned by cross_validation
# together with tr_rmse / te_rmse; the previous code stored the unrelated
# `w_LS`.
weights_NM_20folds = w_RegPen
train_rmse_NM_20folds = tr_rmse
test_rmse_NM_20folds = te_rmse

## Generate predictions and save output in csv format for submission:

In [7]:
# Load the test set (the first returned value is discarded) and build a
# design matrix with a leading column of ones.
DATA_TEST_PATH = '../data/test.csv'
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

bias_column = np.ones((X_test.shape[0], 1))
tX_test = np.hstack((bias_column, X_test))


In [9]:
# Project the test events into the SAME feature space the weights were
# trained in. The previous code ran a fresh SVD on the test matrix, which
# yields a different basis, and skipped the -999 -> -99 replacement that was
# applied to the training data.
x_train_99 = np.where(x == -999, -99, x)
X_test_99 = np.where(X_test == -999, -99, X_test)

# Principal directions of the training data (left singular vectors of x.T);
# keep the first 15, as in the training cell.
eigenvectors, eigenvalues, V = np.linalg.svd(x_train_99.T, full_matrices=False)
x_proj_test = np.dot(X_test_99, eigenvectors[:, 0:15])

# NOTE(review): w_CLS is evaluated elsewhere on features returned by
# standardize(); here the projection only receives a leading column of ones.
# Confirm whether the test projection must be standardized the same way.
tX_test_proj = np.c_[np.ones((x_proj_test.shape[0], 1)), x_proj_test]

# Weights used for the submission: mean of the cross-validated LS weights.
weights = w_CLS

In [11]:
# Predict a label for every test event with the selected weights and write
# the (id, prediction) pairs to the submission file.
OUTPUT_PATH = '../data/submission.csv' # destination of the submission CSV
y_pred = predict_labels(weights, tX_test_proj)

create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Delete the extracted test.csv (it exceeds GitHub's file size limit) so that
# the push is accepted; the zipped copy stays in the repository.
# The existence check keeps this cell re-runnable.
test_csv_path = '../data/test.csv'
if os.path.exists(test_csv_path):
    os.remove(test_csv_path)