In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import datetime
import cProfile
from matplotlib.mlab import PCA
from implementations import *
from costs import *
from helpers import *
from proj1_helpers import *
from method_comparison_helpers import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Unpack the zipped test set into the data folder, then load the training set.
# NOTE(review): only test.csv.zip is extracted here; train.csv is presumably
# already present on disk — confirm.
with zipfile.ZipFile("../data/test.csv.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="../data/")
DATA_TRAIN_PATH = '../data/train.csv'
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

## Exploratory data analysis

In [None]:
# Pure NumPy version

# A column counts as "sparse" when its minimum equals the -999 placeholder.
columns = tx.min(axis=0)
sparse_columns = np.flatnonzero(columns == -999).astype(int)
print('Sparse columns:')
print(sparse_columns)

# Build a copy in which every -999 placeholder is NaN, so that the nan-aware
# statistics below skip those entries.
tx_nan = np.where(tx == -999, np.nan, tx)
for column_stat in (np.nanmean, np.nanstd, np.nanmin, np.nanmax):
    print(column_stat(tx_nan, axis=0))

In [None]:
# PURE NUMPY VERSION

# Substitute -99 for every missing (NaN) entry and print per-column statistics.
where_are_NaNs = np.isnan(tx_nan)
tx_99_filled = np.where(where_are_NaNs, -99.0, tx_nan)
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_99_filled, axis=0))

In [None]:
# EXPLORATORY DATASET 1
# PURE NUMPY VERSION

# Impute every missing entry with the NaN-aware mean of its own column.
mean = np.nanmean(tx_nan, axis=0)
# `mean` has one value per column and broadcasts across the rows.
tx_mean_filled = np.where(np.isnan(tx_nan), mean, tx_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_mean_filled_normalized = (
    (tx_mean_filled - tx_mean_filled.mean(axis=0))
    / tx_mean_filled.std(axis=0, ddof=1)
)
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_mean_filled_normalized, axis=0))

In [None]:
# EXPLORATORY DATASET 2
# PURE NUMPY VERSION

# Replace every missing (NaN) entry with 0.
where_are_NaNs = np.isnan(tx_nan)
tx_zero_filled = np.where(where_are_NaNs, 0.0, tx_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_zero_filled_normalized = (
    (tx_zero_filled - tx_zero_filled.mean(axis=0))
    / tx_zero_filled.std(axis=0, ddof=1)
)
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_zero_filled_normalized, axis=0))

In [None]:
# EXPLORATORY DATASET 3
# PURE NUMPY

# Collapse all sparse columns into one feature: their row-wise sum, where
# NaN entries contribute 0 (np.nansum).
sparse_sum = np.nansum(tx_nan[:, sparse_columns], axis=1, keepdims=True)
# Remove the sparse columns and append the summed feature as the last column.
tx_sparse_dropped = np.concatenate(
    (np.delete(tx_nan, sparse_columns, axis=1), sparse_sum),
    axis=1,
)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_sparse_dropped_normalized = (
    (tx_sparse_dropped - tx_sparse_dropped.mean(axis=0))
    / tx_sparse_dropped.std(axis=0, ddof=1)
)

for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_sparse_dropped_normalized, axis=0))

In [None]:
# EXPLORATORY 4
# PURE NUMPY VERSION

# Missing values (originally -999, NaN in tx_nan) are replaced by -99.
where_are_NaNs = np.isnan(tx_nan)
tx_99_filled = np.where(where_are_NaNs, -99.0, tx_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_99_filled_normalized = (
    (tx_99_filled - tx_99_filled.mean(axis=0))
    / tx_99_filled.std(axis=0, ddof=1)
)
for column_stat in (np.mean, np.std, np.min, np.max):
    print(column_stat(tx_99_filled_normalized, axis=0))

In [None]:
import itertools  

# Polynomial helper functions

def power_of_2_tx(x):
    """Return ``x`` multiplied by itself (element-wise for NumPy arrays)."""
    squared = x * x
    return squared


def combinations(array2d, indeces_list_a, indeces_list_b):
    """Append pairwise column products to a 2-D array.

    For every pair ``(a, b)`` in the Cartesian product of ``indeces_list_a``
    and ``indeces_list_b`` (in ``itertools.product`` order), the element-wise
    product of columns ``a`` and ``b`` of the *input* array is appended as a
    new right-most column.

    Args:
        array2d: 2-D NumPy array whose columns are the base features.
        indeces_list_a: iterable of column indices for the first factor.
        indeces_list_b: iterable of column indices for the second factor.

    Returns:
        A new array with one extra column per index pair. When there are no
        pairs, ``array2d`` itself is returned unchanged.
    """
    index_pairs = list(itertools.product(indeces_list_a, indeces_list_b))
    if not index_pairs:
        return array2d
    # All indices are resolved against the input array, so a negative index
    # never picks up a freshly generated feature, and the data is copied once
    # instead of once per pair.
    new_features = [array2d[:, a] * array2d[:, b] for a, b in index_pairs]
    return np.hstack((array2d, np.column_stack(new_features)))

# arr = np.array([[1,2],[1,2],[1,2]])
# print(arr)
# combinations(arr, [0], [1])
#         numpy.apply_along_axis

In [None]:
# Exploratory DATASET 5
# Base dimensions plus (base dimensions)^2, standardised with NaN-aware
# statistics, plus products of dimensions which are highly correlated.
# Missing values stay NaN through all of these steps and are replaced by -5
# only at the very end.
# PURE NUMPY VERSION

# Squaring is element-wise, so it is applied to the whole matrix in one
# vectorised call instead of one Python call per row.
tx_squared = power_of_2_tx(tx_nan)
tx_polynomial = np.hstack((tx_nan, tx_squared))
tx_polynomial_normalized = (
    (tx_polynomial - np.nanmean(tx_polynomial, axis=0))
    / np.nanstd(tx_polynomial, axis=0, ddof=1)
)
# Column-index pairs whose products are appended as extra features.
for first_columns, second_columns in (
    ([0], [1, 2, 7, 8]),
    ([4], [5, 6]),
    ([10], [16]),
    ([21], [23, 26, 29]),
    ([23], [26, 29]),
):
    tx_polynomial_normalized = combinations(tx_polynomial_normalized, first_columns, second_columns)
where_are_NaNs = np.isnan(tx_polynomial_normalized)
# Placeholder for missing values; original author's note:
# np.min(np.nanmin(tx_polynomial_normalized, axis=0)) = -85
tx_polynomial_normalized[where_are_NaNs] = -5
print(tx_polynomial_normalized.shape)

In [None]:
# Exploratory DATASET 6
# One-hot ("binarized") categories of column 22, base dimensions plus
# (base dimensions)^2, standardised with NaN-aware statistics, plus products
# of selected dimensions. Missing values stay NaN until the end, where they
# are replaced by -5.
# PURE NUMPY VERSION

# Element-wise squaring of the whole matrix in one vectorised call.
tx_squared = power_of_2_tx(tx_nan)
tx_bin_polynomial = np.hstack((tx_nan, tx_squared))


# One indicator column per distinct (integer-cast) value of column 22,
# built with a single broadcast comparison: (N, 1) == (k,) -> (N, k).
categories_list = np.unique(tx_bin_polynomial[:, 22]).astype(int)
categories = (tx_bin_polynomial[:, [22]] == categories_list).astype(float)
tx_bin_polynomial = np.hstack((categories, tx_bin_polynomial))
# NOTE(review): the indicator columns were just prepended, so the source
# column 22 now sits at index 22 + categories.shape[1]. Deleting index 22
# therefore removes a different column than the one that was one-hot encoded,
# and the hard-coded index pairs below address the shifted layout. Left
# unchanged because the prediction pipeline must be corrected in the same
# step — confirm intent.
tx_bin_polynomial = np.delete(tx_bin_polynomial, [22], axis=1)
tx_bin_polynomial_normalized = (
    (tx_bin_polynomial - np.nanmean(tx_bin_polynomial, axis=0))
    / np.nanstd(tx_bin_polynomial, axis=0, ddof=1)
)

# Column-index pairs whose products are appended as extra features.
for first_columns, second_columns in (
    ([0], [1, 2, 7, 8]),
    ([4], [5, 6]),
    ([10], [16]),
    ([21], [23, 26, 29]),
    ([23], [26, 29]),
):
    tx_bin_polynomial_normalized = combinations(tx_bin_polynomial_normalized, first_columns, second_columns)
where_are_NaNs = np.isnan(tx_bin_polynomial_normalized)
# Placeholder for missing values; original author's note:
# np.min(np.nanmin(tx_bin_polynomial_normalized, axis=0)) = -85
tx_bin_polynomial_normalized[where_are_NaNs] = -5
print(tx_bin_polynomial_normalized.shape)

In [None]:
# Let's build exactly the same datasets for the prediction set.
# PURE NUMPY VERSION
# NOTE(review): every dataset in this cell is imputed/standardised with
# statistics computed on the prediction set itself, not with the training
# statistics — confirm this is intended.
DATA_PRED_PATH = '../data/test.csv'
_, tx_pred, ids_pred = load_csv_data(DATA_PRED_PATH)

# Copy of the prediction features with every -999 placeholder turned into NaN.
tx_pred_nan = np.where(tx_pred == -999, np.nan, tx_pred)

# EXPLORATORY DATASET 1
# Impute every missing entry with the NaN-aware mean of its own column.
mean = np.nanmean(tx_pred_nan, axis=0)
# `mean` has one value per column and broadcasts across the rows.
tx_pred_mean_filled = np.where(np.isnan(tx_pred_nan), mean, tx_pred_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_pred_mean_filled_normalized = (
    (tx_pred_mean_filled - tx_pred_mean_filled.mean(axis=0))
    / tx_pred_mean_filled.std(axis=0, ddof=1)
)


# EXPLORATORY DATASET 2
# Replace every missing (NaN) entry with 0.
where_are_NaNs = np.isnan(tx_pred_nan)
tx_pred_zero_filled = np.where(where_are_NaNs, 0.0, tx_pred_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_pred_zero_filled_normalized = (
    (tx_pred_zero_filled - tx_pred_zero_filled.mean(axis=0))
    / tx_pred_zero_filled.std(axis=0, ddof=1)
)


# EXPLORATORY DATASET 3
# Collapse all sparse columns into one feature: their row-wise sum, where
# NaN entries contribute 0 (np.nansum).
sparse_sum = np.nansum(tx_pred_nan[:, sparse_columns], axis=1, keepdims=True)
# Remove the sparse columns and append the summed feature as the last column.
tx_pred_sparse_dropped = np.concatenate(
    (np.delete(tx_pred_nan, sparse_columns, axis=1), sparse_sum),
    axis=1,
)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_pred_sparse_dropped_normalized = (
    (tx_pred_sparse_dropped - tx_pred_sparse_dropped.mean(axis=0))
    / tx_pred_sparse_dropped.std(axis=0, ddof=1)
)


# EXPLORATORY 4
# Missing values (originally -999, NaN in tx_pred_nan) are replaced by -99.
where_are_NaNs = np.isnan(tx_pred_nan)
tx_pred_99_filled = np.where(where_are_NaNs, -99.0, tx_pred_nan)
# Standardise column-wise (sample standard deviation, ddof=1).
tx_pred_99_filled_normalized = (
    (tx_pred_99_filled - tx_pred_99_filled.mean(axis=0))
    / tx_pred_99_filled.std(axis=0, ddof=1)
)

# Exploratory DATASET 5
# Base dimensions plus (base dimensions)^2, standardised with NaN-aware
# statistics, plus products of dimensions which are highly correlated.
# Missing values stay NaN through all of these steps and are replaced by -5
# only at the very end.

# Squaring is element-wise, so it is applied to the whole matrix in one
# vectorised call instead of one Python call per row.
tx_pred_squared = power_of_2_tx(tx_pred_nan)
tx_pred_polynomial = np.hstack((tx_pred_nan, tx_pred_squared))
tx_pred_polynomial_normalized = (
    (tx_pred_polynomial - np.nanmean(tx_pred_polynomial, axis=0))
    / np.nanstd(tx_pred_polynomial, axis=0, ddof=1)
)
# Column-index pairs whose products are appended as extra features.
for first_columns, second_columns in (
    ([0], [1, 2, 7, 8]),
    ([4], [5, 6]),
    ([10], [16]),
    ([21], [23, 26, 29]),
    ([23], [26, 29]),
):
    tx_pred_polynomial_normalized = combinations(tx_pred_polynomial_normalized, first_columns, second_columns)
where_are_NaNs = np.isnan(tx_pred_polynomial_normalized)
tx_pred_polynomial_normalized[where_are_NaNs] = -5


# Exploratory DATASET 6
# One-hot ("binarized") categories of column 22, base dimensions plus
# (base dimensions)^2, standardised with NaN-aware statistics, plus products
# of selected dimensions. Missing values stay NaN until the end, where they
# are replaced by -5.

# Element-wise squaring of the whole matrix in one vectorised call.
tx_pred_squared = power_of_2_tx(tx_pred_nan)
tx_pred_bin_polynomial = np.hstack((tx_pred_nan, tx_pred_squared))

# One indicator column per distinct (integer-cast) value of column 22,
# built with a single broadcast comparison: (N, 1) == (k,) -> (N, k).
categories_list = np.unique(tx_pred_bin_polynomial[:, 22]).astype(int)
categories = (tx_pred_bin_polynomial[:, [22]] == categories_list).astype(float)
tx_pred_bin_polynomial = np.hstack((categories, tx_pred_bin_polynomial))
# NOTE(review): the indicator columns were just prepended, so the source
# column 22 now sits at index 22 + categories.shape[1]. Deleting index 22
# therefore removes a different column than the one that was one-hot encoded,
# and the hard-coded index pairs below address the shifted layout. Left
# unchanged because the training pipeline must be corrected in the same
# step — confirm intent.
tx_pred_bin_polynomial = np.delete(tx_pred_bin_polynomial, [22], axis=1)
tx_pred_bin_polynomial_normalized = (
    (tx_pred_bin_polynomial - np.nanmean(tx_pred_bin_polynomial, axis=0))
    / np.nanstd(tx_pred_bin_polynomial, axis=0, ddof=1)
)


# Column-index pairs whose products are appended as extra features.
for first_columns, second_columns in (
    ([0], [1, 2, 7, 8]),
    ([4], [5, 6]),
    ([10], [16]),
    ([21], [23, 26, 29]),
    ([23], [26, 29]),
):
    tx_pred_bin_polynomial_normalized = combinations(tx_pred_bin_polynomial_normalized, first_columns, second_columns)
where_are_NaNs = np.isnan(tx_pred_bin_polynomial_normalized)
# Placeholder for missing values; original author's note:
# np.min(np.nanmin(tx_pred_bin_polynomial_normalized, axis=0)) = -85
tx_pred_bin_polynomial_normalized[where_are_NaNs] = -5
print(tx_pred_bin_polynomial_normalized.shape)

In [None]:
# Split the raw features and every engineered dataset into train and test parts.
# NOTE(review): later cells pair y_train / y_test from the first call with the
# feature matrices from the other calls, which is only valid if split_data
# selects the same rows on every call — presumably it uses a fixed default
# seed; confirm.
split_ratio = 0.2
tx_train, tx_test, y_train, y_test = split_data(tx, y, split_ratio)
(tx_zero_filled_normalized_train,
 tx_zero_filled_normalized_test,
 y_zero_filled_normalized_train,
 y_zero_filled_normalized_test) = split_data(tx_zero_filled_normalized, y, split_ratio)
(tx_mean_filled_normalized_train,
 tx_mean_filled_normalized_test,
 y_mean_filled_normalized_train,
 y_mean_filled_normalized_test) = split_data(tx_mean_filled_normalized, y, split_ratio)
(tx_sparse_dropped_normalized_train,
 tx_sparse_dropped_normalized_test,
 y_sparse_dropped_normalized_train,
 y_sparse_dropped_normalized_test) = split_data(tx_sparse_dropped_normalized, y, split_ratio)
(tx_99_filled_normalized_train,
 tx_99_filled_normalized_test,
 y_99_filled_normalized_train,
 y_99_filled_normalized_test) = split_data(tx_99_filled_normalized, y, split_ratio)
(tx_polynomial_normalized_train,
 tx_polynomial_normalized_test,
 y_polynomial_normalized_train,
 y_polynomial_normalized_test) = split_data(tx_polynomial_normalized, y, split_ratio)
(tx_bin_polynomial_normalized_train,
 tx_bin_polynomial_normalized_test,
 y_bin_polynomial_normalized_train,
 y_bin_polynomial_normalized_test) = split_data(tx_bin_polynomial_normalized, y, split_ratio)

In [None]:
# Create lists of datasets.
# Every list follows the order of `datasets_names`; each `new_*` list is the
# corresponding full list without its leading raw/original entry, derived by
# slicing so the two can never drift apart.

# Train datasets
train_datasets = [
    tx_train,
    tx_zero_filled_normalized_train,
    tx_mean_filled_normalized_train,
    tx_sparse_dropped_normalized_train,
    tx_99_filled_normalized_train,
    tx_polynomial_normalized_train,
    tx_bin_polynomial_normalized_train,
]
new_train_datasets = train_datasets[1:]

# Test datasets
test_datasets = [
    tx_test,
    tx_zero_filled_normalized_test,
    tx_mean_filled_normalized_test,
    tx_sparse_dropped_normalized_test,
    tx_99_filled_normalized_test,
    tx_polynomial_normalized_test,
    tx_bin_polynomial_normalized_test,
]
new_test_datasets = test_datasets[1:]

# Prediction datasets
# Zero-filled comes before mean-filled, matching the train/test lists and
# `datasets_names` (the two entries used to be swapped here).
pred_datasets = [
    tx_pred,
    tx_pred_zero_filled_normalized,
    tx_pred_mean_filled_normalized,
    tx_pred_sparse_dropped_normalized,
    tx_pred_99_filled_normalized,
    tx_pred_polynomial_normalized,
    tx_pred_bin_polynomial_normalized,
]
new_pred_datasets = pred_datasets[1:]

datasets_names = ['Original/Raw', 'Zero filled', 'Mean filled', 'NaN dropped', '-99 filled', 'Polynomial', 'Binary Category Polynomial']
new_datasets_names = datasets_names[1:]

## Logistic Regression

In [None]:
# Parameters for the learning-rate sweep: iteration budget and 8 evenly
# spaced step sizes between 1.6e-12 and 0.8e-11 (both ends included).
max_iters = 5000
gammas = np.linspace(start=1.6e-12, stop=0.8e-11, num=8)

In [None]:
# Sweep the step sizes in `gammas` for logistic regression on the polynomial
# dataset; the trailing arguments are the dataset label and the figure id.
polynomial_split = (
    y_polynomial_normalized_train,
    y_polynomial_normalized_test,
    tx_polynomial_normalized_train,
    tx_polynomial_normalized_test,
)
train, test, weight = logistic_regression_dataset_gammas_test(
    *polynomial_split, max_iters, gammas, 'Polynomial', 1)

In [None]:
# Single logistic-regression run on the polynomial dataset with a fixed step size.
gamma = 0.8e-11
max_iters = 1000
logistic_regression_dataset_single_gamma_test(
    y_polynomial_normalized_train, y_polynomial_normalized_test,
    tx_polynomial_normalized_train, tx_polynomial_normalized_test,
    max_iters, gamma, 'Squared Normalized')

In [None]:
# Logistic regression on every engineered dataset, swept over `gammas`.
# WARNING: takes a lot of time for all datasets (~30 min)

# Parameters
max_iters = 100
gammas = np.logspace(-12, -11, 2)

# The position of a dataset in the lists doubles as its figure id.
for figure_id, (train_dataset, test_dataset, dataset_name) in enumerate(
        zip(new_train_datasets, new_test_datasets, new_datasets_names)):
    logistic_regression_dataset_gammas_test(
        y_train, y_test,
        train_dataset, test_dataset,
        max_iters, gammas,
        dataset_name, figure_id)

In [None]:
# Parameters for the single-gamma comparison across datasets
max_iters, gamma = 100, 1e-16

In [None]:
# Run logistic regression with the fixed `gamma` on every engineered dataset,
# collecting the fitted weights and both losses per dataset.
train_losses = []
test_losses = []
weights = []
for train_dataset, test_dataset, dataset_name in zip(
        new_train_datasets, new_test_datasets, new_datasets_names):
    train_rmse, test_rmse, weight = logistic_regression_dataset_single_gamma_test(
        y_train, y_test,
        train_dataset, test_dataset,
        max_iters, gamma, dataset_name)
    weights.append(weight)
    train_losses = np.append(train_losses, train_rmse)
    test_losses = np.append(test_losses, test_rmse)

# x-axis: position of each dataset in `new_train_datasets`.
dataset_positions = range(len(new_train_datasets))

# Figure 1: test loss per dataset.
plt.figure(1)
plt.title("Losses of Test Datasets")
plt.plot(dataset_positions, test_losses, marker=".", color='r', label='test error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()
# Figure 2: train loss per dataset.
plt.figure(2)
plt.title("Losses of Train Datasets")
plt.plot(dataset_positions, train_losses, marker=".", color='b', label='train error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()

## Least Squares

### Gradient Descent

In [None]:
# Gradient-descent least squares on the polynomial dataset, starting from
# an all-zero weight vector with one entry per feature column.
max_iters = 100
gamma = 1.2e-3
n_features = tx_polynomial_normalized_train.shape[1]
initial_w = np.zeros(n_features)
gradient_w, train_rmse = least_squares_GD(
    y_polynomial_normalized_train, tx_polynomial_normalized_train,
    initial_w, max_iters, gamma)
test_rmse = compute_RMSE(
    y_polynomial_normalized_test, tx_polynomial_normalized_test, gradient_w)
# Train loss on the first line, test loss on the second.
print(train_rmse, test_rmse, sep='\n')

In [None]:
# Gradient-descent least squares on every engineered dataset, swept over 10
# evenly spaced step sizes.
max_iters = 1000
gammas = np.linspace(1e-3, 0.4e-2, 10)
# The position of a dataset in the lists doubles as its figure id.
for figure_id, (train_dataset, test_dataset, dataset_name) in enumerate(
        zip(new_train_datasets, new_test_datasets, new_datasets_names)):
    least_squares_GD_gammas_test(
        y_train, y_test, train_dataset, test_dataset,
        gammas, max_iters, dataset_name, figure_id)

In [None]:
# Closed-form least squares on every engineered dataset: fit, evaluate on the
# test split, time each run, then plot both losses per dataset.
train_losses = []
test_losses = []
weights = []
for train_dataset, test_dataset, dataset_name in zip(
        new_train_datasets, new_test_datasets, new_datasets_names):
    start_time = datetime.datetime.now()

    least_squares_w, train_rmse = least_squares(y_train, train_dataset)
    weights.append(least_squares_w)
    test_rmse = compute_RMSE(y_test, test_dataset, least_squares_w)

    train_losses = np.append(train_losses, train_rmse)
    test_losses = np.append(test_losses, test_rmse)

    # The measured time covers the fit, the test evaluation and the bookkeeping above.
    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()
    print("Lest Squares: execution time={t:.3f} seconds. RMSE Train Loss={l}, Test Loss={tl}".format(
        t=exection_time, l=train_rmse, tl=test_rmse))

# x-axis: position of each dataset in `new_train_datasets`.
dataset_positions = range(len(new_train_datasets))

# Figure 1: test loss per dataset.
plt.figure(1)
plt.title("Losses of Test Datasets")
plt.plot(dataset_positions, test_losses, marker=".", color='r', label='test error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()
# Figure 2: train loss per dataset.
plt.figure(2)
plt.title("Losses of Train Datasets")
plt.plot(dataset_positions, train_losses, marker=".", color='b', label='train error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()

In [None]:
# Closed-form least squares on the binary-category polynomial dataset.
# Note: this overwrites `least_squares_w` left by any earlier cell.
least_squares_w, train_rmse = least_squares(
    y_bin_polynomial_normalized_train,
    tx_bin_polynomial_normalized_train,
)

test_rmse = compute_RMSE(
    y_bin_polynomial_normalized_test,
    tx_bin_polynomial_normalized_test,
    least_squares_w,
)
print(train_rmse, test_rmse)

## Ridge Regression

In [None]:
# Ridge regression on every engineered dataset, swept over 100 log-spaced
# regularisation strengths between 1e-10 and 1e-1.
lambdas = np.logspace(-10, -1, 100)
# The position of a dataset in the lists doubles as its figure id.
for figure_id, (train_dataset, test_dataset, dataset_name) in enumerate(
        zip(new_train_datasets, new_test_datasets, new_datasets_names)):
    ridge_regression_dataset_lamdas_test(
        y_train, y_test, train_dataset, test_dataset,
        lambdas, dataset_name, figure_id)

In [None]:
# Ridge regression on the polynomial dataset with one fixed regularisation strength.
lamb = 0.9e-3
ridge_regression_gradient_w, train_rmse = ridge_regression(
    y_polynomial_normalized_train, tx_polynomial_normalized_train, lamb)
test_rmse = compute_RMSE(
    y_polynomial_normalized_test, tx_polynomial_normalized_test,
    ridge_regression_gradient_w)
# Train loss on the first line, test loss on the second.
print(train_rmse, test_rmse, sep='\n')

In [None]:
# Ridge regression with the fixed regularisation strength `lamb` (set in the
# single-run ridge cell) on every engineered dataset: fit, evaluate on the
# test split, time each run, then plot both losses per dataset.
train_losses = []
test_losses = []
for train_dataset, test_dataset, dataset_name in zip(
        new_train_datasets, new_test_datasets, new_datasets_names):
    start_time = datetime.datetime.now()
    ridge_regression_gradient_w, ridge_regression_loss = ridge_regression(y_train, train_dataset, lamb)

    train_losses = np.append(train_losses, ridge_regression_loss)

    # Store the freshly computed test loss under the same name that is
    # appended and printed below. The result used to be written to a
    # differently spelled variable, so a stale value from an earlier cell
    # was recorded for every dataset.
    test_rmse = compute_RMSE(y_test, test_dataset, ridge_regression_gradient_w)
    test_losses = np.append(test_losses, test_rmse)

    end_time = datetime.datetime.now()
    exection_time = (end_time - start_time).total_seconds()
    print("Ridge Regression for {dn}: execution time={t:.3f} seconds. Test RMSE Loss={l}, Train RMSE Loss={tl}".format(dn = dataset_name, t=exection_time, l=test_rmse, tl=ridge_regression_loss))

# x-axis: position of each dataset in `new_train_datasets`.
dataset_positions = range(len(new_train_datasets))

# Figure 1: test loss per dataset.
plt.figure(1)
plt.title("Losses of Test Datasets")
plt.plot(dataset_positions, test_losses, marker=".", color='r', label='test error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()
# Figure 2: train loss per dataset.
plt.figure(2)
plt.title("Losses of Train Datasets")
plt.plot(dataset_positions, train_losses, marker=".", color='b', label='train error')
plt.xlabel("Datasets")
plt.ylabel("RMSE")
plt.grid(True)
plt.legend()

## Cross validation on best method

In [None]:
from plots import cross_validation_visualization

# Data used for cross-validation: all labels with the polynomial feature set.
subset_y = y
subset_tx = tx_polynomial_normalized


# Settings passed to cross_validation_mat.
seed = 1
k_fold = 10
# num=1, so only the first point of the range (1e-16) is generated.
lambdas = np.logspace(-16, 2, 1)

# One train / test loss per lambda.
rmse_tr = []
rmse_te = []
start_time = datetime.datetime.now()

for lambd in np.nditer(lambdas):
    loss_tr, loss_te = cross_validation_mat(subset_y, subset_tx, k_fold, seed, lambd)
    rmse_tr = np.append(rmse_tr, loss_tr)
    rmse_te = np.append(rmse_te, loss_te)

end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()

print("Cross Validation: execution time={t:.3f} seconds.".format(t=exection_time))
# Plotting is currently disabled:
#cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Predict labels for the prediction set with the current least-squares weights
# and write them to a submission CSV.
OUTPUT_PATH = '../data/least_squares_polynomial_submission.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): in top-to-bottom execution `least_squares_w` was last assigned
# by the fit on tx_bin_polynomial_normalized_train, which matches the
# prediction matrix used below — confirm no other cell reassigned it in between.
weights_pred = least_squares_w
y_pred = predict_labels(weights_pred, tx_pred_bin_polynomial_normalized)
create_csv_submission(ids_pred, y_pred, OUTPUT_PATH)