In [40]:
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from helpers import *

# Locations of the training and test CSV files passed to load_csv_data below.
# NOTE(review): these are absolute paths on one specific machine; the notebook
# cannot run elsewhere without editing them -- consider a configurable data dir.
cheminTrain = 'C:/Users/Raphael/Desktop/EPFL/Machine_Learning/train.csv'
cheminTest =  'C:/Users/Raphael/Desktop/EPFL/Machine_Learning/test.csv'
def compute_mse(y, tx, w):
    """Return the halved mean squared residual of the linear model ``tx @ w``.

    The value is ``||y - tx @ w||^2 / (2 * N)`` with ``N = tx.shape[0]``.
    """
    n_rows = tx.shape[0]
    residual = y - tx @ w
    scale = 1 / (2 * n_rows)
    return scale * np.linalg.norm(residual) ** 2
def compute_loss_rmse(y, tx, w):
    """Return the root mean squared error of the linear model ``tx @ w``.

    ``compute_mse`` carries a 1/2 factor, so the RMSE is ``sqrt(2 * mse)``.
    """
    doubled_mse = 2 * compute_mse(y, tx, w)
    return doubled_mse ** 0.5
def standardize(x):
    """Standardize the data set column-wise (zero mean, unit variance).

    Args:
        x: array whose first axis indexes the samples.

    Returns:
        A tuple ``(x_std, mean_x, std_x)`` where ``x_std`` is the standardized
        data and ``mean_x`` / ``std_x`` are the statistics computed along
        axis 0. ``std_x`` is returned as measured, so it may contain zeros.

    Columns with zero standard deviation (constant columns) are only centered:
    they are divided by 1 instead of 0, so the result contains zeros there
    rather than NaN.
    """
    mean_x = np.mean(x, axis=0)
    centered = x - mean_x
    std_x = np.std(centered, axis=0)
    # Guard against 0/0 on constant columns.
    safe_std = np.where(std_x == 0, 1.0, std_x)
    return centered / safe_std, mean_x, std_x
def standardizeNine(x):
    """Turn every column that contains -999 into a 0/1 indicator column.

    For each column of ``x`` holding at least one -999, the whole column is
    replaced by 1 where the entry equals -999 and 0 elsewhere. Other columns
    are copied unchanged. The input array is not modified.
    """
    missing = (x == -999)
    flagged_columns = np.any(missing, axis=0)
    encoded = np.copy(x)
    encoded[:, flagged_columns] = missing[:, flagged_columns].astype(int)
    return encoded
def build_k_indices(y, k_fold, seed):
    """Build ``k_fold`` equally sized groups of shuffled row indices.

    Seeds NumPy's global random generator with ``seed``, permutes the indices
    ``0 .. y.shape[0] - 1`` and slices the permutation into ``k_fold``
    consecutive chunks of ``int(y.shape[0] / k_fold)`` entries each. Leftover
    indices are dropped. Returns the chunks stacked in one array.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [18]:
def ridge_regression(y, tx, lambda_):
    """Fit ridge regression through the regularized normal equations.

    Solves ``(tx^T tx + 2 * N * lambda_ * I) w = tx^T y`` with
    ``N = y.shape[0]``.

    Args:
        y: target values, first axis of length N.
        tx: feature matrix with N rows.
        lambda_: regularization strength.

    Returns:
        A tuple ``(loss, w)`` where ``w`` is the fitted weight vector and
        ``loss`` is ``compute_mse(y, tx, w)`` (no penalty term included).
    """
    lambda_prim = 2 * y.shape[0] * lambda_
    gram = tx.T @ tx + lambda_prim * np.identity(tx.shape[1])
    # Solve the linear system directly rather than forming the explicit
    # inverse: cheaper and numerically more stable.
    w = np.linalg.solve(gram, tx.T @ y)
    loss = compute_mse(y, tx, w)
    return loss, w

# IMPORT DATA

In [38]:
# Load the training and test CSV files with the project helper load_csv_data.
# Each call returns three values; presumably (labels, feature matrix, row ids)
# -- confirm against proj1_helpers.
yb, input_data, ids = load_csv_data(cheminTrain)
yb_test, input_data_test, ids_test = load_csv_data(cheminTest)


# Quick look at the raw training features (the output shows -999 entries).
print(input_data)



[[ 138.47    51.655   97.827 ...,    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ..., -999.    -999.      46.226]
 [-999.     162.172  125.953 ..., -999.    -999.      44.251]
 ..., 
 [ 105.457   60.526   75.839 ..., -999.    -999.      41.992]
 [  94.951   19.362   68.812 ..., -999.    -999.       0.   ]
 [-999.      72.756   70.831 ..., -999.    -999.       0.   ]]


# Preprocess

In [39]:
# Replace every column containing the -999 sentinel by a 0/1 indicator.
withoutNine = standardizeNine(input_data)
print(input_data.shape)
print(input_data_test.shape)
# NOTE(review): the set of flagged columns is recomputed on the test set, so it
# matches the training set only if both contain -999 in the same columns.
whitoutNine_test = standardizeNine(input_data_test)
print(withoutNine)

# Fit the scaling on the training set only and reuse the same mean/std for the
# test set, so both are expressed in the same feature space. Standardizing the
# test set with its own statistics would shift and scale it differently from
# the data the weights were fitted on.
x, mean_x, std_x = standardize(withoutNine)
std_x_safe = np.where(std_x == 0, 1.0, std_x)  # avoid dividing by zero
x_test = (whitoutNine_test - mean_x) / std_x_safe
print(x)

# Affine rescaling of the labels; presumably maps {-1, 1} to {0, 1} -- confirm.
y = (yb + 1) / 2

(250000, 30)
(568238, 30)
[[   0.      51.655   97.827 ...,    0.       0.     113.497]
 [   0.      68.768  103.235 ...,    1.       1.      46.226]
 [   1.     162.172  125.953 ...,    1.       1.      44.251]
 ..., 
 [   0.      60.526   75.839 ...,    1.       1.      41.992]
 [   0.      19.362   68.812 ...,    1.       1.       0.   ]
 [   1.      72.756   70.831 ...,    1.       1.       0.   ]]
[[-0.42412233  0.06833197  0.40768027 ..., -1.56404344 -1.56404344
   0.4125105 ]
 [-0.42412233  0.55250482  0.54013641 ...,  0.63936843  0.63936843
  -0.27381996]
 [ 2.35781033  3.19515553  1.09655998 ...,  0.63936843  0.63936843
  -0.29396985]
 ..., 
 [-0.42412233  0.31931645 -0.13086367 ...,  0.63936843  0.63936843
  -0.31701723]
 [-0.42412233 -0.84532397 -0.30297338 ...,  0.63936843  0.63936843
  -0.74543941]
 [ 2.35781033  0.66533608 -0.25352276 ...,  0.63936843  0.63936843
  -0.74543941]]


In [88]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one cross-validation fold of ridge regression.

    Row ``k`` of ``k_indices`` selects the held-out samples; all other rows
    are flattened and used for training. ``degree`` is accepted but currently
    unused (no polynomial expansion is applied).

    Returns:
        ``(w, loss_tr, loss_te)``: the fitted weights and the ``compute_mse``
        loss on the training and held-out samples.
    """
    held_out = k_indices[k]
    remaining = np.delete(k_indices, k, 0).flatten()

    x_te, y_te = x[held_out], y[held_out]
    x_tr, y_tr = x[remaining], y[remaining]

    _, w = ridge_regression(y_tr, x_tr, lambda_)
    loss_tr = compute_mse(y_tr, x_tr, w)
    loss_te = compute_mse(y_te, x_te, w)

    return w, loss_tr, loss_te

In [85]:
def build_poly(x, degree):
    """Return the powers ``x**0, x**1, ..., x**degree`` of a 1-D input.

    For a 1-D ``x`` of length n the result has shape ``(n, degree + 1)``,
    with column ``j`` holding ``x**j``.
    """
    n_powers = degree + 1
    # One copy of x per exponent, then transpose so each row repeats one sample.
    repeated = np.tile(x, (n_powers, 1))
    return np.power(repeated.transpose(), range(n_powers))

In [86]:
# Sanity check of build_poly on a small 1-D input: each output row should be
# [1, v, v**2, v**3] for v in a.
a = np.array([1,2,3])
build_poly(a,3)

array([[ 1,  1,  1,  1],
       [ 1,  2,  4,  8],
       [ 1,  3,  9, 27]], dtype=int32)

In [89]:
from plots import cross_validation_visualization

def cross_validation_demo(seed=1, degree=7, k_fold=4, lambdas=None):
    """Plot cross-validated train/test RMSE of ridge regression against lambda.

    Uses the notebook-level ``x`` and ``y``. For each lambda, every fold is
    fitted with ``cross_validation`` and the per-fold RMSE values are averaged.

    Args:
        seed: seed used to shuffle the samples into folds.
        degree: forwarded to ``cross_validation``.
        k_fold: number of folds.
        lambdas: iterable of regularization strengths; defaults to
            ``np.logspace(-4, 0, 30)``.
    """
    if lambdas is None:
        lambdas = np.logspace(-4, 0, 30)

    k_indices = build_k_indices(y, k_fold, seed)

    rmse_tr = []
    rmse_te = []

    for lambda_ in lambdas:
        fold_rmse_tr = []
        fold_rmse_te = []
        for k in range(k_fold):
            _, loss_tr, loss_te = cross_validation(y, x, k_indices, k, lambda_, degree)
            # cross_validation returns the halved MSE from compute_mse, so
            # convert it to an RMSE: rmse = sqrt(2 * mse).
            fold_rmse_tr.append(np.sqrt(2 * loss_tr))
            fold_rmse_te.append(np.sqrt(2 * loss_te))

        rmse_tr.append(np.mean(fold_rmse_tr))
        rmse_te.append(np.mean(fold_rmse_te))

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)

# Run the lambda sweep and draw the cross-validation curves.
cross_validation_demo()