In [142]:
%matplotlib notebook
import numpy as np
from proj1_helpers import load_csv_data, create_csv_submission
from helpers import *
import matplotlib.pyplot as plt

cheminTrain = 'C:/Users/Raphael/Desktop/EPFL/Machine_Learning/train.csv'
cheminTest =  'C:/Users/Raphael/Desktop/EPFL/Machine_Learning/test.csv'
def compute_mse(y, tx, w):
    return (1/(2*tx.shape[0]))*np.linalg.norm(y-tx@w)**2
def compute_loss_rmse(y,tx,w):
    return (compute_mse(y,tx,w)*2)**0.5
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x, mean_x, std_x
def standardizeNine(x):
    mask = np.any( x == -999, axis=0)
    xWithnine = np.copy(x)
    columnStandardize = (x[:,mask] == -999).astype(int)
    xWithnine[:,mask] = columnStandardize
    return xWithnine
def addColumnNine(x):
    mask = np.any( x == -999, axis=0)
    columnStandardize = (x[:,mask] == -999).astype(int)
    return columnStandardize
    
def build_k_indices(y, k_fold, seed):
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)
def predict_labels(weights, data):
    """Generates class predictions given weights, and a test data matrix"""
    y_pred = np.dot(data, weights)
    y_pred[np.where(y_pred <= 0.5)] = -1
    y_pred[np.where(y_pred > 0.5)] = 1
    return y_pred

In [106]:
def ridge_regression(y, tx, lambda_):
    txt = tx.transpose()
    lambda_prim = 2*y.shape[0]*lambda_
    identity = np.identity(tx.shape[1])
    w = np.linalg.inv(txt@tx+lambda_prim*identity)@txt@y
    #w= np.linalg.solve(txt@tx+lambda_prim*identity,txt@y)
    return w

In [107]:
def cross_validation_visualization(lambds, mse_tr, mse_te):

    plt.semilogx(lambds, mse_tr, marker=".", color='b', label='train error')
    plt.semilogx(lambds, mse_te, marker=".", color='r', label='test error')
    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.title("cross validation (k_fold = 5): Ridge regression")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")

# IMPORT DATA

In [108]:
# load data.
yb, input_data, ids = load_csv_data(cheminTrain)
yb_test, input_data_test, ids_test = load_csv_data(cheminTest)


print(input_data)



[[ 138.47    51.655   97.827 ...,    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ..., -999.    -999.      46.226]
 [-999.     162.172  125.953 ..., -999.    -999.      44.251]
 ..., 
 [ 105.457   60.526   75.839 ..., -999.    -999.      41.992]
 [  94.951   19.362   68.812 ..., -999.    -999.       0.   ]
 [-999.      72.756   70.831 ..., -999.    -999.       0.   ]]


# Preprocess

In [109]:
#withoutNine = standardizeNine(input_data)
print(input_data.shape)
print(input_data_test.shape)
#whitoutNine_test = standardizeNine(input_data_test)
print(withoutNine)
x,_,_ = standardize(input_data)
x_test,_,_ = standardize(input_data_test)

y = (yb + 1) / 2

(250000, 30)
(568238, 30)
[[   0.      51.655   97.827 ...,    0.       0.     113.497]
 [   0.      68.768  103.235 ...,    1.       1.      46.226]
 [   1.     162.172  125.953 ...,    1.       1.      44.251]
 ..., 
 [   0.      60.526   75.839 ...,    1.       1.      41.992]
 [   0.      19.362   68.812 ...,    1.       1.       0.   ]
 [   1.      72.756   70.831 ...,    1.       1.       0.   ]]
[[-0.42412233  0.06833197  0.40768027 ..., -1.56404344 -1.56404344
   0.4125105 ]
 [-0.42412233  0.55250482  0.54013641 ...,  0.63936843  0.63936843
  -0.27381996]
 [ 2.35781033  3.19515553  1.09655998 ...,  0.63936843  0.63936843
  -0.29396985]
 ..., 
 [-0.42412233  0.31931645 -0.13086367 ...,  0.63936843  0.63936843
  -0.31701723]
 [-0.42412233 -0.84532397 -0.30297338 ...,  0.63936843  0.63936843
  -0.74543941]
 [ 2.35781033  0.66533608 -0.25352276 ...,  0.63936843  0.63936843
  -0.74543941]]


In [129]:
def cross_validation(y, x, k_indices, k, lambda_, degree):

    testLine = k_indices[k]
    k_indices_prim = np.delete(k_indices,k,0).flatten()
    testX = x[testLine]
    testY = y[testLine]
    trainX = x[k_indices_prim]
    trainY = y[k_indices_prim]
    
    traintmpX = build_poly(trainX,degree)
    testmpX = build_poly(testX,degree)
    
    loss, w = ridge_regression(trainY,traintmpX,lambda_)
    loss_tr = loss
    loss_te = compute_mse(testY,testmpX,w)
    
    y_predTest = predict_labels(w, testmpX)
    print(len(y_predTest[y_predTest == -1]) / len(y_predTest))
    
    return w, loss_tr, loss_te

In [130]:
def build_poly(x, degree):
    y = np.copy(x)
    for i in range(1,degree):
        y = np.concatenate((y,np.power(x,i+1)),axis=1)
                           
    return y

In [151]:
a = np.array([[1,2,3],[2,2,2]])
build_poly(a,2)
b = np.array([[-999,2,3],[-999,2,2],[3,4,-999]])

mask = np.any( b == -999, axis=0)
print(mask)
df3 = b[:,mask]
for i in range(df3.shape[1]):
    test = df3[:,i]
    test = test[test != -999]
    print(np.mean(test))
    print(test)
    res = np.concatenate((res,test[tes]))
#print(df3)
#print(df3)
#columnStandardize = (x[:,mask] == -999).astype(int)
#xWithnine[:,mask] = columnStandardize

[ True False  True]
3.0
[3]
2.5
[3 2]


In [138]:

def cross_validation_demo():
    seed = 1
    degree = 2
    k_fold = 5
    xx = np.concatenate(x,addColumnNine(input_data))
    
    lambdas = np.logspace(-8, -5, 5)
    
    k_indices = build_k_indices(y, k_fold, seed)
    
    rmse_tr = []
    rmse_te = []

    for ind, lambda_ in enumerate(lambdas):
        loss_tr_total = 0;
        loss_te_total = 0;
        for k in range(k_fold) :
            w,loss_tr,loss_te = cross_validation(y, x, k_indices, k, lambda_, degree)
            loss_tr_total += loss_tr
            loss_te_total += loss_te

        rmse_tr.append(loss_tr_total/k_fold)
        rmse_te.append(loss_te_total/k_fold)
        print("Current lambda = {i}".format(i=lambda_))

    print(rmse_tr)
    print(rmse_te)
    cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [139]:

cross_validation_demo()

0.71156
0.71548
0.7089
0.7104
0.71034
Current lambda = 1e-08
0.71156
0.71548
0.7089
0.7104
0.71034
Current lambda = 5.6234132519034905e-08
0.71156
0.71548
0.7089
0.7104
0.71034
Current lambda = 3.162277660168379e-07
0.71156
0.71548
0.7089
0.71042
0.71034
Current lambda = 1.778279410038923e-06
0.7116
0.71552
0.7089
0.71048
0.71034
Current lambda = 1e-05
[0.082977550626703242, 0.082977551226212268, 0.082977551314610154, 0.082977551339928388, 0.082977551631743279]
[0.097073948665109627, 0.097073951366720396, 0.097073992156195804, 0.097074224939329073, 0.097075535376539207]


<IPython.core.display.Javascript object>

In [121]:
lambda_ = 0.00001
degree = 3
traintmpX = build_poly(x[:,0:3],degree)
testmpX = build_poly(x_test[:,0:3],degree)
loss, w = ridge_regression(y,traintmpX,lambda_)
y_pred = predict_labels(w, testmpX)
create_csv_submission(ids_test, y_pred, 'submission_for_kaggle.csv')

In [122]:
len(y_pred[y_pred == -1]) / len(y_pred)

0.7387784695849274

In [124]:
len(yb[yb == -1]) / len(yb)

0.657332

In [125]:
y_pred.shape

(568238,)