In [69]:
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission

In [2]:
def compute_mse(y, tx, w):
    """Calculate the loss using mse."""
    N = y.shape[0]
    e = y - tx @ w.T
    return 1 / (2 * N) * np.linalg.norm(e) ** 2

def compute_gradient(y, tx, w):
    """Compute the gradient."""
    N = y.shape[0]
    e = y - tx @ w.T
    return -1 / N * tx.T @ e 

def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Least squares using gradient descent algorithm."""
    w = initial_w
    
    for _ in range(max_iters):
        DL = compute_gradient(y, tx, w)        
        w = w - DL * gamma
        
    return w, compute_mse(y, tx, w)

def least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma):
    """Least squares using stochastic gradient descent algorithm."""
    w = initial_w
    
    for _ in range(max_iters):        
        for yn, txn in batch_iter(y, tx, batch_size):
            DL_n = compute_stoch_gradient(yn, txn, w)
            w = w - DL_n * gamma
                        
    return w, compute_mse(y, tx, w)

def least_squares(y, tx):
    """Least squares using normal equation."""
    a = tx.T @ tx
    b = tx.T @ y
    w = np.linalg.solve(a, b)
    return w, compute_mse(y, tx, w)

def ridge_regression(y, tx, lambda_):
    """Ridge regression."""
    N = tx.shape[0]
    a = (tx.T @ tx) + 2 * N * lambda_ * np.eye(tx.shape[1])
    b = tx.T @ y
    w = np.linalg.solve(a, b)
    return w, compute_mse(y, tx, w)

def sigmoid(t):
    """Sigmoid function on t."""
    return 1 / (1 + np.exp(-t))

def calculate_loss(y, tx, w):
    """Cost by negative log likelihood."""
    sigma_tx_w = sigmoid(tx @ w)
    sum_terms = y * np.log(sigma_tx_w) + (1 - y) * np.log(sigma_tx_w)
    return -sum_terms.sum()

def calculate_gradient(y, tx, w):
    """Gradient of loss."""
    tx_w = tx @ w
    sigma_tx_w = sigmoid(tx_w)
    # print('simga_tx_w - y')
    # print(sigma_tx_w - y)
    grad = tx.T @ (sigma_tx_w - y)
    # print('tx.T')
    # print(tx.T.shape)
    return grad

import time

def logistic_regression(y, tx, initial_w, max_iters, gamma):
    '''Logistic regression.'''
    w = initial_w
    
    for _ in range(max_iters):
        # time.sleep(3) 
        grad = calculate_gradient(y, tx, w)
        #print('grad')
        #print(grad)
        # print('w')
        # print(w)
        w = w - gamma * grad
        
    return w, calculate_loss(y, x, w)

def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    w = initial_w
    
    for _ in range(max_iters):
        grad = calculate_gradient(y, tx, w) + lambda_ * np.sum(w)
        w = w - gamma * grad
        
    return w, calculate_loss(y, tx, w)

def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x, mean_x, std_x

In [3]:
def split_data(x, y, ratio, seed=None):
    """
    split the dataset based on the split ratio. If ratio is 0.8 
    you will have 80% of your data set dedicated to training 
    and the rest dedicated to testing
    """
    # set seed
    if not seed is None:
        np.random.seed(seed)
    # ***************************************************
    # INSERT YOUR CODE HERE
    # split the data based on the given ratio: TODO
    # ***************************************************
    d = x.shape[0]
    di = int(d * ratio)
    
    per = np.random.permutation(d)
    
    xtraining = x[per][:di]
    ytraining = y[per][:di]
    xtesting = x[per][di:]
    ytesting = y[per][di:]
    
    return xtraining, ytraining, xtesting, ytesting

In [4]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    # ***************************************************
    # INSERT YOUR CODE HERE
    # polynomial basis function: TODO
    # this function should return the matrix formed
    # by applying the polynomial basis to the input data
    # ***************************************************
    phi = np.ones((x.shape[0], 1))
    for deg in range(1, degree+1):
        phi = np.c_[phi, x ** deg]
    return phi

In [None]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

In [None]:
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # ***************************************************
    # INSERT YOUR CODE HERE
    # get k'th subgroup in test, others in train: TODO
    # ***************************************************
    train_indices = np.delete(k_indices, k, 0).flatten()
    test_indices = k_indices[k]
    
    x_train = x[train_indices]
    y_train = y[train_indices]
    x_test = x[test_indices]
    y_test = y[test_indices]
    
    # ***************************************************
    # INSERT YOUR CODE HERE
    # form data with polynomial degree: TODO
    # ***************************************************
    phi_train = build_poly(x_train, degree)
    phi_test = build_poly(x_test, degree)
    
    # ***************************************************
    # INSERT YOUR CODE HERE
    # ridge regression: TODO
    # ***************************************************
    w_opt, _ = ridge_regression(y_train, phi_train, lambda_)
    
    # print(w_opt)
    
    # ***************************************************
    # INSERT YOUR CODE HERE
    # calculate the loss for train and test data: TODO
    # ***************************************************
    loss_tr = np.sqrt(2 * compute_mse(y_train, phi_train, w_opt))
    loss_te = np.sqrt(2 * compute_mse(y_test, phi_test, w_opt))
    
    return w_opt, loss_tr, loss_te

In [None]:
from plots import cross_validation_visualization

def cross_validation_demo(y, x):
    seed = 1
    degree = 3
    k_fold = 4
    lambdas = np.logspace(-10, 2, 50)
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    # ***************************************************
    # INSERT YOUR CODE HERE
    # cross validation: TODO
    # ***************************************************
    tr = np.zeros([len(lambdas), k_fold])
    te = np.zeros([len(lambdas), k_fold])
    
    for ind, lambda_ in enumerate(lambdas):
        for k in range(k_fold):
            _, loss_tr, loss_te = cross_validation(y, x, k_indices, k, lambda_, degree)
            tr[ind, k] = loss_tr
            te[ind, k] = loss_te
            
    rmse_tr = np.mean(tr, axis=1)
    rmse_te = np.mean(te, axis=1)
            
    cross_validation_visualization(lambdas, rmse_tr, rmse_te)


------------
# Importing data

In [70]:
# yb, input_data, ids = load_csv_data('C:/Users/Thibaud/Documents/data/train.csv', True)

In [63]:
yb_full, input_data_full, ids_full = load_csv_data('C:/Users/Thibaud/Documents/data/train.csv')

In [64]:
yb_test, input_data_test, ids_test = load_csv_data('C:/Users/Thibaud/Documents/data/test.csv')

In [95]:
yb, input_data, ids = yb_full[::10], input_data_full[::10], ids_full[::10]

In [96]:
yb.shape, input_data.shape, ids.shape

((25000,), (25000, 30), (25000,))

---------------

In [None]:
#cross_validation(yb, input_data, build_k_indices(yb, 5, 3), 0, 0.1, 3)
cross_validation_demo(yb, input_data)

-----

------------------------
# Treating data

In [9]:
def replace_nans_with_mean(arr, nan=-999):
    '''Creates a copy and replaces the nan values by the mean (without those nan values) in the column'''
    N, D = arr.shape
    copy = arr.copy()
    
    for d in range(D):
        copy[:,d][copy[:,d] == nan] = np.mean(arr[:,d][arr[:,d] != nan])
        
    return copy

In [10]:
def replace_nans_with_most_frequent(arr, nan=-999):
    '''Creates a copy and replaces the nan values by the most frequent value in the column'''
    N, D = arr.shape
    copy = arr.copy()
    
    for d in range(D):
        unique, counts = np.unique(arr[:,d], return_counts=True)
        copy[:,d][copy[:,d] == nan] = unique[np.argmax(counts[unique != nan])]
        
    return copy

In [11]:
def replace_nans_with_median(arr, nan=-999):
    '''Creates a copy and replaces the nan values by the median (without thos nan values) in the column'''
    N, D = arr.shape
    copy = arr.copy()
    
    for d in range(D):
        copy[:,d][copy[:,d] == nan] = np.median(arr[:,d][arr[:,d] != nan])
        
    return copy

In [12]:
def prediction(w, x_test, y_test, small=-1, big=1, verbose=False):
    y_pred = x_test @ w
    sep_val = (small + big) / 2
    y_pred[y_pred < sep_val] = small
    y_pred[y_pred >= sep_val] = big
    
    bad = np.count_nonzero(y_pred - y_test)
    good = y_test.shape[0] - bad
    
    ratio = good / (good + bad)
    
    if verbose:
        print('Good: ', good)
        print('Bad: ', bad)
        print('Ratio: ', ratio)
    
    return ratio

### Actually no good ideas $\downarrow$

Nice idea $\downarrow$

In [None]:
no_nans = replace_nans_with_mean(input_data)
no_nans_std, _, _ = standardize(no_nans)

x_train, y_train, x_test, y_test = split_data(no_nans_std, yb, 0.5)

phi = build_poly(x_train, 7)

w, _ = least_squares(y_train, phi)
prediction(w, build_poly(x_test, 7), y_test, verbose=True)

Better idea $\downarrow$

In [None]:
no_nans = replace_nans_with_median(input_data)
no_nans_std, _, _ = standardize(no_nans)

y = (yb + 1) / 2

x_train, y_train, x_test, y_test = split_data(no_nans_std, y, 0.5)

phi = build_poly(x_train, 7)

w, _ = least_squares(y_train, phi)
prediction(w, build_poly(x_test, 7), y_test, small=0, big=1, verbose=True)

Bad idea $\downarrow$

In [None]:
extra_col = add_extra_col(input_data)
no_nans = replace_nans_with_mean(extra_col)
no_nans_std, _, _ = standardize(no_nans)

x_train, y_train, x_test, y_test = split_data(no_nans_std, y, 0.5)

phi = build_poly(x_train, 7)
w, _ = least_squares(y_train, phi)
prediction(w, build_poly(x_test, 7), y_test, verbose=True)

Better idea $\downarrow$

In [None]:
rows_with_nans = np.any(input_data == -999, axis=1).astype(int)
no_nans = replace_nans_with_mean(input_data)
no_nans_std, _, _ = standardize(no_nans)
concat = np.c_[no_nans_std, rows_with_nans]

x_train, y_train, x_test, y_test = split_data(concat, yb, 0.5)

phi = build_poly(x_train, 7)

w, _ = ridge_regression(y_train, phi, 0.00001)
prediction(w, build_poly(x_test, 7), y_test, verbose=True)

-----------

In [None]:
deg_min = 1
deg_max = 9
degrees = np.linspace(deg_min, deg_max, deg_max - deg_min + 1).astype(int)
ratios = []
losses_tr = []
losses_te = []

extra_col = np.any(input_data == -999, axis=1).astype(int)
no_nans = replace_nans_with_median(input_data)
no_nans_std, _, _ = standardize(no_nans)
x_all = np.c_[no_nans_std, extra_col]

x_train, y_train, x_test, y_test = split_data(no_nans, yb, 0.9)

for degree in degrees:
    phi_train = build_poly(x_train, degree)
    phi_test = build_poly(x_test, degree)
    
    w, loss = ridge_regression(y_train, phi_train, 0.00001)
    
    losses_tr.append(np.sqrt(compute_mse(y_train, phi_train, w)))
    losses_te.append(np.sqrt(compute_mse(y_test, phi_test, w)))
    
    ratios.append(prediction(w, phi_test, y_test))
    
sns.plt.plot(degrees, ratios)
sns.plt.show()

In [None]:
sns.plt.plot(degrees, losses_tr)
sns.plt.show()

In [None]:
sns.plt.plot(degrees, losses_te)
sns.plt.show()

In [None]:
best_deg = degrees[np.argmax(ratios)]
degrees[np.argmax(ratios)], np.max(ratios)

In [None]:
lambda_min = -15
lambda_max = 0
lambdas = np.logspace(lambda_min, lambda_max, lambda_max - lambda_min + 1)
ratios = []
losses_tr = []
losses_te = []
'''
extra_col = np.any(input_data == -999, axis=1).astype(int)
no_nans = replace_nans_with_median(input_data)
no_nans_std, _, _ = standardize(no_nans)
x_all = np.c_[no_nans_std, extra_col]

x_train, y_train, x_test, y_test = split_data(no_nans_std, yb, 0.5)
'''
for lambda_ in lambdas:
    phi_train = build_poly(x_train, best_deg)
    phi_test = build_poly(x_test, best_deg)
    
    w, loss = ridge_regression(y_train, phi_train, lambda_)
    
    losses_tr.append(np.sqrt(compute_mse(y_train, phi_train, w)))
    losses_te.append(np.sqrt(compute_mse(y_test, phi_test, w)))
    
    ratios.append(prediction(w, phi_test, y_test))
    
sns.plt.semilogx(lambdas, ratios)
sns.plt.show()

In [None]:
sns.plt.semilogx(lambdas, losses_tr)
sns.plt.show()

In [None]:
sns.plt.semilogx(lambdas, losses_te)
sns.plt.show()

In [None]:
lambdas[np.argmax(ratios)], np.max(ratios)

-------------------

In [None]:
# extra_col = np.any(input_data == -999, axis=1).astype(int)
input_data_clean = replace_nans_with_median(np.delete(input_data, [22, 29], 1))
input_data_std, _, _ = standardize(input_data_clean)

times = 1000
degrees = np.linspace(3, 5, 3).astype(int) # [1, 2, 3]
lambdas = np.logspace(-5, 0, 30) # [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]

rmse_tr = np.zeros([times, len(degrees), len(lambdas)])
rmse_te = np.zeros([times, len(degrees), len(lambdas)])
bests = np.zeros(len(degrees))

for time in range(times):
    x_tr, y_tr, x_te, y_te = split_data(input_data_std, yb, 0.8, seed=time)
    # x_tr, y_tr, x_te, y_te, per, di = split_data_2(input_data_std, yb, 0.8, seed=time)
    
    # extra_col_tr = extra_col[per][:di]
    # extra_col_te = extra_col[per][di:]

    for i, degree in enumerate(degrees):
        phi_tr = build_poly(x_tr, degree)
        phi_te = build_poly(x_te, degree)
        # phi_tr = np.c_[build_poly(x_tr, degree), extra_col_tr]
        # phi_te = np.c_[build_poly(x_te, degree), extra_col_te]

        for j, lambda_ in enumerate(lambdas):
            w, _ = ridge_regression(y_tr, phi_tr, lambda_)

            rmse_tr[time, i, j] = np.sqrt(2 * compute_mse(y_tr, phi_tr, w))
            rmse_te[time, i, j] = np.sqrt(2 * compute_mse(y_te, phi_te, w))
            
    if time == 0.1 * times:
        print('10%')
    if time == 0.2 * times:
        print('20%')
    if time == 0.3 * times:
        print('30%')
    if time == 0.4 * times:
        print('40%')
    if time == 0.5 * times:
        print('50%')
    if time == 0.6 * times:
        print('60%')
    if time == 0.7 * times:
        print('70%')
    if time == 0.8 * times:
        print('80%')
    if time == 0.9 * times:
        print('90%')
        

_, ax = sns.plt.subplots(len(degrees), 1, figsize=(8, len(degrees) * 20 / 3))
        
for i, degree in enumerate(degrees):
    ax[i].semilogx(lambdas, rmse_tr.mean(axis=0)[i], color='b', marker='o')
    ax[i].semilogx(lambdas, rmse_te.mean(axis=0)[i], color='r', marker='o')
    ax[i].set_xlabel("lambda")
    ax[i].set_ylabel("RMSE")
    ax[i].set_title("degree " + str(degree) + " :")
    
    bests[i] = lambdas[np.argmin(rmse_te.mean(axis=0)[i])]
    
sns.plt.show()

In [None]:
bests

-----------

------------
# Separating data

Data separated by the value in column 22 which is a categorical column

In [100]:
def separate_by_col22(x, idd, y):    
    x_22 = [np.delete(x[x[:,22] == i], 22, 1) for i in range(4)]
    idd_22 = [idd[x[:,22] == i] for i in range(4)]
    y_22 = [y[x[:,22] == i] for i in range(4)]
    
    return x_22, idd_22, y_22

In [101]:
input_data_by_22, ids_by_22, yb_by_22 = separate_by_col22(input_data, ids, yb)

Seeing the eprcentages of -999 in each column of the separated data

In [102]:
for i_22 in range(4):
    print(i_22)
    for c in range(input_data_by_22[i_22].shape[1]):
        tmp = input_data_by_22[i_22][:,c]
        
        if np.any(tmp[tmp == -999]):
            print('  ', c, ':', len(tmp[tmp == -999]) / len(tmp))

0
   0 : 0.2672499502883277
   4 : 1.0
   5 : 1.0
   6 : 1.0
   12 : 1.0
   22 : 1.0
   23 : 1.0
   24 : 1.0
   25 : 1.0
   26 : 1.0
   27 : 1.0
1
   0 : 0.10520246027678114
   4 : 1.0
   5 : 1.0
   6 : 1.0
   12 : 1.0
   25 : 1.0
   26 : 1.0
   27 : 1.0
2
   0 : 0.05761895201766713
3
   0 : 0.07324988409828466


In [103]:
def delete_useless_col(x):
    useless_cols = [[4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28], [4, 5, 6, 12, 25, 26, 27], [], []]
    return [np.delete(x[i], useless_cols[i], 1) for i in range(4)]

In [104]:
only_good_data = delete_useless_col(input_data_by_22)

In [105]:
yb_by_22[0].shape, yb.shape

((10058,), (25000,))

In [107]:
np.any(only_good_data[3] == -999)

True

In [108]:
def pseudo_cross_validation():
    bests = []
    
    for i_22 in range(4):
        print('starting ', i_22)
        input_data_clean = replace_nans_with_median(only_good_data[i_22])
        input_data_std, _, _ = standardize(input_data_clean)

        times = 200
        degrees = np.linspace(6, 13, 8).astype(int) # [1, 2, 3]
        lambdas = np.logspace(-7, -4, 15) # [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]

        rmse_tr = np.zeros([times, len(degrees), len(lambdas)])
        rmse_te = np.zeros([times, len(degrees), len(lambdas)])

        for time in range(times):
            x_tr, y_tr, x_te, y_te = split_data(input_data_std, yb_by_22[i_22], 0.8)

            for i, degree in enumerate(degrees):
                phi_tr = build_poly(x_tr, degree)
                phi_te = build_poly(x_te, degree)

                for j, lambda_ in enumerate(lambdas):
                    try:
                        w, _ = ridge_regression(y_tr, phi_tr, lambda_)

                        rmse_tr[time, i, j] = np.sqrt(2 * compute_mse(y_tr, phi_tr, w))
                        rmse_te[time, i, j] = np.sqrt(2 * compute_mse(y_te, phi_te, w))
                    except:
                        rmse_tr[time, i, j] = 1
                        rmse_te[time, i, j] = 1

            if time == 0.1 * times:
                print('  10%')
            if time == 0.2 * times:
                print('  20%')
            if time == 0.3 * times:
                print('  30%')
            if time == 0.4 * times:
                print('  40%')
            if time == 0.5 * times:
                print('  50%')
            if time == 0.6 * times:
                print('  60%')
            if time == 0.7 * times:
                print('  70%')
            if time == 0.8 * times:
                print('  80%')
            if time == 0.9 * times:
                print('  90%')
            if time == times - 1:
                print(' 100%')
        
        pos = np.unravel_index(np.median(rmse_te, axis=0).argmin(), np.median(rmse_te, axis=0).shape)
        bests.append((degrees[pos[0]], lambdas[pos[1]]))
        
    return bests

In [88]:
pseudo_cross_validation()

starting  0
  10%
  20%
  30%
  40%
  50%
  60%
  70%
  80%
  90%
 100%
starting  1
  10%
  20%
  30%
  40%
  50%
  60%
  70%
  80%
  90%
 100%
starting  2
  10%
  20%
  30%
  40%
  50%
  60%
  70%
  80%
  90%
 100%
starting  3
  10%
  20%
  30%
  40%
  50%
  60%
  70%
  80%
  90%
 100%


[(6, 0.0001),
 (6, 0.0001),
 (6, 7.1968567300115142e-07),
 (6, 9.9999999999999995e-08)]

In [None]:
bests = _

In [None]:
bests

In [14]:
bests = [(2, 0.0038566204211634724), (2, 0.03562247890262444), (3, 0.062101694189156162), (2, 0.32903445623126709)]

In [38]:
bests = [(12, 9e-06), (7, 1.65e-05), (10, 2.42e-06), (8, 4e-05)]

-------------------
# Creating submission

In [15]:
def predict(w, x_test, small=-1, big=1):
    y_pred = x_test @ w
    sep_val = (small + big) / 2
    y_pred[y_pred < sep_val] = small
    y_pred[y_pred >= sep_val] = big
    
    return y_pred

In [24]:
input_data_full_by_22, ids_full_by_22, yb_full_by_22 = separate_by_col22(input_data_full, ids_full, yb_full)
input_data_test_by_22, ids_test_by_22, yb_test_by_22 = separate_by_col22(input_data_test, ids_test, yb_test)

input_data_full_by_22[0].shape, yb_full_by_22[0].shape

((99913, 29), (99913,))

In [51]:
for i_22 in range(4):
    print(i_22)
    for c in range(input_data_full_by_22[i_22].shape[1]):
        tmp = input_data_full_by_22[i_22][:,c]
        
        if np.any(tmp[tmp == -999]):
            print('  ', c, ':', len(tmp[tmp == -999]) / len(tmp))

0
   0 : 0.2614574679971575
   4 : 1.0
   5 : 1.0
   6 : 1.0
   12 : 1.0
   22 : 1.0
   23 : 1.0
   24 : 1.0
   25 : 1.0
   26 : 1.0
   27 : 1.0
1
   0 : 0.09751882802022077
   4 : 1.0
   5 : 1.0
   6 : 1.0
   12 : 1.0
   25 : 1.0
   26 : 1.0
   27 : 1.0
2
   0 : 0.05859584350622283
3
   0 : 0.06663959574084101


In [39]:
only_good_data_full = delete_useless_col(input_data_full_by_22)
only_good_data_test = delete_useless_col(input_data_test_by_22)

for i_22 in range(4):
    full_std, _, _ = standardize(only_good_data_full[i_22])
    test_std, _, _ = standardize(only_good_data_test[i_22])
    
    phi_full = build_poly(full_std, bests[i_22][0])
    phi_test = build_poly(test_std, bests[i_22][0])
    
    w, _ = ridge_regression(yb_full_by_22[i_22], phi_full, bests[i_22][1])
    
    yb_test_by_22[i_22] = predict(w, phi_test)
    
    prediction(w, phi_full, yb_full_by_22[i_22], verbose=True)
    
yb_test_by_22

Good:  81875
Bad:  18038
Ratio:  0.8194629327514938
Good:  60104
Bad:  17440
Ratio:  0.77509542969153
Good:  40810
Bad:  9569
Ratio:  0.8100597471168542
Good:  17548
Bad:  4616
Ratio:  0.7917343439812308


[array([-1., -1.,  1., ..., -1., -1., -1.]),
 array([-1., -1., -1., ..., -1.,  1., -1.]),
 array([-1.,  1., -1., ...,  1.,  1.,  1.]),
 array([-1., -1., -1., ..., -1., -1., -1.])]

In [40]:
yb_submit = np.concatenate(yb_test_by_22)
ids_submit = np.concatenate(ids_test_by_22)

create_csv_submission(ids_submit, yb_submit, 'submission_by_cat.csv')

In [None]:
input_data_clean = replace_nans_with_median(input_data)
input_data_std, _, _ = standardize(input_data_clean)

# y = (yb + 1) / 2

phi = build_poly(input_data_std, 3)

w, _ = ridge_regression(yb, phi, 0.00031)

input_data_clean_test = replace_nans_with_median(input_data_test)
input_data_std_test, _, _ = standardize(input_data_clean_test)

phi_test = build_poly(input_data_std_test, 3)

yb_test = predict(w, phi_test)
create_csv_submission(ids_test, yb_test, 'submission_test.csv')

In [None]:
len(yb_test[yb_test == -1]) / len(yb_test)

In [None]:
len(yb[yb == -1]) / len(yb)

In [None]:
prediction(w, phi, yb, verbose=True)

---------------------
# Plotting some stuff and data analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
sns.set(style="whitegrid", color_codes=True)

In [None]:
copy_no_nans = replace_nans_with_mean(input_data)

In [None]:
et = np.c_[yb, copy_no_nans]

In [None]:
et

In [None]:
df = pd.DataFrame(et[:10000])

In [None]:
df.head()

The columns that are correlated

In [None]:
corr = []
for i in range(0, 31):
    for j in range(i + 1, 31):
        corr.append([i, j, df[i].corr(df[j])])
        
corr_a = np.array(corr)
corr_a[np.logical_or(corr_a[:,2] > 0.8, corr_a[:,2] < -0.8)]

In [None]:
copy_no_nans[copy_no_nans[:,8] > 1000]

In [None]:
for i in range(1, 31):
    #sns.violinplot(x=0, y=i, data=df)
    sns.stripplot(df[0], df[i], jitter=True)
    sns.plt.show()
    #print(df[0].corr(df[i]))

In [None]:
grouped = df.groupby(by=[0]).describe()

In [None]:
grouped.T.to_csv('goruped.csv')

In [None]:
df.groupby(by=[0]).mean()