In [67]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# Import files to use in preprocessing and machine learning
from implementations import *
from proj1_helpers import *
from preprocess import *
from cross_validation import *

## Load the training data into feature matrix, class labels, and event ids:

In [69]:
# Download train data and supply path here 
DATA_TRAIN_PATH = '../data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Initial Data Analysis

In observing the original training data, we found out that there exists missing data all over tX. The missing data are represented as value -999. Considering that these columns are critical in model training, we cannot simply delete these rows with -999 values. Therefore, we need to process the original training set before model training.

Firstly, we check the columns of tX to obtain an overview of missing data:

We can see that 11 columns contains at least one -999 (missing value). Now we check whether some of the missing values are dependent on the column named 'PRI_jet_num' (column No. 23), since 'PRI_jet_num' has a discrete value range {0, 1, 2, 3} and our observation on the beginning data rows showed a dependency of some missing values to the value of 'PRI_jet_num' column.

The above analysis showed that one column with -999 (missing value) is independent of the column 'PRI_jet_num', we check the original training set and we can easily find out that the first tX column 'DER_mass_MMC' is independent of 'PRI_jet_num'. 

# Data Preprocessing

Based on the data analysis above, we conduct the following method to pre-process the training data. 

In [70]:
# Split the database based on 'PRI_jet_num' (0, 1, 2&3)
tX_jet_0, tX_jet_1, tX_jet_23, y_jet_0, y_jet_1, y_jet_23, r_ids = split_reformat_data(tX, y, ids)

print(tX_jet_0.shape)
print(tX_jet_1.shape)
print(tX_jet_23.shape)

(99913, 18)
(77544, 22)
(72543, 29)


In [71]:
def k_means_cluster(data, y, k, max_iter=20):
    data = np.asarray(data, np.float32)
    indices = np.random.randint(0,data.shape[0],(1,k)).tolist()
    #print(indices)
    center = np.copy(data[indices])
    cluster = np.zeros(data.shape[0])
    for i in range(0,max_iter):
        one_hot1 = np.zeros(k*data.shape[0], np.float32)
        distance = np.sqrt(np.sum(np.square(np.expand_dims(data, axis=1) - center), axis=2))
        cluster = np.argmin(distance, axis=1)
        one_hot1[np.argmin(distance, axis=1) + np.arange(data.shape[0]) * k] = 1.
        one_hot2 = np.reshape(one_hot1, (data.shape[0], k))
        center = np.matmul(np.transpose(one_hot2, (1, 0)), data) / np.expand_dims(np.sum(one_hot2, axis=0), axis=1)     
    class_y = np.zeros((2, np.asarray(np.argmin(distance, axis=1)).shape[0]))
    class_y[0] = np.asarray(cluster)
    class_y[1] = np.asarray(y[0].T)[:,0]
    y_center = []
    for i in range(k):
        y_center.append(np.mean(class_y[1][class_y[0,:]==i]))
    return cluster, center, np.array(y_center)

def k_means_replacing(xx, k=11):
    normal_x = np.mat(np.delete(xx[xx[:, 0] != -999], np.r_[0], axis=1)) 
    abnormal_x = np.mat(np.delete(xx[xx[:, 0] == -999], np.r_[0], axis=1))
    normal_y = np.mat(xx[xx[:, 0]!= -999][:,0])
    cluster, center, y_center = k_means_cluster(normal_x,normal_y, k)
    replace_list = np.zeros((abnormal_x.shape[0],1))
    for j in range(abnormal_x.shape[0]):
        #if j%100 == 0:
            #print('Replacing '+str(j)+' out of'+str(abnormal_x.shape[0]))       
        tt = np.zeros((center.shape[0],center.shape[1]))
        for i in range(center.shape[0]):
            tt[i] = abnormal_x[j]
        ff = np.mat(tt-center)
        distance_matrix = np.array((ff*ff.T).diagonal()).T
        class_num = np.argmin(distance_matrix)
        replace_list[j] = y_center[class_num]
    replace_list = np.c_[replace_list,abnormal_x]
    xx[np.where(xx[:, 0]== -999)[0]] = replace_list

# Model Training

## Ridge Regression

In [72]:
def find_optimal(x, y, degrees, k_fold, lambdas, k_clusters, seed=1):
    # Split the data into k-fold
    k_indices = build_k_indices(y, k_fold, seed)
    x_k = x.copy()
    # Set lists for collecting best lambda & rmse for each degree
    best_lambda = []
    best_rmse = []
    best_k = []
    best_degree = []
    
    for k_mean in k_clusters:
        x_k = x.copy()
        k_cluster = 10*k_mean+5
        k_means_replacing(x_k, k_cluster)
        x_k = normalize(standardize(x_k))
        for degree in degrees:
            rmse_val = []
        
            for lambda_ in lambdas:
                rmse_val_lambda_ = []
            
                for k in range(k_fold):
                    _, loss_val, w = cross_validation(y, x_k, k_indices, k, lambda_, degree)
                    rmse_val_lambda_.append(loss_val)
                
                print("No. of clusters {}".format(k_cluster))
                print("lambda {}".format(lambda_))
                
                print("loss {}".format(np.mean(rmse_val_lambda_)))
                print("degree {}".format(degree))
                print("\n\n")

                rmse_val.append(np.mean(rmse_val_lambda_))
        
            index_opt_lambda = np.argmin(rmse_val)
            best_lambda.append(lambdas[index_opt_lambda])
            best_rmse.append(rmse_val[index_opt_lambda])
            best_k.append(k_mean)
            best_degree.append(degree)

    opt_degree = best_degree[np.argmin(best_rmse)]
    opt_lambda = best_lambda[np.argmin(best_rmse)]
    opt_k = best_k[np.argmin(best_rmse)]
    
    return opt_degree, opt_lambda, opt_k

In [73]:
ls_opt_d = []
ls_opt_l = []
ls_opt_K = []

t0, t1, t2 = tX_jet_0.copy(), tX_jet_1.copy(), tX_jet_23.copy()
y0, y1, y2 = y_jet_0.copy(), y_jet_1.copy(), y_jet_23.copy()

train_set = [[t0, y0]]
# Set H-parameters
K_FOLD = 10
DEGREE = np.arange(4, 9)
k_clusters= np.arange(1,11)
SEED = 5
LAMBDA = np.logspace(-8, -3, 40)

for tx, y in train_set:
    opt_d, opt_l, opt_k= find_optimal(tx, y, DEGREE, K_FOLD, LAMBDA, k_clusters)
    print(opt_d, opt_l, opt_k)
    ls_opt_d.append(opt_d)
    ls_opt_l.append(opt_l)
    ls_opt_K.append(opt_k)

  center = np.copy(data[indices])


No. of clusters 15
lambda 1e-08
loss 0.7198256188927804
degree 4



No. of clusters 15
lambda 1.3433993325988987e-08
loss 0.7132177918659051
degree 4



No. of clusters 15
lambda 1.8047217668271702e-08
loss 0.7088594087496005
degree 4



No. of clusters 15
lambda 2.424462017082331e-08
loss 0.7061031186458768
degree 4



No. of clusters 15
lambda 3.257020655659783e-08
loss 0.7044474396006415
degree 4



No. of clusters 15
lambda 4.37547937507418e-08
loss 0.7035204441437626
degree 4



No. of clusters 15
lambda 5.878016072274912e-08
loss 0.7030570329470291
degree 4



No. of clusters 15
lambda 7.896522868499733e-08
loss 0.7028786627445214
degree 4



No. of clusters 15
lambda 1.0608183551394483e-07
loss 0.702874885314395
degree 4



No. of clusters 15
lambda 1.4251026703029963e-07
loss 0.7029858400235247
degree 4



No. of clusters 15
lambda 1.9144819761699575e-07
loss 0.7031863247470874
degree 4



No. of clusters 15
lambda 2.571913809059347e-07
loss 0.7034723570981701
degree 4



No. o

No. of clusters 15
lambda 2.7283333764867697e-06
loss 0.7010882167536268
degree 6



No. of clusters 15
lambda 3.665241237079626e-06
loss 0.7014246348796015
degree 6



No. of clusters 15
lambda 4.923882631706732e-06
loss 0.7018472601157901
degree 6



No. of clusters 15
lambda 6.6147406412301455e-06
loss 0.7023676097410497
degree 6



No. of clusters 15
lambda 8.886238162743407e-06
loss 0.7029881367137607
degree 6



No. of clusters 15
lambda 1.1937766417144358e-05
loss 0.7037014472726152
degree 6



No. of clusters 15
lambda 1.6037187437513277e-05
loss 0.7044925050836155
degree 6



No. of clusters 15
lambda 2.1544346900318823e-05
loss 0.7053431038802868
degree 6



No. of clusters 15
lambda 2.8942661247167517e-05
loss 0.7062369374803094
degree 6



No. of clusters 15
lambda 3.888155180308085e-05
loss 0.7071637546004285
degree 6



No. of clusters 15
lambda 5.223345074266833e-05
loss 0.7081219019070856
degree 6



No. of clusters 15
lambda 7.017038286703822e-05
loss 0.709119225185962

No. of clusters 15
lambda 0.000554102033000948
loss 0.7122986091446917
degree 8



No. of clusters 15
lambda 0.000744380301325168
loss 0.7139893546724605
degree 8



No. of clusters 15
lambda 0.001
loss 0.715922104496192
degree 8



No. of clusters 25
lambda 1e-08
loss 0.7203960254831578
degree 4



No. of clusters 25
lambda 1.3433993325988987e-08
loss 0.7137041367889669
degree 4



No. of clusters 25
lambda 1.8047217668271702e-08
loss 0.7092640482227449
degree 4



No. of clusters 25
lambda 2.424462017082331e-08
loss 0.7064408901174859
degree 4



No. of clusters 25
lambda 3.257020655659783e-08
loss 0.704738736659844
degree 4



No. of clusters 25
lambda 4.37547937507418e-08
loss 0.7037850656413314
degree 4



No. of clusters 25
lambda 5.878016072274912e-08
loss 0.7033105994581945
degree 4



No. of clusters 25
lambda 7.896522868499733e-08
loss 0.7031316619461307
degree 4



No. of clusters 25
lambda 1.0608183551394483e-07
loss 0.7031334107064391
degree 4



No. of clusters 25
lambda 

No. of clusters 25
lambda 1.1253355826007646e-06
loss 0.7005085906972085
degree 6



No. of clusters 25
lambda 1.51177507061566e-06
loss 0.700710558454347
degree 6



No. of clusters 25
lambda 2.030917620904735e-06
loss 0.7009435271055718
degree 6



No. of clusters 25
lambda 2.7283333764867697e-06
loss 0.7012262672094269
degree 6



No. of clusters 25
lambda 3.665241237079626e-06
loss 0.7015792019048479
degree 6



No. of clusters 25
lambda 4.923882631706732e-06
loss 0.702020287785162
degree 6



No. of clusters 25
lambda 6.6147406412301455e-06
loss 0.702560727524723
degree 6



No. of clusters 25
lambda 8.886238162743407e-06
loss 0.7032018336725441
degree 6



No. of clusters 25
lambda 1.1937766417144358e-05
loss 0.7039345371716471
degree 6



No. of clusters 25
lambda 1.6037187437513277e-05
loss 0.7047421002028761
degree 6



No. of clusters 25
lambda 2.1544346900318823e-05
loss 0.7056050481192127
degree 6



No. of clusters 25
lambda 2.8942661247167517e-05
loss 0.7065064285907408
d

No. of clusters 25
lambda 0.00022854638641349884
loss 0.7085539236082393
degree 8



No. of clusters 25
lambda 0.00030702906297578496
loss 0.709725617266749
degree 8



No. of clusters 25
lambda 0.00041246263829013477
loss 0.7110223453287352
degree 8



No. of clusters 25
lambda 0.000554102033000948
loss 0.7124873670188074
degree 8



No. of clusters 25
lambda 0.000744380301325168
loss 0.7141627499275134
degree 8



No. of clusters 25
lambda 0.001
loss 0.7160803135214727
degree 8



No. of clusters 35
lambda 1e-08
loss 0.7220397132927114
degree 4



No. of clusters 35
lambda 1.3433993325988987e-08
loss 0.7149930229667489
degree 4



No. of clusters 35
lambda 1.8047217668271702e-08
loss 0.7102353188998606
degree 4



No. of clusters 35
lambda 2.424462017082331e-08
loss 0.707150704143531
degree 4



No. of clusters 35
lambda 3.257020655659783e-08
loss 0.7052510013035609
degree 4



No. of clusters 35
lambda 4.37547937507418e-08
loss 0.7041602553079023
degree 4



No. of clusters 35
lambd

No. of clusters 35
lambda 4.6415888336127725e-07
loss 0.7002353986686741
degree 6



No. of clusters 35
lambda 6.235507341273912e-07
loss 0.7003889666462816
degree 6



No. of clusters 35
lambda 8.376776400682924e-07
loss 0.7005623800646763
degree 6



No. of clusters 35
lambda 1.1253355826007646e-06
loss 0.7007495785522456
degree 6



No. of clusters 35
lambda 1.51177507061566e-06
loss 0.7009548426122614
degree 6



No. of clusters 35
lambda 2.030917620904735e-06
loss 0.7011905161052641
degree 6



No. of clusters 35
lambda 2.7283333764867697e-06
loss 0.7014741680429722
degree 6



No. of clusters 35
lambda 3.665241237079626e-06
loss 0.7018250621791818
degree 6



No. of clusters 35
lambda 4.923882631706732e-06
loss 0.7022599644630665
degree 6



No. of clusters 35
lambda 6.6147406412301455e-06
loss 0.7027889122012747
degree 6



No. of clusters 35
lambda 8.886238162743407e-06
loss 0.7034122833822017
degree 6



No. of clusters 35
lambda 1.1937766417144358e-05
loss 0.7041205910223387


No. of clusters 35
lambda 9.426684551178853e-05
loss 0.7055896797625426
degree 8



No. of clusters 35
lambda 0.00012663801734674022
loss 0.7065132855304046
degree 8



No. of clusters 35
lambda 0.00017012542798525856
loss 0.707484917006194
degree 8



No. of clusters 35
lambda 0.00022854638641349884
loss 0.7085173930071462
degree 8



No. of clusters 35
lambda 0.00030702906297578496
loss 0.7096367179477585
degree 8



No. of clusters 35
lambda 0.00041246263829013477
loss 0.7108806111625536
degree 8



No. of clusters 35
lambda 0.000554102033000948
loss 0.7122939192434391
degree 8



No. of clusters 35
lambda 0.000744380301325168
loss 0.713921207414983
degree 8



No. of clusters 35
lambda 0.001
loss 0.7157975862371467
degree 8



No. of clusters 45
lambda 1e-08
loss 0.7208239539794612
degree 4



No. of clusters 45
lambda 1.3433993325988987e-08
loss 0.7141105884558097
degree 4



No. of clusters 45
lambda 1.8047217668271702e-08
loss 0.7096333963813504
degree 4



No. of clusters 45
la

No. of clusters 45
lambda 1.9144819761699575e-07
loss 0.700149078129843
degree 6



No. of clusters 45
lambda 2.571913809059347e-07
loss 0.7001069973205838
degree 6



No. of clusters 45
lambda 3.4551072945922185e-07
loss 0.7001724893359877
degree 6



No. of clusters 45
lambda 4.6415888336127725e-07
loss 0.7003000997843165
degree 6



No. of clusters 45
lambda 6.235507341273912e-07
loss 0.700459515418906
degree 6



No. of clusters 45
lambda 8.376776400682924e-07
loss 0.7006340416326282
degree 6



No. of clusters 45
lambda 1.1253355826007646e-06
loss 0.7008189673451924
degree 6



No. of clusters 45
lambda 1.51177507061566e-06
loss 0.7010196011511575
degree 6



No. of clusters 45
lambda 2.030917620904735e-06
loss 0.7012489749714713
degree 6



No. of clusters 45
lambda 2.7283333764867697e-06
loss 0.7015251382807687
degree 6



No. of clusters 45
lambda 3.665241237079626e-06
loss 0.7018678430687116
degree 6



No. of clusters 45
lambda 4.923882631706732e-06
loss 0.7022945595007744
de

No. of clusters 45
lambda 3.888155180308085e-05
loss 0.7032121297117271
degree 8



No. of clusters 45
lambda 5.223345074266833e-05
loss 0.7039674911332722
degree 8



No. of clusters 45
lambda 7.017038286703822e-05
loss 0.7047893779954524
degree 8



No. of clusters 45
lambda 9.426684551178853e-05
loss 0.705666997341378
degree 8



No. of clusters 45
lambda 0.00012663801734674022
loss 0.706592530836067
degree 8



No. of clusters 45
lambda 0.00017012542798525856
loss 0.7075657539386275
degree 8



No. of clusters 45
lambda 0.00022854638641349884
loss 0.7085976497968964
degree 8



No. of clusters 45
lambda 0.00030702906297578496
loss 0.7097119053511853
degree 8



No. of clusters 45
lambda 0.00041246263829013477
loss 0.7109437189192465
degree 8



No. of clusters 45
lambda 0.000554102033000948
loss 0.7123358343833821
degree 8



No. of clusters 45
lambda 0.000744380301325168
loss 0.7139320610207474
degree 8



No. of clusters 45
lambda 0.001
loss 0.7157690126084012
degree 8



No. of 

No. of clusters 55
lambda 7.896522868499733e-08
loss 0.7018303174929823
degree 6



No. of clusters 55
lambda 1.0608183551394483e-07
loss 0.7009765754823076
degree 6



No. of clusters 55
lambda 1.4251026703029963e-07
loss 0.7004818071707865
degree 6



No. of clusters 55
lambda 1.9144819761699575e-07
loss 0.7002396263381042
degree 6



No. of clusters 55
lambda 2.571913809059347e-07
loss 0.7001693755506742
degree 6



No. of clusters 55
lambda 3.4551072945922185e-07
loss 0.7002104256278059
degree 6



No. of clusters 55
lambda 4.6415888336127725e-07
loss 0.7003183418941009
degree 6



No. of clusters 55
lambda 6.235507341273912e-07
loss 0.7004628912481943
degree 6



No. of clusters 55
lambda 8.376776400682924e-07
loss 0.7006268446593477
degree 6



No. of clusters 55
lambda 1.1253355826007646e-06
loss 0.7008047074458752
degree 6



No. of clusters 55
lambda 1.51177507061566e-06
loss 0.7010010361759299
degree 6



No. of clusters 55
lambda 2.030917620904735e-06
loss 0.7012282887679593

No. of clusters 55
lambda 1.6037187437513277e-05
loss 0.7014534738699676
degree 8



No. of clusters 55
lambda 2.1544346900318823e-05
loss 0.7019662386734657
degree 8



No. of clusters 55
lambda 2.8942661247167517e-05
loss 0.7025587432525542
degree 8



No. of clusters 55
lambda 3.888155180308085e-05
loss 0.7032309499312455
degree 8



No. of clusters 55
lambda 5.223345074266833e-05
loss 0.7039770525957871
degree 8



No. of clusters 55
lambda 7.017038286703822e-05
loss 0.7047870311841861
degree 8



No. of clusters 55
lambda 9.426684551178853e-05
loss 0.7056497838073231
degree 8



No. of clusters 55
lambda 0.00012663801734674022
loss 0.7065574191167842
degree 8



No. of clusters 55
lambda 0.00017012542798525856
loss 0.707509795842904
degree 8



No. of clusters 55
lambda 0.00022854638641349884
loss 0.7085181032820957
degree 8



No. of clusters 55
lambda 0.00030702906297578496
loss 0.7096064383775389
degree 8



No. of clusters 55
lambda 0.00041246263829013477
loss 0.71081080192713

No. of clusters 65
lambda 3.257020655659783e-08
loss 0.7080693960863462
degree 6



No. of clusters 65
lambda 4.37547937507418e-08
loss 0.7052451242834136
degree 6



No. of clusters 65
lambda 5.878016072274912e-08
loss 0.7032602730336883
degree 6



No. of clusters 65
lambda 7.896522868499733e-08
loss 0.7019362872544729
degree 6



No. of clusters 65
lambda 1.0608183551394483e-07
loss 0.7011065382094921
degree 6



No. of clusters 65
lambda 1.4251026703029963e-07
loss 0.7006335507430207
degree 6



No. of clusters 65
lambda 1.9144819761699575e-07
loss 0.7004105977181577
degree 6



No. of clusters 65
lambda 2.571913809059347e-07
loss 0.7003566885873104
degree 6



No. of clusters 65
lambda 3.4551072945922185e-07
loss 0.700410991448029
degree 6



No. of clusters 65
lambda 4.6415888336127725e-07
loss 0.7005290886429131
degree 6



No. of clusters 65
lambda 6.235507341273912e-07
loss 0.7006809556900586
degree 6



No. of clusters 65
lambda 8.376776400682924e-07
loss 0.7008496474024278
d

No. of clusters 65
lambda 6.6147406412301455e-06
loss 0.7005328178592608
degree 8



No. of clusters 65
lambda 8.886238162743407e-06
loss 0.7008575754407148
degree 8



No. of clusters 65
lambda 1.1937766417144358e-05
loss 0.7012312820777462
degree 8



No. of clusters 65
lambda 1.6037187437513277e-05
loss 0.7016666012000387
degree 8



No. of clusters 65
lambda 2.1544346900318823e-05
loss 0.7021736633700446
degree 8



No. of clusters 65
lambda 2.8942661247167517e-05
loss 0.7027582550682662
degree 8



No. of clusters 65
lambda 3.888155180308085e-05
loss 0.7034205815274464
degree 8



No. of clusters 65
lambda 5.223345074266833e-05
loss 0.7041551725834727
degree 8



No. of clusters 65
lambda 7.017038286703822e-05
loss 0.7049523311841812
degree 8



No. of clusters 65
lambda 9.426684551178853e-05
loss 0.7058011891600098
degree 8



No. of clusters 65
lambda 0.00012663801734674022
loss 0.7066939600516775
degree 8



No. of clusters 65
lambda 0.00017012542798525856
loss 0.70763045544801

No. of clusters 75
lambda 1.3433993325988987e-08
loss 0.722479230773311
degree 6



No. of clusters 75
lambda 1.8047217668271702e-08
loss 0.7167045658280857
degree 6



No. of clusters 75
lambda 2.424462017082331e-08
loss 0.7118619522240465
degree 6



No. of clusters 75
lambda 3.257020655659783e-08
loss 0.7080527189209351
degree 6



No. of clusters 75
lambda 4.37547937507418e-08
loss 0.7052234442265047
degree 6



No. of clusters 75
lambda 5.878016072274912e-08
loss 0.7032287722793301
degree 6



No. of clusters 75
lambda 7.896522868499733e-08
loss 0.7018930565815168
degree 6



No. of clusters 75
lambda 1.0608183551394483e-07
loss 0.7010513612136435
degree 6



No. of clusters 75
lambda 1.4251026703029963e-07
loss 0.7005672764173978
degree 6



No. of clusters 75
lambda 1.9144819761699575e-07
loss 0.700334883835772
degree 6



No. of clusters 75
lambda 2.571913809059347e-07
loss 0.7002738036053981
degree 6



No. of clusters 75
lambda 3.4551072945922185e-07
loss 0.7003235198730586
d

No. of clusters 75
lambda 2.7283333764867697e-06
loss 0.6996083869342453
degree 8



No. of clusters 75
lambda 3.665241237079626e-06
loss 0.6998637468165583
degree 8



No. of clusters 75
lambda 4.923882631706732e-06
loss 0.7001324145728139
degree 8



No. of clusters 75
lambda 6.6147406412301455e-06
loss 0.700424719385254
degree 8



No. of clusters 75
lambda 8.886238162743407e-06
loss 0.7007531959196949
degree 8



No. of clusters 75
lambda 1.1937766417144358e-05
loss 0.7011313156402387
degree 8



No. of clusters 75
lambda 1.6037187437513277e-05
loss 0.7015718249736052
degree 8



No. of clusters 75
lambda 2.1544346900318823e-05
loss 0.7020848027365841
degree 8



No. of clusters 75
lambda 2.8942661247167517e-05
loss 0.7026758079241562
degree 8



No. of clusters 75
lambda 3.888155180308085e-05
loss 0.703344683115032
degree 8



No. of clusters 75
lambda 5.223345074266833e-05
loss 0.7040855741896624
degree 8



No. of clusters 75
lambda 7.017038286703822e-05
loss 0.7048885109512861


No. of clusters 85
lambda 0.000554102033000948
loss 0.724263708041917
degree 5



No. of clusters 85
lambda 0.000744380301325168
loss 0.7268650040417393
degree 5



No. of clusters 85
lambda 0.001
loss 0.7296854243990285
degree 5



No. of clusters 85
lambda 1e-08
loss 0.728085034383571
degree 6



No. of clusters 85
lambda 1.3433993325988987e-08
loss 0.7217643122707278
degree 6



No. of clusters 85
lambda 1.8047217668271702e-08
loss 0.7161015759739046
degree 6



No. of clusters 85
lambda 2.424462017082331e-08
loss 0.7113724238791086
degree 6



No. of clusters 85
lambda 3.257020655659783e-08
loss 0.7076678692976721
degree 6



No. of clusters 85
lambda 4.37547937507418e-08
loss 0.7049285382305845
degree 6



No. of clusters 85
lambda 5.878016072274912e-08
loss 0.7030075688811588
degree 6



No. of clusters 85
lambda 7.896522868499733e-08
loss 0.7017307967224203
degree 6



No. of clusters 85
lambda 1.0608183551394483e-07
loss 0.700935944292444
degree 6



No. of clusters 85
lambda 1

No. of clusters 85
lambda 1.1253355826007646e-06
loss 0.698880785702126
degree 8



No. of clusters 85
lambda 1.51177507061566e-06
loss 0.699127298915257
degree 8



No. of clusters 85
lambda 2.030917620904735e-06
loss 0.6993783117720073
degree 8



No. of clusters 85
lambda 2.7283333764867697e-06
loss 0.6996323109498024
degree 8



No. of clusters 85
lambda 3.665241237079626e-06
loss 0.6998926742185669
degree 8



No. of clusters 85
lambda 4.923882631706732e-06
loss 0.7001666521300804
degree 8



No. of clusters 85
lambda 6.6147406412301455e-06
loss 0.7004643732693643
degree 8



No. of clusters 85
lambda 8.886238162743407e-06
loss 0.7007978206910861
degree 8



No. of clusters 85
lambda 1.1937766417144358e-05
loss 0.7011796379910159
degree 8



No. of clusters 85
lambda 1.6037187437513277e-05
loss 0.7016216577226857
degree 8



No. of clusters 85
lambda 2.1544346900318823e-05
loss 0.702133190305121
degree 8



No. of clusters 85
lambda 2.8942661247167517e-05
loss 0.7027193491432
degr

No. of clusters 95
lambda 0.00022854638641349884
loss 0.7181135918106796
degree 5



No. of clusters 95
lambda 0.00030702906297578496
loss 0.7199102717644966
degree 5



No. of clusters 95
lambda 0.00041246263829013477
loss 0.7219833559288821
degree 5



No. of clusters 95
lambda 0.000554102033000948
loss 0.7243408930069608
degree 5



No. of clusters 95
lambda 0.000744380301325168
loss 0.7269601547360043
degree 5



No. of clusters 95
lambda 0.001
loss 0.7297890754917999
degree 5



No. of clusters 95
lambda 1e-08
loss 0.7287137913183386
degree 6



No. of clusters 95
lambda 1.3433993325988987e-08
loss 0.7224182070701787
degree 6



No. of clusters 95
lambda 1.8047217668271702e-08
loss 0.7167581063896649
degree 6



No. of clusters 95
lambda 2.424462017082331e-08
loss 0.7120138610557
degree 6



No. of clusters 95
lambda 3.257020655659783e-08
loss 0.7082811671831392
degree 6



No. of clusters 95
lambda 4.37547937507418e-08
loss 0.70550438859118
degree 6



No. of clusters 95
lambda 5

No. of clusters 95
lambda 4.6415888336127725e-07
loss 0.6986530196908411
degree 8



No. of clusters 95
lambda 6.235507341273912e-07
loss 0.69880185995793
degree 8



No. of clusters 95
lambda 8.376776400682924e-07
loss 0.698995122133708
degree 8



No. of clusters 95
lambda 1.1253355826007646e-06
loss 0.6992148720293172
degree 8



No. of clusters 95
lambda 1.51177507061566e-06
loss 0.6994483253937488
degree 8



No. of clusters 95
lambda 2.030917620904735e-06
loss 0.699688260100747
degree 8



No. of clusters 95
lambda 2.7283333764867697e-06
loss 0.6999326373680317
degree 8



No. of clusters 95
lambda 3.665241237079626e-06
loss 0.7001838487087537
degree 8



No. of clusters 95
lambda 4.923882631706732e-06
loss 0.7004479382737896
degree 8



No. of clusters 95
lambda 6.6147406412301455e-06
loss 0.700733902664009
degree 8



No. of clusters 95
lambda 8.886238162743407e-06
loss 0.701052940593478
degree 8



No. of clusters 95
lambda 1.1937766417144358e-05
loss 0.701417429728251
degree 

No. of clusters 105
lambda 9.426684551178853e-05
loss 0.7139289714083648
degree 5



No. of clusters 105
lambda 0.00012663801734674022
loss 0.7150971977019448
degree 5



No. of clusters 105
lambda 0.00017012542798525856
loss 0.7164192589204481
degree 5



No. of clusters 105
lambda 0.00022854638641349884
loss 0.7179397649437907
degree 5



No. of clusters 105
lambda 0.00030702906297578496
loss 0.7197048887769295
degree 5



No. of clusters 105
lambda 0.00041246263829013477
loss 0.7217511045060955
degree 5



No. of clusters 105
lambda 0.000554102033000948
loss 0.7240921181539203
degree 5



No. of clusters 105
lambda 0.000744380301325168
loss 0.7267095630081783
degree 5



No. of clusters 105
lambda 0.001
loss 0.7295527173992425
degree 5



No. of clusters 105
lambda 1e-08
loss 0.7293488332173383
degree 6



No. of clusters 105
lambda 1.3433993325988987e-08
loss 0.722767944524094
degree 6



No. of clusters 105
lambda 1.8047217668271702e-08
loss 0.7168589392414443
degree 6



No. of c

No. of clusters 105
lambda 1.4251026703029963e-07
loss 0.6985737573841831
degree 8



No. of clusters 105
lambda 1.9144819761699575e-07
loss 0.6983089042819068
degree 8



No. of clusters 105
lambda 2.571913809059347e-07
loss 0.6981853431845221
degree 8



No. of clusters 105
lambda 3.4551072945922185e-07
loss 0.6981815676456131
degree 8



No. of clusters 105
lambda 4.6415888336127725e-07
loss 0.6982724098305707
degree 8



No. of clusters 105
lambda 6.235507341273912e-07
loss 0.6984320637998351
degree 8



No. of clusters 105
lambda 8.376776400682924e-07
loss 0.6986368049322449
degree 8



No. of clusters 105
lambda 1.1253355826007646e-06
loss 0.698867287807456
degree 8



No. of clusters 105
lambda 1.51177507061566e-06
loss 0.6991100004063241
degree 8



No. of clusters 105
lambda 2.030917620904735e-06
loss 0.6993576558925458
degree 8



No. of clusters 105
lambda 2.7283333764867697e-06
loss 0.6996087113688783
degree 8



No. of clusters 105
lambda 3.665241237079626e-06
loss 0.69986

## Logistic Regression

In [77]:
print(ls_opt_d, ls_opt_l,ls_opt_K)
opt_d_0 = 8
opt_d_1 = 8
opt_d_23 = 8

[8] [3.4551072945922185e-07] [2]


In [47]:
opt_l_0 = 3.3932217718953295e-07
opt_l_1 = 5.1794746792312124e-08
opt_l_23 = 1e-08

In [54]:
accuracy = []
op_lambda = []

In [None]:
# [8] [3.4551072945922185e-07] [2] for jet = 0

In [None]:
k_means_replacing(x_k, 21)
x_k = normalize(standardize(x_k))

In [57]:
i = 23
tx, opt_d, y_jet, opt_l = tx_23, opt_d_23, y_jet_23, opt_l_23
poly_tx_te = build_poly(tx, opt_d)
quanzhong, _ = ridge_regression(y_jet, poly_tx_te, opt_l)
p = predict_labels(quanzhong, poly_tx_te)
print('Accuracy for jet = '+ str(i))
accuracy.append(np.sum(p==y_jet))
op_lambda.append(opt_l)
print(accuracy, len(y_jet))
print(np.sum(p==y_jet)/len(y_jet))


Accuracy for jet = 23
[82954, 60476, 58330] 72543
0.8040748245867968


In [58]:
#k mean cluster 11
kmc1_acc = accuracy
kmc1_lambda = op_lambda
print(sum(kmc_acc)/250000)
print(kmc1_acc)
print(kmc1_lambda)

0.805564
[82954, 60476, 58330]
[3.3932217718953295e-07, 5.1794746792312124e-08, 1e-08]


In [22]:
#linear Regression

#kmc_acc = accuracy
#kmc_lambda = op_lambda
print(sum(kmc_acc)/250000)
print(kmc_acc)
print(kmc_lambda)

0.805564
[82891, 60519, 57981]
[1.67683293681101e-07, 1e-08, 1e-08]


In [338]:
# meaningless
#regression2_acc = accuracy 
#mei you normalize
#regression2_lambda = op_lambda
print(sum(regression2_acc)/250000)
print(regression2_acc)
print(regression2_lambda)

0.795996
[81711, 59334, 57954]
[0.0007880462815669912, 2.811768697974231e-06, 1.030917620904739e-10]


In [None]:
mean_acc = accuracy
mean_lambda = op_lambda
print(sum(mean_acc)/250000)
print(mean_acc)
print(mean_lambda)

In [129]:
#median_acc = accuracy
#median_lambda = op_lambda
print(sum(median_acc)/250000)
print(median_acc)
print(median_lambda)

0.802208
[81614, 60623, 58315]
[0.0014873521072935117, 0.0014873521072935117, 0.0028072162039411755]


In [118]:
#median2_acc = accuracy 80.2
#median2_lambda = op_lambda
print(sum(median2_acc)/250000)
print(median2_acc)
print(median2_lambda)

0.801596
[81614, 60558, 58227]
[0.0014873521072935117, 0.01, 0.007278953843983146]


## Generate predictions and save ouput in csv format for submission:

In [74]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
# OUTPUT_PATH = 'data/pred.csv' # TODO: fill in desired name of output file for submission
# y_pred = predict_labels(weights, tX_test)
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [75]:
t0, t1, t23, t_ids = split_reformat_test(tX_test, ids_test)


print(t0.shape)
print(t1.shape)
print(t23.shape)

(227458, 18)
(175338, 22)
(165442, 29)


In [154]:
len(quanzhong)

111

In [158]:
poly_t1 = build_poly(t1, opt_d)


p = predict_labels(quanzhong, poly_t1)

In [163]:
len(p[p == 1]) / len(p) 

0.1306904378970902

In [164]:
len(y_jet_1[y_jet_1 == 1]) / len(y_jet_1) 

0.35734550706695556