In [27]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Import files to use in preprocessing and machine learning
from implementations import *
from proj1_helpers import *
from preprocess import *
from cross_validation import *

## Load the training data into feature matrix, class labels, and event ids:

In [29]:
# Download train data and supply path here 
DATA_TRAIN_PATH = '../data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# Initial Data Analysis

In observing the original training data, we found out that there exists missing data all over tX. The missing data are represented as value -999. Considering that these columns are critical in model training, we cannot simply delete these rows with -999 values. Therefore, we need to process the original training set before model training.

Firstly, we check the columns of tX to obtain an overview of missing data:

We can see that 11 columns contains at least one -999 (missing value). Now we check whether some of the missing values are dependent on the column named 'PRI_jet_num' (column No. 23), since 'PRI_jet_num' has a discrete value range {0, 1, 2, 3} and our observation on the beginning data rows showed a dependency of some missing values to the value of 'PRI_jet_num' column.

The above analysis showed that one column with -999 (missing value) is independent of the column 'PRI_jet_num', we check the original training set and we can easily find out that the first tX column 'DER_mass_MMC' is independent of 'PRI_jet_num'. 

# Data Preprocessing

Based on the data analysis above, we conduct the following method to pre-process the training data. 

In [30]:
# Split the database based on 'PRI_jet_num' (0, 1, 2&3)
tX_jet_0, tX_jet_1, tX_jet_23, y_jet_0, y_jet_1, y_jet_23, r_ids = split_reformat_data(tX, y, ids)

print(tX_jet_0.shape)
print(tX_jet_1.shape)
print(tX_jet_23.shape)

(99913, 18)
(77544, 22)
(72543, 29)


In [None]:
def k_means_cluster(data, y, k, max_iter=20):
    data = np.asarray(data, np.float32)
    indices = np.random.randint(0,data.shape[0],(1,k)).tolist()
    #print(indices)
    center = np.copy(data[indices])
    cluster = np.zeros(data.shape[0])
    for i in range(0,max_iter):
        one_hot1 = np.zeros(k*data.shape[0], np.float32)
        distance = np.sqrt(np.sum(np.square(np.expand_dims(data, axis=1) - center), axis=2))
        cluster = np.argmin(distance, axis=1)
        one_hot1[np.argmin(distance, axis=1) + np.arange(data.shape[0]) * k] = 1.
        one_hot2 = np.reshape(one_hot1, (data.shape[0], k))
        center = np.matmul(np.transpose(one_hot2, (1, 0)), data) / np.expand_dims(np.sum(one_hot2, axis=0), axis=1)     
    class_y = np.zeros((2, np.asarray(np.argmin(distance, axis=1)).shape[0]))
    class_y[0] = np.asarray(cluster)
    class_y[1] = np.asarray(y[0].T)[:,0]
    y_center = []
    for i in range(k):
        y_center.append(np.mean(class_y[1][class_y[0,:]==i]))
    return cluster, center, np.array(y_center)

def k_means_replacing(xx, k=11):
    normal_x = np.mat(np.delete(xx[xx[:, 0] != -999], np.r_[0], axis=1)) 
    abnormal_x = np.mat(np.delete(xx[xx[:, 0] == -999], np.r_[0], axis=1))
    normal_y = np.mat(xx[xx[:, 0]!= -999][:,0])
    cluster, center, y_center = k_means_cluster(normal_x,normal_y, k)
    replace_list = np.zeros((abnormal_x.shape[0],1))
    for j in range(abnormal_x.shape[0]):
        #if j%100 == 0:
            #print('Replacing '+str(j)+' out of'+str(abnormal_x.shape[0]))       
        tt = np.zeros((center.shape[0],center.shape[1]))
        for i in range(center.shape[0]):
            tt[i] = abnormal_x[j]
        ff = np.mat(tt-center)
        distance_matrix = np.array((ff*ff.T).diagonal()).T
        class_num = np.argmin(distance_matrix)
        replace_list[j] = y_center[class_num]
    replace_list = np.c_[replace_list,abnormal_x]
    xx[np.where(xx[:, 0]== -999)[0]] = replace_list

# Model Training

## Ridge Regression

In [62]:
def find_optimal(x, y, degrees, k_fold, lambdas, k_clusters, seed=1):
    # Split the data into k-fold
    k_indices = build_k_indices(y, k_fold, seed)
    x_k = x.copy()
    # Set lists for collecting best lambda & rmse for each degree
    best_lambda = []
    best_rmse = []
    best_k = []
    best_degree = []
    
    for k_mean in k_clusters:
        x_k = x.copy()
        k_cluster = 10*k_mean+5
        k_means_replacing(x_k, k_cluster)
        x_k = normalize(standardize(x_k))
        for degree in degrees:
            rmse_val = []
        
            for lambda_ in lambdas:
                rmse_val_lambda_ = []
            
                for k in range(k_fold):
                    _, loss_val, w = cross_validation(y, x_k, k_indices, k, lambda_, degree)
                    rmse_val_lambda_.append(loss_val)
                
                print("No. of clusters {}".format(k_cluster))
                print("lambda {}".format(lambda_))
                
                print("loss {}".format(np.mean(rmse_val_lambda_)))
                print("degree {}".format(degree))
                print("\n\n")

                rmse_val.append(np.mean(rmse_val_lambda_))
        
            index_opt_lambda = np.argmin(rmse_val)
            best_lambda.append(lambdas[index_opt_lambda])
            best_rmse.append(rmse_val[index_opt_lambda])
            best_k.append(k_mean)
            best_degree.append(degree)

    opt_degree = best_degree[np.argmin(best_rmse)]
    opt_lambda = best_lambda[np.argmin(best_rmse)]
    opt_k = best_k[np.argmin(best_rmse)]
    
    return opt_degree, opt_lambda, opt_k

In [64]:
ls_opt_d = []
ls_opt_l = []
ls_opt_K = []

t0, t1, t2 = tX_jet_0.copy(), tX_jet_1.copy(), tX_jet_23.copy()
y0, y1, y2 = y_jet_0.copy(), y_jet_1.copy(), y_jet_23.copy()

train_set = [[t0, y0],[t1, y1],[t2, y2]]
# Set H-parameters
K_FOLD = 10
DEGREE = np.arange(4, 9)
k_clusters= np.arange(1,11)
SEED = 5
LAMBDA = np.logspace(-8, -3, 40)

for tx, y in train_set:
    opt_d, opt_l, opt_k= find_optimal(tx, y, DEGREE, K_FOLD, LAMBDA, k_clusters)
    print(opt_d, opt_l, opt_k)
    ls_opt_d.append(opt_d)
    ls_opt_l.append(opt_l)
    ls_opt_K.append(opt_k)

No. of clusters 15
lambda 1e-08
loss 0.7902881266033355
degree 4



No. of clusters 15
lambda 1.3433993325988987e-08
loss 0.7901464701702299
degree 4



No. of clusters 15
lambda 1.8047217668271702e-08
loss 0.7900094605427315
degree 4



No. of clusters 15
lambda 2.424462017082331e-08
loss 0.7898871586624165
degree 4



No. of clusters 15
lambda 3.257020655659783e-08
loss 0.7897896000357715
degree 4



No. of clusters 15
lambda 4.37547937507418e-08
loss 0.789725351517262
degree 4



No. of clusters 15
lambda 5.878016072274912e-08
loss 0.789700603466853
degree 4



No. of clusters 15
lambda 7.896522868499733e-08
loss 0.7897193295602778
degree 4



No. of clusters 15
lambda 1.0608183551394483e-07
loss 0.7897845712718852
degree 4



No. of clusters 15
lambda 1.4251026703029963e-07
loss 0.7899003925467503
degree 4



No. of clusters 15
lambda 1.9144819761699575e-07
loss 0.7900738914664809
degree 4



No. of clusters 15
lambda 2.571913809059347e-07
loss 0.7903167865351367
degree 4



No. of

No. of clusters 15
lambda 2.7283333764867697e-06
loss 0.7839862579975352
degree 6



No. of clusters 15
lambda 3.665241237079626e-06
loss 0.7846060135429059
degree 6



No. of clusters 15
lambda 4.923882631706732e-06
loss 0.7853274603923143
degree 6



No. of clusters 15
lambda 6.6147406412301455e-06
loss 0.7861706371502596
degree 6



No. of clusters 15
lambda 8.886238162743407e-06
loss 0.7871427676420998
degree 6



No. of clusters 15
lambda 1.1937766417144358e-05
loss 0.7882354304506434
degree 6



No. of clusters 15
lambda 1.6037187437513277e-05
loss 0.789425663558277
degree 6



No. of clusters 15
lambda 2.1544346900318823e-05
loss 0.7906807332673619
degree 6



No. of clusters 15
lambda 2.8942661247167517e-05
loss 0.7919651511308304
degree 6



No. of clusters 15
lambda 3.888155180308085e-05
loss 0.7932484353092658
degree 6



No. of clusters 15
lambda 5.223345074266833e-05
loss 0.794512269044303
degree 6



No. of clusters 15
lambda 7.017038286703822e-05
loss 0.7957554650198263


No. of clusters 15
lambda 0.000554102033000948
loss 0.7968453655284883
degree 8



No. of clusters 15
lambda 0.000744380301325168
loss 0.7987237220416477
degree 8



No. of clusters 15
lambda 0.001
loss 0.8008154747353433
degree 8



No. of clusters 25
lambda 1e-08
loss 0.7899485537462958
degree 4



No. of clusters 25
lambda 1.3433993325988987e-08
loss 0.7898092467850979
degree 4



No. of clusters 25
lambda 1.8047217668271702e-08
loss 0.789674060052889
degree 4



No. of clusters 25
lambda 2.424462017082331e-08
loss 0.7895530827303343
degree 4



No. of clusters 25
lambda 3.257020655659783e-08
loss 0.7894564623199049
degree 4



No. of clusters 25
lambda 4.37547937507418e-08
loss 0.7893928909647363
degree 4



No. of clusters 25
lambda 5.878016072274912e-08
loss 0.7893686290869475
degree 4



No. of clusters 25
lambda 7.896522868499733e-08
loss 0.7893876375712925
degree 4



No. of clusters 25
lambda 1.0608183551394483e-07
loss 0.7894528848884683
degree 4



No. of clusters 25
lambda

No. of clusters 25
lambda 1.1253355826007646e-06
loss 0.7822663803365472
degree 6



No. of clusters 25
lambda 1.51177507061566e-06
loss 0.7827340117623931
degree 6



No. of clusters 25
lambda 2.030917620904735e-06
loss 0.7832209074574713
degree 6



No. of clusters 25
lambda 2.7283333764867697e-06
loss 0.783753022572168
degree 6



No. of clusters 25
lambda 3.665241237079626e-06
loss 0.7843592589257763
degree 6



No. of clusters 25
lambda 4.923882631706732e-06
loss 0.7850671061509358
degree 6



No. of clusters 25
lambda 6.6147406412301455e-06
loss 0.7858970233832746
degree 6



No. of clusters 25
lambda 8.886238162743407e-06
loss 0.7868570209184189
degree 6



No. of clusters 25
lambda 1.1937766417144358e-05
loss 0.7879395841171339
degree 6



No. of clusters 25
lambda 1.6037187437513277e-05
loss 0.7891224417264981
degree 6



No. of clusters 25
lambda 2.1544346900318823e-05
loss 0.7903730853569744
degree 6



No. of clusters 25
lambda 2.8942661247167517e-05
loss 0.7916557382847543

No. of clusters 25
lambda 0.00022854638641349884
loss 0.7920416757444512
degree 8



No. of clusters 25
lambda 0.00030702906297578496
loss 0.7933655055743541
degree 8



No. of clusters 25
lambda 0.00041246263829013477
loss 0.7948396868121045
degree 8



No. of clusters 25
lambda 0.000554102033000948
loss 0.7965034507103936
degree 8



No. of clusters 25
lambda 0.000744380301325168
loss 0.7983815413260976
degree 8



No. of clusters 25
lambda 0.001
loss 0.8004835952630911
degree 8



No. of clusters 35
lambda 1e-08
loss 0.7902303157829563
degree 4



No. of clusters 35
lambda 1.3433993325988987e-08
loss 0.7900944346979044
degree 4



No. of clusters 35
lambda 1.8047217668271702e-08
loss 0.7899622103925376
degree 4



No. of clusters 35
lambda 2.424462017082331e-08
loss 0.789843172544898
degree 4



No. of clusters 35
lambda 3.257020655659783e-08
loss 0.7897469093863259
degree 4



No. of clusters 35
lambda 4.37547937507418e-08
loss 0.7896817051473003
degree 4



No. of clusters 35
lamb

No. of clusters 35
lambda 4.6415888336127725e-07
loss 0.7808903323787214
degree 6



No. of clusters 35
lambda 6.235507341273912e-07
loss 0.7814064408388913
degree 6



No. of clusters 35
lambda 8.376776400682924e-07
loss 0.781901720662852
degree 6



No. of clusters 35
lambda 1.1253355826007646e-06
loss 0.7823813604261285
degree 6



No. of clusters 35
lambda 1.51177507061566e-06
loss 0.7828588058230874
degree 6



No. of clusters 35
lambda 2.030917620904735e-06
loss 0.7833548483431417
degree 6



No. of clusters 35
lambda 2.7283333764867697e-06
loss 0.783895890383787
degree 6



No. of clusters 35
lambda 3.665241237079626e-06
loss 0.7845110587220603
degree 6



No. of clusters 35
lambda 4.923882631706732e-06
loss 0.7852278331794054
degree 6



No. of clusters 35
lambda 6.6147406412301455e-06
loss 0.7860664934880168
degree 6



No. of clusters 35
lambda 8.886238162743407e-06
loss 0.787034801700891
degree 6



No. of clusters 35
lambda 1.1937766417144358e-05
loss 0.7881250092663442
deg

No. of clusters 35
lambda 0.00012663801734674022
loss 0.7898634574743401
degree 8



No. of clusters 35
lambda 0.00017012542798525856
loss 0.7910281006261
degree 8



No. of clusters 35
lambda 0.00022854638641349884
loss 0.7922476562917896
degree 8



No. of clusters 35
lambda 0.00030702906297578496
loss 0.7935648842676508
degree 8



No. of clusters 35
lambda 0.00041246263829013477
loss 0.7950264188882366
degree 8



No. of clusters 35
lambda 0.000554102033000948
loss 0.7966725624616743
degree 8



No. of clusters 35
lambda 0.000744380301325168
loss 0.7985299384833485
degree 8



No. of clusters 35
lambda 0.001
loss 0.8006105291346464
degree 8



No. of clusters 45
lambda 1e-08
loss 0.7904983798040008
degree 4



No. of clusters 45
lambda 1.3433993325988987e-08
loss 0.7903554764926821
degree 4



No. of clusters 45
lambda 1.8047217668271702e-08
loss 0.7902162016020663
degree 4



No. of clusters 45
lambda 2.424462017082331e-08
loss 0.7900906060954351
degree 4



No. of clusters 45
lam

No. of clusters 45
lambda 2.571913809059347e-07
loss 0.7798920841249032
degree 6



No. of clusters 45
lambda 3.4551072945922185e-07
loss 0.780453032484524
degree 6



No. of clusters 45
lambda 4.6415888336127725e-07
loss 0.7810070516164628
degree 6



No. of clusters 45
lambda 6.235507341273912e-07
loss 0.7815429494286578
degree 6



No. of clusters 45
lambda 8.376776400682924e-07
loss 0.7820571387209199
degree 6



No. of clusters 45
lambda 1.1253355826007646e-06
loss 0.7825545285878209
degree 6



No. of clusters 45
lambda 1.51177507061566e-06
loss 0.7830482987507388
degree 6



No. of clusters 45
lambda 2.030917620904735e-06
loss 0.7835588841276457
degree 6



No. of clusters 45
lambda 2.7283333764867697e-06
loss 0.784112185614811
degree 6



No. of clusters 45
lambda 3.665241237079626e-06
loss 0.7847366958132553
degree 6



No. of clusters 45
lambda 4.923882631706732e-06
loss 0.7854592139843406
degree 6



No. of clusters 45
lambda 6.6147406412301455e-06
loss 0.7862994441491495
de

No. of clusters 45
lambda 5.223345074266833e-05
loss 0.7866353124829893
degree 8



No. of clusters 45
lambda 7.017038286703822e-05
loss 0.7877531145801925
degree 8



No. of clusters 45
lambda 9.426684551178853e-05
loss 0.7888826613455477
degree 8



No. of clusters 45
lambda 0.00012663801734674022
loss 0.7900212174821125
degree 8



No. of clusters 45
lambda 0.00017012542798525856
loss 0.7911823275432193
degree 8



No. of clusters 45
lambda 0.00022854638641349884
loss 0.7923961347377451
degree 8



No. of clusters 45
lambda 0.00030702906297578496
loss 0.7937050772692993
degree 8



No. of clusters 45
lambda 0.00041246263829013477
loss 0.7951554138000521
degree 8



No. of clusters 45
lambda 0.000554102033000948
loss 0.7967873076620189
degree 8



No. of clusters 45
lambda 0.000744380301325168
loss 0.7986277394047003
degree 8



No. of clusters 45
lambda 0.001
loss 0.8006896053526068
degree 8



No. of clusters 55
lambda 1e-08
loss 0.7905420919893049
degree 4



No. of clusters 55
la

No. of clusters 55
lambda 1.0608183551394483e-07
loss 0.778543644283943
degree 6



No. of clusters 55
lambda 1.4251026703029963e-07
loss 0.7790238358742395
degree 6



No. of clusters 55
lambda 1.9144819761699575e-07
loss 0.7795455659896335
degree 6



No. of clusters 55
lambda 2.571913809059347e-07
loss 0.780093136914118
degree 6



No. of clusters 55
lambda 3.4551072945922185e-07
loss 0.7806487356314185
degree 6



No. of clusters 55
lambda 4.6415888336127725e-07
loss 0.7811963298995087
degree 6



No. of clusters 55
lambda 6.235507341273912e-07
loss 0.7817252181363069
degree 6



No. of clusters 55
lambda 8.376776400682924e-07
loss 0.7822322615947066
degree 6



No. of clusters 55
lambda 1.1253355826007646e-06
loss 0.7827226654110598
degree 6



No. of clusters 55
lambda 1.51177507061566e-06
loss 0.7832096690018443
degree 6



No. of clusters 55
lambda 2.030917620904735e-06
loss 0.7837134900178413
degree 6



No. of clusters 55
lambda 2.7283333764867697e-06
loss 0.7842595612050454


No. of clusters 55
lambda 2.1544346900318823e-05
loss 0.7836920337417126
degree 8



No. of clusters 55
lambda 2.8942661247167517e-05
loss 0.7846546768389135
degree 8



No. of clusters 55
lambda 3.888155180308085e-05
loss 0.7856854955372455
degree 8



No. of clusters 55
lambda 5.223345074266833e-05
loss 0.7867636766532182
degree 8



No. of clusters 55
lambda 7.017038286703822e-05
loss 0.7878681170657108
degree 8



No. of clusters 55
lambda 9.426684551178853e-05
loss 0.788983945296693
degree 8



No. of clusters 55
lambda 0.00012663801734674022
loss 0.790108423689689
degree 8



No. of clusters 55
lambda 0.00017012542798525856
loss 0.7912548634849118
degree 8



No. of clusters 55
lambda 0.00022854638641349884
loss 0.7924530696375623
degree 8



No. of clusters 55
lambda 0.00030702906297578496
loss 0.7937452276215365
degree 8



No. of clusters 55
lambda 0.00041246263829013477
loss 0.7951776506028889
degree 8



No. of clusters 55
lambda 0.000554102033000948
loss 0.796791047340797
d

No. of clusters 65
lambda 4.37547937507418e-08
loss 0.7773659046019874
degree 6



No. of clusters 65
lambda 5.878016072274912e-08
loss 0.7776747054739823
degree 6



No. of clusters 65
lambda 7.896522868499733e-08
loss 0.7780407528284539
degree 6



No. of clusters 65
lambda 1.0608183551394483e-07
loss 0.7784638729247992
degree 6



No. of clusters 65
lambda 1.4251026703029963e-07
loss 0.7789394873818636
degree 6



No. of clusters 65
lambda 1.9144819761699575e-07
loss 0.7794574670504211
degree 6



No. of clusters 65
lambda 2.571913809059347e-07
loss 0.7800027654232036
degree 6



No. of clusters 65
lambda 3.4551072945922185e-07
loss 0.7805580785255155
degree 6



No. of clusters 65
lambda 4.6415888336127725e-07
loss 0.7811076624321839
degree 6



No. of clusters 65
lambda 6.235507341273912e-07
loss 0.7816408935842986
degree 6



No. of clusters 65
lambda 8.376776400682924e-07
loss 0.7821545640861076
degree 6



No. of clusters 65
lambda 1.1253355826007646e-06
loss 0.7826537235249026

No. of clusters 65
lambda 1.1937766417144358e-05
loss 0.7819892746627587
degree 8



No. of clusters 65
lambda 1.6037187437513277e-05
loss 0.7827918176096212
degree 8



No. of clusters 65
lambda 2.1544346900318823e-05
loss 0.7836828086780807
degree 8



No. of clusters 65
lambda 2.8942661247167517e-05
loss 0.784656526478203
degree 8



No. of clusters 65
lambda 3.888155180308085e-05
loss 0.7856981505408801
degree 8



No. of clusters 65
lambda 5.223345074266833e-05
loss 0.7867869804810766
degree 8



No. of clusters 65
lambda 7.017038286703822e-05
loss 0.7879020366281371
degree 8



No. of clusters 65
lambda 9.426684551178853e-05
loss 0.7890285086955732
degree 8



No. of clusters 65
lambda 0.00012663801734674022
loss 0.7901636337938005
degree 8



No. of clusters 65
lambda 0.00017012542798525856
loss 0.7913206326012561
degree 8



No. of clusters 65
lambda 0.00022854638641349884
loss 0.7925291941638128
degree 8



No. of clusters 65
lambda 0.00030702906297578496
loss 0.79383139990935

No. of clusters 75
lambda 2.424462017082331e-08
loss 0.7768919868659389
degree 6



No. of clusters 75
lambda 3.257020655659783e-08
loss 0.7770941396129676
degree 6



No. of clusters 75
lambda 4.37547937507418e-08
loss 0.777349774948118
degree 6



No. of clusters 75
lambda 5.878016072274912e-08
loss 0.7776612145040691
degree 6



No. of clusters 75
lambda 7.896522868499733e-08
loss 0.7780298962016206
degree 6



No. of clusters 75
lambda 1.0608183551394483e-07
loss 0.7784547527966311
degree 6



No. of clusters 75
lambda 1.4251026703029963e-07
loss 0.7789304389623799
degree 6



No. of clusters 75
lambda 1.9144819761699575e-07
loss 0.7794462951201072
degree 6



No. of clusters 75
lambda 2.571913809059347e-07
loss 0.7799870177222561
degree 6



No. of clusters 75
lambda 3.4551072945922185e-07
loss 0.7805352858438545
degree 6



No. of clusters 75
lambda 4.6415888336127725e-07
loss 0.7810755062813971
degree 6



No. of clusters 75
lambda 6.235507341273912e-07
loss 0.7815973044674647
d

No. of clusters 75
lambda 6.6147406412301455e-06
loss 0.7805849814811094
degree 8



No. of clusters 75
lambda 8.886238162743407e-06
loss 0.7812124105566617
degree 8



No. of clusters 75
lambda 1.1937766417144358e-05
loss 0.781915672102601
degree 8



No. of clusters 75
lambda 1.6037187437513277e-05
loss 0.7827047889726535
degree 8



No. of clusters 75
lambda 2.1544346900318823e-05
loss 0.7835835726093121
degree 8



No. of clusters 75
lambda 2.8942661247167517e-05
loss 0.7845466538409032
degree 8



No. of clusters 75
lambda 3.888155180308085e-05
loss 0.7855793220897839
degree 8



No. of clusters 75
lambda 5.223345074266833e-05
loss 0.7866607549361795
degree 8



No. of clusters 75
lambda 7.017038286703822e-05
loss 0.787769725600092
degree 8



No. of clusters 75
lambda 9.426684551178853e-05
loss 0.7888912048335551
degree 8



No. of clusters 75
lambda 0.00012663801734674022
loss 0.7900223658023925
degree 8



No. of clusters 75
lambda 0.00017012542798525856
loss 0.7911765755678957

No. of clusters 85
lambda 1.3433993325988987e-08
loss 0.7769013091219921
degree 6



No. of clusters 85
lambda 1.8047217668271702e-08
loss 0.7770076722946324
degree 6



No. of clusters 85
lambda 2.424462017082331e-08
loss 0.7771635761573756
degree 6



No. of clusters 85
lambda 3.257020655659783e-08
loss 0.7773698748484371
degree 6



No. of clusters 85
lambda 4.37547937507418e-08
loss 0.7776284947356351
degree 6



No. of clusters 85
lambda 5.878016072274912e-08
loss 0.7779417455759106
degree 6



No. of clusters 85
lambda 7.896522868499733e-08
loss 0.7783111412109529
degree 6



No. of clusters 85
lambda 1.0608183551394483e-07
loss 0.7787357123623545
degree 6



No. of clusters 85
lambda 1.4251026703029963e-07
loss 0.7792101984311834
degree 6



No. of clusters 85
lambda 1.9144819761699575e-07
loss 0.7797240171189381
degree 6



No. of clusters 85
lambda 2.571913809059347e-07
loss 0.7802619733173466
degree 6



No. of clusters 85
lambda 3.4551072945922185e-07
loss 0.780806936324763


No. of clusters 85
lambda 2.7283333764867697e-06
loss 0.7792820445242465
degree 8



No. of clusters 85
lambda 3.665241237079626e-06
loss 0.779752385801863
degree 8



No. of clusters 85
lambda 4.923882631706732e-06
loss 0.7802669430306396
degree 8



No. of clusters 85
lambda 6.6147406412301455e-06
loss 0.7808344933739788
degree 8



No. of clusters 85
lambda 8.886238162743407e-06
loss 0.7814659031910586
degree 8



No. of clusters 85
lambda 1.1937766417144358e-05
loss 0.7821727170482431
degree 8



No. of clusters 85
lambda 1.6037187437513277e-05
loss 0.7829641518946466
degree 8



No. of clusters 85
lambda 2.1544346900318823e-05
loss 0.7838433024037057
degree 8



No. of clusters 85
lambda 2.8942661247167517e-05
loss 0.7848043426242859
degree 8



No. of clusters 85
lambda 3.888155180308085e-05
loss 0.7858324803906381
degree 8



No. of clusters 85
lambda 5.223345074266833e-05
loss 0.7869071944254467
degree 8



No. of clusters 85
lambda 7.017038286703822e-05
loss 0.7880078354330448

No. of clusters 95
lambda 0.000554102033000948
loss 0.8126183842114865
degree 5



No. of clusters 95
lambda 0.000744380301325168
loss 0.8147717727548717
degree 5



No. of clusters 95
lambda 0.001
loss 0.8171409588787991
degree 5



No. of clusters 95
lambda 1e-08
loss 0.7766618098807565
degree 6



No. of clusters 95
lambda 1.3433993325988987e-08
loss 0.7767125670600687
degree 6



No. of clusters 95
lambda 1.8047217668271702e-08
loss 0.776811819434006
degree 6



No. of clusters 95
lambda 2.424462017082331e-08
loss 0.7769596474322302
degree 6



No. of clusters 95
lambda 3.257020655659783e-08
loss 0.7771574840526463
degree 6



No. of clusters 95
lambda 4.37547937507418e-08
loss 0.7774078112046207
degree 6



No. of clusters 95
lambda 5.878016072274912e-08
loss 0.7777134703561887
degree 6



No. of clusters 95
lambda 7.896522868499733e-08
loss 0.7780764970105368
degree 6



No. of clusters 95
lambda 1.0608183551394483e-07
loss 0.7784964285060778
degree 6



No. of clusters 95
lambda

No. of clusters 95
lambda 1.1253355826007646e-06
loss 0.7778727068485621
degree 8



No. of clusters 95
lambda 1.51177507061566e-06
loss 0.7782446655796601
degree 8



No. of clusters 95
lambda 2.030917620904735e-06
loss 0.7786449852481825
degree 8



No. of clusters 95
lambda 2.7283333764867697e-06
loss 0.779077072598778
degree 8



No. of clusters 95
lambda 3.665241237079626e-06
loss 0.779545530840166
degree 8



No. of clusters 95
lambda 4.923882631706732e-06
loss 0.7800568283182046
degree 8



No. of clusters 95
lambda 6.6147406412301455e-06
loss 0.7806199677227896
degree 8



No. of clusters 95
lambda 8.886238162743407e-06
loss 0.7812464826486091
degree 8



No. of clusters 95
lambda 1.1937766417144358e-05
loss 0.7819489766580885
degree 8



No. of clusters 95
lambda 1.6037187437513277e-05
loss 0.7827379485370929
degree 8



No. of clusters 95
lambda 2.1544346900318823e-05
loss 0.783617745683497
degree 8



No. of clusters 95
lambda 2.8942661247167517e-05
loss 0.78458348784848
deg

  center = np.matmul(np.transpose(one_hot2, (1, 0)), data) / np.expand_dims(np.sum(one_hot2, axis=0), axis=1)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


No. of clusters 105
lambda 1e-08
loss nan
degree 4



No. of clusters 105
lambda 1.3433993325988987e-08
loss nan
degree 4



No. of clusters 105
lambda 1.8047217668271702e-08
loss nan
degree 4



No. of clusters 105
lambda 2.424462017082331e-08
loss nan
degree 4



No. of clusters 105
lambda 3.257020655659783e-08
loss nan
degree 4



No. of clusters 105
lambda 4.37547937507418e-08
loss nan
degree 4



No. of clusters 105
lambda 5.878016072274912e-08
loss nan
degree 4



No. of clusters 105
lambda 7.896522868499733e-08
loss nan
degree 4



No. of clusters 105
lambda 1.0608183551394483e-07
loss nan
degree 4



No. of clusters 105
lambda 1.4251026703029963e-07
loss nan
degree 4



No. of clusters 105
lambda 1.9144819761699575e-07
loss nan
degree 4



No. of clusters 105
lambda 2.571913809059347e-07
loss nan
degree 4



No. of clusters 105
lambda 3.4551072945922185e-07
loss nan
degree 4



No. of clusters 105
lambda 4.6415888336127725e-07
loss nan
degree 4



No. of clusters 105
lambda 6.2

No. of clusters 105
lambda 0.000744380301325168
loss nan
degree 6



No. of clusters 105
lambda 0.001
loss nan
degree 6



No. of clusters 105
lambda 1e-08
loss nan
degree 7



No. of clusters 105
lambda 1.3433993325988987e-08
loss nan
degree 7



No. of clusters 105
lambda 1.8047217668271702e-08
loss nan
degree 7



No. of clusters 105
lambda 2.424462017082331e-08
loss nan
degree 7



No. of clusters 105
lambda 3.257020655659783e-08
loss nan
degree 7



No. of clusters 105
lambda 4.37547937507418e-08
loss nan
degree 7



No. of clusters 105
lambda 5.878016072274912e-08
loss nan
degree 7



No. of clusters 105
lambda 7.896522868499733e-08
loss nan
degree 7



No. of clusters 105
lambda 1.0608183551394483e-07
loss nan
degree 7



No. of clusters 105
lambda 1.4251026703029963e-07
loss nan
degree 7



No. of clusters 105
lambda 1.9144819761699575e-07
loss nan
degree 7



No. of clusters 105
lambda 2.571913809059347e-07
loss nan
degree 7



No. of clusters 105
lambda 3.4551072945922185e-07

NameError: name 'ls_opt_k' is not defined

## Logistic Regression

In [66]:
print(ls_opt_d, ls_opt_l)
opt_d_0 = 8
opt_d_1 = 8
opt_d_23 = 8

[4] [1e-08]


In [47]:
opt_l_0 = 3.3932217718953295e-07
opt_l_1 = 5.1794746792312124e-08
opt_l_23 = 1e-08

In [54]:
accuracy = []
op_lambda = []

In [57]:
i = 23
tx, opt_d, y_jet, opt_l = tx_23, opt_d_23, y_jet_23, opt_l_23
poly_tx_te = build_poly(tx, opt_d)
quanzhong, _ = ridge_regression(y_jet, poly_tx_te, opt_l)
p = predict_labels(quanzhong, poly_tx_te)
print('Accuracy for jet = '+ str(i))
accuracy.append(np.sum(p==y_jet))
op_lambda.append(opt_l)
print(accuracy, len(y_jet))
print(np.sum(p==y_jet)/len(y_jet))


Accuracy for jet = 23
[82954, 60476, 58330] 72543
0.8040748245867968


In [58]:
#k mean cluster 11
kmc1_acc = accuracy
kmc1_lambda = op_lambda
print(sum(kmc_acc)/250000)
print(kmc1_acc)
print(kmc1_lambda)

0.805564
[82954, 60476, 58330]
[3.3932217718953295e-07, 5.1794746792312124e-08, 1e-08]


In [22]:
#linear Regression

#kmc_acc = accuracy
#kmc_lambda = op_lambda
print(sum(kmc_acc)/250000)
print(kmc_acc)
print(kmc_lambda)

0.805564
[82891, 60519, 57981]
[1.67683293681101e-07, 1e-08, 1e-08]


In [338]:
# meaningless
#regression2_acc = accuracy 
#mei you normalize
#regression2_lambda = op_lambda
print(sum(regression2_acc)/250000)
print(regression2_acc)
print(regression2_lambda)

0.795996
[81711, 59334, 57954]
[0.0007880462815669912, 2.811768697974231e-06, 1.030917620904739e-10]


In [None]:
mean_acc = accuracy
mean_lambda = op_lambda
print(sum(mean_acc)/250000)
print(mean_acc)
print(mean_lambda)

In [129]:
#median_acc = accuracy
#median_lambda = op_lambda
print(sum(median_acc)/250000)
print(median_acc)
print(median_lambda)

0.802208
[81614, 60623, 58315]
[0.0014873521072935117, 0.0014873521072935117, 0.0028072162039411755]


In [118]:
#median2_acc = accuracy 80.2
#median2_lambda = op_lambda
print(sum(median2_acc)/250000)
print(median2_acc)
print(median2_lambda)

0.801596
[81614, 60558, 58227]
[0.0014873521072935117, 0.01, 0.007278953843983146]


## Generate predictions and save ouput in csv format for submission:

In [139]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
# OUTPUT_PATH = 'data/pred.csv' # TODO: fill in desired name of output file for submission
# y_pred = predict_labels(weights, tX_test)
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [145]:
t0, t1, t23, t_ids = split_reformat_test(tX_test, ids_test)


print(t0.shape)
print(t1.shape)
print(t23.shape)

(227458, 18)
(175338, 22)
(165442, 29)


In [154]:
len(quanzhong)

111

In [158]:
poly_t1 = build_poly(t1, opt_d)


p = predict_labels(quanzhong, poly_t1)

In [163]:
len(p[p == 1]) / len(p) 

0.1306904378970902

In [164]:
len(y_jet_1[y_jet_1 == 1]) / len(y_jet_1) 

0.35734550706695556