In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
# Import files to use in preprocessing and machine learning
from implementations import *
from proj1_helpers import *
from preprocess import *
from cross_validation import *

## Load the training data into feature matrix, class labels, and event ids:

In [46]:
# Download the training data and supply its path here.
# load_csv_data returns three values, unpacked as labels (y), feature matrix (tX)
# and event ids (ids).
DATA_TRAIN_PATH = '../data/train.csv' 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [4]:
# Show the shapes of y, tX and ids, one per line
print(*(loaded.shape for loaded in (y, tX, ids)), sep='\n')

(250000,)
(250000, 30)
(250000,)


# Initial Data Analysis

While inspecting the original training data, we found that values are missing throughout tX. Missing values are encoded as -999. Because the affected columns are critical for model training, we cannot simply drop the rows that contain -999. Therefore, we need to process the original training set before model training.

Firstly, we check the columns of tX to obtain an overview of missing data:

In [5]:
# Check whether the missing values are associated with the classification result.
# For every feature column that contains the -999 sentinel, print the share of
# rows affected and how the labels (y >= 0 vs. y < 0) split among those rows.
# Only boolean masks are counted; no row subsets of tX are copied.
n_rows = tX.shape[0]
is_pos = (y >= 0)
is_neg = (y < 0)

for col in range(tX.shape[1]):
    # Rows where column 'col' holds the missing-value sentinel
    null = (tX[:, col] == -999)
    n_null = int(np.count_nonzero(null))

    # Columns without any missing value are not reported
    if n_null == 0:
        continue

    n_null_s = int(np.count_nonzero(np.logical_and(is_pos, null)))
    n_null_b = int(np.count_nonzero(np.logical_and(is_neg, null)))

    # Print the percentage of column 'col' having a -999 (missing) value
    print('Column', col, 'has {}% percentage of missing values'.format(n_null * 100 / n_rows))

    # Print the conditional probability of P(y = 1|x having -999)
    print('P(y = 1|x having -999) = {:.3f}%'.format(n_null_s * 100 / n_null))

    # Print the conditional probability of P(y = -1|x having -999)
    print('P(y = -1|x having -999) = {:.3f}% \n'.format(n_null_b * 100 / n_null))

Column 0 has 15.2456% percentage of missing values
P(y = 1|x having -999) = 7.438%
P(y = -1|x having -999) = 92.562% 

Column 4 has 70.9828% percentage of missing values
P(y = 1|x having -999) = 29.980%
P(y = -1|x having -999) = 70.020% 

Column 5 has 70.9828% percentage of missing values
P(y = 1|x having -999) = 29.980%
P(y = -1|x having -999) = 70.020% 

Column 6 has 70.9828% percentage of missing values
P(y = 1|x having -999) = 29.980%
P(y = -1|x having -999) = 70.020% 

Column 12 has 70.9828% percentage of missing values
P(y = 1|x having -999) = 29.980%
P(y = -1|x having -999) = 70.020% 

Column 23 has 39.9652% percentage of missing values
P(y = 1|x having -999) = 25.514%
P(y = -1|x having -999) = 74.486% 

Column 24 has 39.9652% percentage of missing values
P(y = 1|x having -999) = 25.514%
P(y = -1|x having -999) = 74.486% 

Column 25 has 39.9652% percentage of missing values
P(y = 1|x having -999) = 25.514%
P(y = -1|x having -999) = 74.486% 

Column 26 has 70.9828% percentage of 

We can see that 11 columns contain at least one -999 (missing value). Next, we check whether some of the missing values depend on the 'PRI_jet_num' column (the 23rd column, i.e. index 22 in tX), since 'PRI_jet_num' takes discrete values in {0, 1, 2, 3} and the first few data rows suggested that some missing values depend on the value of 'PRI_jet_num'.

In [6]:
# For each value of PRI_jet_num (column index 22), collect:
#   - how many rows have that value (PRI_jet_sum), and
#   - the distinct numbers of -999 entries found in a single row (PRI_jet_null).
PRI_jet_range = list(range(4))
PRI_jet_sum = []
PRI_jet_null = []

for value in PRI_jet_range:
    tX_PRI = tX[tX[:, 22] == value]

    # Row counts per PRI_jet_num value; their total is compared with len(tX) below
    PRI_jet_sum.append(len(tX_PRI))

    # Number of -999 entries in each row, counted for all rows at once
    null_per_row = np.count_nonzero(tX_PRI == -999, axis=1)

    # np.unique returns the distinct counts in ascending order; tolist() yields
    # plain Python ints so the printed list is stable and readable
    PRI_jet_null.append(np.unique(null_per_row).tolist())


print("Sum of rows for different PRI_jet_num: {} \n".format(sum(PRI_jet_sum)))

for value, null_counts in zip(PRI_jet_range, PRI_jet_null):
    print("PRI_jet_num =", value, "No. of columns having -999 (a missing value):{}".format(null_counts))

Sum of rows for different PRI_jet_num: 250000 

PRI_jet_num = 0 No. of columns having -999 (a missing value):[10, 11]
PRI_jet_num = 1 No. of columns having -999 (a missing value):[8, 7]
PRI_jet_num = 2 No. of columns having -999 (a missing value):[0, 1]
PRI_jet_num = 3 No. of columns having -999 (a missing value):[0, 1]


The above analysis shows that exactly one column with -999 (missing value) is independent of the 'PRI_jet_num' column: for every value of 'PRI_jet_num', the number of missing entries per row takes only two values that differ by one. Checking the original training set, we can easily see that this column is the first tX column, 'DER_mass_MMC'. 

# Data Preprocessing

Based on the data analysis above, we conduct the following method to pre-process the training data. 

# Model Training

## Least Squares

## Ridge Regression

In [12]:
# Hyper-parameters for the cross-validated grid search
LAMBDA = np.logspace(-6, -2, num=30)   # 30 log-spaced candidate lambdas in [1e-6, 1e-2]
DEGREE = np.arange(1, 7 + 1)           # candidate degrees 1..7
K_FOLD = 10                            # number of cross-validation folds
SEED = 5                               # seed value (not referenced by the cells visible here)

In [13]:
def find_optimal(x, y, degrees, k_fold, lambdas, seed=1, verbose=True):
    """Grid-search the (degree, lambda) pair with the lowest mean validation loss.

    For every degree and every lambda, `cross_validation` is called once per
    fold and the second element of its return value (the validation loss) is
    averaged over the folds. The best lambda is kept per degree, and the degree
    whose best lambda gives the smallest mean loss wins. Ties are resolved in
    favour of the first candidate, as done by `np.argmin`.

    Args:
        x: features, forwarded unchanged to `cross_validation`.
        y: labels, forwarded to `build_k_indices` and `cross_validation`.
        degrees: non-empty indexable sequence of candidate degrees.
        k_fold: number of folds.
        lambdas: non-empty indexable sequence of candidate lambda values.
        seed: seed forwarded to `build_k_indices`.
        verbose: when True (default), print lambda, mean loss and degree for
            every grid point; pass False to silence the log.

    Returns:
        Tuple `(opt_degree, opt_lambda)` taken from `degrees` and `lambdas`.

    Raises:
        ValueError: if `degrees` or `lambdas` is empty.
    """
    if len(degrees) == 0 or len(lambdas) == 0:
        raise ValueError("degrees and lambdas must both be non-empty")

    # Split the data into k-fold
    k_indices = build_k_indices(y, k_fold, seed)

    # Best lambda and its mean validation loss, one entry per degree
    best_lambda = []
    best_rmse = []

    for degree in degrees:
        rmse_val = []

        for lambda_ in lambdas:
            # Only the validation loss (second returned value) is needed here
            fold_losses = [
                cross_validation(y, x, k_indices, k, lambda_, degree)[1]
                for k in range(k_fold)
            ]
            mean_loss = np.mean(fold_losses)

            if verbose:
                print("lambda {}".format(lambda_))
                print("loss {}".format(mean_loss))
                print("degree {}".format(degree))
                print("\n\n")

            rmse_val.append(mean_loss)

        index_opt_lambda = np.argmin(rmse_val)
        best_lambda.append(lambdas[index_opt_lambda])
        best_rmse.append(rmse_val[index_opt_lambda])

    index_opt_degree = np.argmin(best_rmse)

    return degrees[index_opt_degree], best_lambda[index_opt_degree]

In [48]:
def train_models(feature, label, degrees, k_fold, lambdas, seed=1):
    """Run the degree/lambda grid search once per data subset.

    `split_reformat_feature(feature)` and `split_label(feature, label)` are
    expected to return parallel sequences of feature subsets and label subsets
    (presumably one per PRI_jet_num group -- confirm against preprocess.py).
    `find_optimal` is called on each pair.

    Args:
        feature: full feature matrix.
        label: labels matching `feature`.
        degrees: candidate degrees forwarded to `find_optimal`.
        k_fold: number of cross-validation folds.
        lambdas: candidate lambda (regularisation) values.
        seed: seed forwarded to `find_optimal` for the k-fold split. Defaults
            to 1, the value `find_optimal` used when no seed was forwarded.

    Returns:
        Tuple `(opt_degree, opt_lambda)` of lists with one entry per subset.
    """
    opt_degree, opt_lambda = [], []

    feature_arr = split_reformat_feature(feature)
    label_arr = split_label(feature, label)

    # Cross-validate each subset independently to select its best polynomial
    # degree and regularisation strength (lambda)
    for f, l in zip(feature_arr, label_arr):
        opt_d, opt_l = find_optimal(f, l, degrees, k_fold, lambdas, seed=seed)
        opt_degree.append(opt_d)
        opt_lambda.append(opt_l)

    return opt_degree, opt_lambda

## Logistic Regression

## Generate predictions and save output in csv format for submission:

In [33]:
# Load the test set; the first value returned by load_csv_data is discarded here.
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [50]:
# Grid-search the best degree and lambda for every data subset.
# NOTE(review): SEED is not passed in this call, so the k-fold split uses the
# default seed (1) rather than SEED.
best_degree, best_lambda = train_models(tX, y, DEGREE, K_FOLD, LAMBDA)


lambda 1e-06
loss 0.8718895832839373
degree 1



lambda 1.3738237958832638e-06
loss 0.8718895832886332
degree 1



lambda 1.8873918221350957e-06
loss 0.8718895832953357
degree 1



lambda 2.592943797404667e-06
loss 0.8718895833050165
degree 1



lambda 3.562247890262444e-06
loss 0.8718895833192093
degree 1



lambda 4.893900918477499e-06
loss 0.8718895833403929
degree 1



lambda 6.723357536499335e-06
loss 0.8718895833726762
degree 1



lambda 9.236708571873865e-06
loss 0.8718895834230311
degree 1



lambda 1.2689610031679234e-05
loss 0.8718895835035397
degree 1



lambda 1.7433288221999873e-05
loss 0.8718895836355284
degree 1



lambda 2.395026619987486e-05
loss 0.8718895838572136
degree 1



lambda 3.290344562312671e-05
loss 0.8718895842379328
degree 1



lambda 4.520353656360241e-05
loss 0.8718895849047067
degree 1



lambda 6.210169418915616e-05
loss 0.871889586091975
degree 1



lambda 8.531678524172815e-05
loss 0.8718895882348981
degree 1



lambda 0.00011721022975334806
loss 0.8

KeyboardInterrupt: 

In [31]:
# OUTPUT_PATH = 'data/pred.csv' # TODO: fill in desired name of output file for submission
# y_pred = predict_labels(weights, tX_test)
# create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [145]:
# Split the test set into the subsets t0, t1, t23 (plus t_ids) and show the
# shape of each feature subset, one per line
t0, t1, t23, t_ids = split_reformat_test(tX_test, ids_test)


print(*(subset.shape for subset in (t0, t1, t23)), sep='\n')

(227458, 18)
(175338, 22)
(165442, 29)


In [158]:
# Expand the t1 test subset with build_poly, then predict its labels.
# NOTE(review): `opt_d` is only assigned as a local inside train_models, and
# `quanzhong` (possibly pinyin for "weights" -- confirm) is not assigned in any
# cell visible here; both presumably come from leftover kernel state. Verify
# that this cell runs after Restart Kernel -> Run All.
poly_t1 = build_poly(t1, opt_d)


p = predict_labels(quanzhong, poly_t1)

In [163]:
# Fraction of predictions in p that are equal to 1
len(p[p == 1]) / len(p) 

0.1306904378970902

In [164]:
# Fraction of entries in y_jet_1 that are equal to 1, for comparison with the
# predicted fraction in the previous cell.
# NOTE(review): `y_jet_1` is not assigned in any cell visible here -- confirm
# where it is defined.
len(y_jet_1[y_jet_1 == 1]) / len(y_jet_1) 

0.35734550706695556