In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
from implementations import *

In [39]:
def standardize(x, mean_x=None, std_x=None):
    """Standardize the original data set."""
    if mean_x is None:
        mean_x = np.nanmean(x, axis=0)
    if std_x is None:
        std_x = np.nanstd(x, axis=0)
    x = x - mean_x
    x = x / std_x
    return x, mean_x, std_x

def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    poly = np.ones((len(x), 1))
    for deg in range(1, degree+1):
        poly = np.c_[poly, np.power(x, deg)]
    return poly

def split_data(x, y, ratio, myseed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(myseed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

In [12]:
def accuracy(y_pred, y_true):
    return (y_pred == y_true).sum()/len(y_true)

In [3]:
def tozeroone(y):
    """[-1, 1] -> [0, 1]"""
    return (y + 1) / 2
def tonegposone(y):
    """[0, 1] -> [-1, 1]"""
    return y * 2 - 1

In [131]:
# data path
train_data_path = "../data/train.csv"
test_data_path = "../data/test.csv"

# load training and test set
y_train_data, x_train_data, id_train_data = load_csv_data(train_data_path)
y_test_data, x_test_data, id_test_data = load_csv_data(test_data_path)

# number of samples
N = x_tr.shape[0]
# number of features
m = x_tr.shape[1]

In [132]:
N, m

(200000, 30)

## Data Exploration

In [143]:
import pandas as pd

In [146]:
df_train = pd.read_csv(train_data_path, index_col='Id')
df_train = df_train[df_train.columns[1:]]

In [151]:
feature_name = df_train.columns
feature_name

Index(['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis',
       'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt',
       'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi',
       'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi',
       'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt',
       'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt',
       'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt'],
      dtype='object')

In [158]:
df_train = df_train.replace(-999, np.nan)

In [165]:
df_train[df_train["PRI_jet_num"]==0].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DER_mass_MMC,73790.0,120.667654,51.750062,9.044,90.241,111.452,137.3885,863.647
DER_mass_transverse_met_lep,99913.0,58.786239,32.003551,0.002,35.062,62.144,79.65,570.115
DER_mass_vis,99913.0,81.870309,38.043669,7.12,60.947,75.533,94.135,1349.351
DER_pt_h,99913.0,13.823867,16.674707,0.0,2.275,6.677,24.202,2834.999
DER_deltaeta_jet_jet,0.0,,,,,,,
DER_mass_jet_jet,0.0,,,,,,,
DER_prodeta_jet_jet,0.0,,,,,,,
DER_deltar_tau_lep,99913.0,2.664961,0.693295,0.277,2.338,2.822,3.095,5.684
DER_pt_tot,99913.0,13.823867,16.674707,0.0,2.275,6.677,24.202,2834.999
DER_sum_pt,99913.0,76.377011,23.561057,46.104,61.297,71.586,85.242,1324.678


In [166]:
df_train[df_train["PRI_jet_num"]==1].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DER_mass_MMC,69982.0,122.182109,59.344924,9.806,92.15325,112.4055,136.13175,1192.026
DER_mass_transverse_met_lep,77544.0,46.0536,35.58119,0.0,16.2675,40.4985,70.013,571.868
DER_mass_vis,77544.0,82.219033,42.288122,6.329,59.80975,73.936,92.67975,959.601
DER_pt_h,77544.0,65.90309,47.431007,0.0,37.131,53.081,79.002,753.745
DER_deltaeta_jet_jet,0.0,,,,,,,
DER_mass_jet_jet,0.0,,,,,,,
DER_prodeta_jet_jet,0.0,,,,,,,
DER_deltar_tau_lep,77544.0,2.339686,0.737633,0.208,1.855,2.404,2.855,5.655
DER_pt_tot,77544.0,16.645,17.007924,0.0,2.895,10.705,26.21425,330.527
DER_sum_pt,77544.0,150.368035,65.182087,77.017,110.75275,132.3265,167.624,1214.932


In [167]:
df_train[df_train["PRI_jet_num"]==2].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DER_mass_MMC,47427.0,122.653136,55.74963,9.878,95.696,115.254,133.742,966.986
DER_mass_transverse_met_lep,50379.0,38.343612,35.273944,0.0,11.6055,28.501,56.512,595.819
DER_mass_vis,50379.0,79.213348,39.917128,7.33,58.525,72.27,89.5055,1051.358
DER_pt_h,50379.0,102.985028,70.529284,0.12,54.0655,87.682,134.1345,1053.807
DER_deltaeta_jet_jet,50379.0,2.606534,1.812621,0.0,0.984,2.383,4.004,8.503
DER_mass_jet_jet,50379.0,391.40645,426.852418,13.602,106.676,228.643,513.6745,4974.979
DER_prodeta_jet_jet,50379.0,-1.115458,3.821072,-18.066,-3.251,-0.465,0.869,16.69
DER_deltar_tau_lep,50379.0,2.060607,0.759354,0.228,1.485,2.02,2.632,5.579
DER_pt_tot,50379.0,17.280603,20.409682,0.004,2.7745,9.569,25.967,513.659
DER_sum_pt,50379.0,245.776275,101.356966,110.601,177.702,218.683,283.0885,1282.523


In [169]:
df_train[df_train["PRI_jet_num"]==3].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DER_mass_MMC,20687.0,123.18999,70.899464,15.731,88.888,107.534,132.362,988.199
DER_mass_transverse_met_lep,22164.0,42.120203,38.075038,0.0,13.42575,32.165,62.188,690.075
DER_mass_vis,22164.0,78.925522,48.790879,10.342,54.03725,67.991,87.76325,1329.913
DER_pt_h,22164.0,126.066343,83.821859,0.777,65.49775,108.909,166.42025,762.806
DER_deltaeta_jet_jet,22164.0,1.94277,1.469926,0.0,0.714,1.6345,2.877,7.877
DER_mass_jet_jet,22164.0,327.179877,317.279844,17.084,123.78975,221.3345,411.927,4062.147
DER_prodeta_jet_jet,22164.0,-0.153947,2.867334,-15.347,-1.34525,-0.015,1.11925,14.772
DER_deltar_tau_lep,22164.0,1.88462,0.82726,0.379,1.23,1.767,2.491,5.505
DER_pt_tot,22164.0,53.548536,32.299212,0.04,33.20825,45.6945,66.12125,466.525
DER_sum_pt,22164.0,358.00831,150.138383,145.486,254.9965,320.584,417.14875,1852.462


## Preprocessing

In [231]:
def preprocessing(input_data):
    data = input_data.copy()
    # replace -999 with nan
    data[data == -999] = np.nan

    # replace nan with the most frequent elemet
    for i in range(data.shape[1]):
        data_i = data[:, i]
        u, counts = np.unique(data_i, return_counts=True)
        most_freq = u[np.argmax(counts)]
        data_i[data_i!=data_i] = most_freq

    # get mean and std
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)

    # normalize
    data = data - mean
    data = data / std
    
    return data

In [232]:
data = preprocessing(x_train_data)

In [233]:
data

array([[ 0.3827365 ,  0.06833197,  0.40768027, ...,  1.83381567,
        -0.69661581,  0.4125105 ],
       [ 0.80239171,  0.55250482,  0.54013641, ..., -0.36652212,
        -0.4496554 , -0.27381996],
       [-0.40279783,  3.19515553,  1.09655998, ..., -0.36652212,
        -0.4496554 , -0.29396985],
       ..., 
       [-0.23390468,  0.31931645, -0.13086367, ..., -0.36652212,
        -0.4496554 , -0.31701723],
       [-0.4301435 , -0.84532397, -0.30297338, ..., -0.36652212,
        -0.4496554 , -0.74543941],
       [-0.40279783,  0.66533608, -0.25352276, ..., -0.36652212,
        -0.4496554 , -0.74543941]])

In [230]:
data.mean(0)

array([  1.82705611e-12,   4.49575133e-15,  -3.48448848e-15,
         7.18646387e-15,  -4.54941415e-13,  -1.32736188e-12,
        -3.61011410e-15,   2.16223188e-14,   6.40057962e-15,
         2.86143687e-15,  -6.98486646e-15,   3.63458152e-15,
        -1.02319619e-15,  -5.95722149e-15,   1.35646161e-16,
         7.13136217e-17,   2.58023760e-14,  -1.06327391e-16,
        -1.87188487e-16,   8.24115935e-15,   1.41040513e-16,
        -8.99509711e-15,  -6.01698247e-16,   7.06883188e-13,
         9.08278244e-14,  -1.05442427e-13,   8.84288113e-13,
        -7.17539436e-13,   2.75050842e-12,  -1.77122317e-15])

In [206]:
data[:, 0][:4]

array([ 0.13631768,  0.32068721,  3.        ,  0.18091858])

In [191]:
u[np.argmax(counts)-4:np.argmax(counts)+4]

array([ 96.411,  96.412,  96.413,  96.414,  96.415,  96.416,  96.417,
        96.418])

## Create train and validataion set

In [42]:
x_tr, x_va, y_tr, y_va = split_data(x_tr, y_tr, 0.8)

In [43]:
x_tr.shape[0], x_va.shape[0]

(200000, 50000)

## Preprocessing + Feature Engineering

In [44]:
# TODO: more sophisticated data preprocessing
# replace missing data with nan
x_tr[x_tr == -999] = np.nan
x_va[x_va == -999] = np.nan
x_te[x_te == -999] = np.nan
# normalize
x_tr, mean, std = standardize(x_tr)
x_va, _, _ = standardize(x_va, mean, std)
x_te, _, _ = standardize(x_te, mean, std)
# fill nan
x_tr = np.nan_to_num(x_tr)
x_va = np.nan_to_num(x_va)
x_te = np.nan_to_num(x_te)

## Ridge Regression

In [97]:
lambda_ = 0.001

w, loss = ridge_regression(y_tr, x_tr, lambda_)

y_pred = predict_labels(w, x_va)

accuracy(y_pred, y_va)

0.71636

## Logistic Regression

### L2 Normalization and SGD

In [124]:
# initialize weights
initial_w = np.random.rand(m)

# TODO: cross validataion
# hyperparameters
max_iters = 100
gamma = 0.00001
lambda_ = 0.1

In [130]:
# run regression
y_tr_log = tozeroone(y_tr)
# initialize weight
w = initial_w

losses = []
for n_iter in range(max_iters):
    # compute a gradient
    grad = compute_gradient_reg_logistic(y_tr_log, x_tr, w, lambda_)
    # update w through the gradient update
    w = w - gamma * grad
    # calculate loss
    loss = compute_nl_loss_regularization(y_tr_log, x_tr, w, lambda_)
    losses.append(loss)

plt.plot(losses)

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [94]:


y_pred = predict_labels(w, x_va)

accuracy(y_pred, y_va)

0.68522000000000005

### L2 Normalization and GD

### L1 Normaliziation and SGD

## Create submission

In [22]:
# predict
y_pred = predict_labels(weights, x_te)

# create submission
name = "submission.csv"
create_csv_submission(id_te, y_pred, name)