In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
from implementations import *

In [3]:
def standardize(x, mean_x=None, std_x=None):
    """Standardize the columns of a data set, ignoring NaN entries.

    Args:
        x: array-like of numbers; statistics are taken along axis 0.
        mean_x: per-column mean to subtract. Computed from ``x`` with
            ``np.nanmean`` when ``None``.
        std_x: per-column standard deviation to divide by. Computed from
            ``x`` with ``np.nanstd`` when ``None``.

    Returns:
        Tuple ``(x_standardized, mean_x, std_x)``. ``mean_x`` and ``std_x``
        are the statistics that were passed in or computed (returned
        unmodified), so they can be reused on another data set.
    """
    if mean_x is None:
        mean_x = np.nanmean(x, axis=0)
    if std_x is None:
        std_x = np.nanstd(x, axis=0)
    # A constant column has std == 0; dividing by it would produce NaN/inf.
    # Divide such columns by 1 instead, so they are only centred.
    safe_std = np.where(np.asarray(std_x) == 0, 1.0, std_x)
    x = (x - mean_x) / safe_std
    return x, mean_x, std_x

def build_poly(x, degree):
    """Build a polynomial feature expansion of x for powers 0..degree.

    Args:
        x: 1-D or 2-D array-like of input features, one sample per row.
        degree: highest power to include; powers 1..degree are applied
            element-wise to x.

    Returns:
        2-D array whose first column is all ones, followed by x**1, x**2,
        ..., x**degree stacked side by side.
    """
    # Collect every block first and stack once, instead of re-allocating the
    # growing matrix on each loop iteration.
    blocks = [np.ones((len(x), 1))]
    blocks.extend(np.power(x, deg) for deg in range(1, degree + 1))
    return np.column_stack(blocks)

def split_data(x, y, ratio, myseed=1):
    """Randomly partition (x, y) into a training part and a test part.

    The global NumPy random generator is seeded with ``myseed``, the row
    indices ``0..len(y)-1`` are shuffled, and the first
    ``floor(ratio * len(y))`` shuffled rows form the training part.

    Returns:
        Tuple ``(x_tr, x_te, y_tr, y_te)``.
    """
    np.random.seed(myseed)

    n_samples = len(y)
    n_train = int(np.floor(ratio * n_samples))

    # Shuffle all row indices, then cut the permutation at n_train.
    shuffled = np.random.permutation(n_samples)
    train_rows, test_rows = shuffled[:n_train], shuffled[n_train:]

    return x[train_rows], x[test_rows], y[train_rows], y[test_rows]

In [4]:
def accuracy(y_pred, y_true):
    """Return the fraction of positions where y_pred equals y_true.

    The number of element-wise matches is divided by ``len(y_true)``.
    """
    matches = y_pred == y_true
    n_correct = matches.sum()
    return n_correct / len(y_true)

In [5]:
# Locations of the two CSV files, relative to this notebook.
train_data_path, test_data_path = "../data/train.csv", "../data/test.csv"

# load_csv_data returns a 3-tuple per file; it is unpacked here as
# (y, x, id) for the training file and then for the test file.
(
    (y_train_data, x_train_data, id_train_data),
    (y_test_data, x_test_data, id_test_data),
) = (load_csv_data(path) for path in (train_data_path, test_data_path))

In [6]:
# Column indices of x_train_data / x_test_data that are passed through
# np.log1p further down in this notebook.
# NOTE(review): the name suggests these features have long-tailed
# distributions — confirm against the data.
long_tail = [0, 1, 2, 3, 5, 8, 9, 10, 13, 16, 19, 21, 23, 26, 29]

In [7]:
# Column indices removed (np.delete along axis 1) from each per-jet subset
# before the polynomial expansion. The subsets are selected by the value of
# column 22, and column 22 itself appears in every list.
# NOTE(review): the extra columns for jet 0 / jet 1 are presumably features
# that carry no information for those subsets — confirm against the data.
jet0_drop = [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]
jet1_drop = [4, 5, 6, 12, 22, 26, 27, 28]
jet2_drop = [22]
jet3_drop = [22]

In [8]:
# Row indices of the training samples whose column 22 equals 0, 1, 2 and 3.
# Computed on the raw matrix, before column 22 is rescaled further down.
jet0_index, jet1_index, jet2_index, jet3_index = (
    np.flatnonzero(x_train_data[:, 22] == jet_value) for jet_value in range(4)
)

In [9]:
# Column 0 is kept as-is; in every other column the value -999 is replaced
# by NaN so the NaN-aware statistics below skip those entries.
zero_col = x_train_data[:, :1].copy()
other_col = x_train_data[:, 1:].copy()
np.putmask(other_col, other_col == -999, np.nan)

# Reassemble the matrix with the columns in their original order.
x_train_data = np.concatenate((zero_col, other_col), axis=1)

In [10]:
# Sanity check: display the shape of the reassembled training matrix.
x_train_data.shape

(250000, 30)

In [11]:
# Replace every long-tail column by log(1 + value).
# NOTE(review): column 0 is in long_tail but was excluded from the -999 -> NaN
# masking above, so any -999 left in it reaches log1p, which returns NaN and
# makes NumPy emit an "invalid value" RuntimeWarning — confirm this is intended.
x_train_data[:, long_tail] = np.log1p(np.take(x_train_data, long_tail, axis=1))

  """Entry point for launching an IPython kernel.


In [12]:
# Per-column mean and standard deviation of the training features; NaN
# entries are skipped. The same statistics are reused to scale the test set.
mean, std = (
    np.nanmean(x_train_data, axis=0),
    np.nanstd(x_train_data, axis=0),
)

In [13]:
# Standardize the training features in place: centre each column, then
# scale it by its standard deviation.
np.subtract(x_train_data, mean, out=x_train_data)
np.divide(x_train_data, std, out=x_train_data)

In [14]:
# Replace the remaining NaN entries with 0.0 (np.nan_to_num also maps +/-inf
# to very large finite values). The columns were centred in the previous
# cell, so 0.0 here corresponds to the column mean.
x_train_data = np.nan_to_num(x_train_data)

In [15]:
# For each jet value: take that subset's rows, then remove the columns
# listed in the matching jet*_drop list.
x_jet0, x_jet1, x_jet2, x_jet3 = (
    np.delete(x_train_data[subset_rows, :], subset_drop, axis=1)
    for subset_rows, subset_drop in (
        (jet0_index, jet0_drop),
        (jet1_index, jet1_drop),
        (jet2_index, jet2_drop),
        (jet3_index, jet3_drop),
    )
)

In [16]:
# Sanity check: display (rows, columns) of each per-jet feature matrix.
tuple(subset.shape for subset in (x_jet0, x_jet1, x_jet2, x_jet3))

((99913, 18), (77544, 22), (50379, 29), (22164, 29))

In [17]:
# Training labels of each jet subset, row-aligned with x_jet0 .. x_jet3.
y_jet0, y_jet1, y_jet2, y_jet3 = (
    y_train_data[subset_rows]
    for subset_rows in (jet0_index, jet1_index, jet2_index, jet3_index)
)

In [18]:
# NOTE(review): this redefines build_poly, which is already defined with the
# same behaviour near the top of this notebook; the later definition shadows
# the earlier one. Consider keeping only one of them.
def build_poly(x, degree):
    """Build a polynomial feature expansion of x for powers 0..degree.

    Args:
        x: 1-D or 2-D array-like of input features, one sample per row.
        degree: highest power to include; powers 1..degree are applied
            element-wise to x.

    Returns:
        2-D array whose first column is all ones, followed by x**1, x**2,
        ..., x**degree stacked side by side.
    """
    # Collect every block first and stack once, instead of re-allocating the
    # growing matrix on each loop iteration.
    blocks = [np.ones((len(x), 1))]
    blocks.extend(np.power(x, deg) for deg in range(1, degree + 1))
    return np.column_stack(blocks)

In [24]:
# Degree-15 polynomial expansion of the jet-0 training features, followed by
# a per-column standardization of the expanded matrix.
tx_jet0 = build_poly(x_jet0, 15)
tx_jet0_mean, tx_jet0_std = tx_jet0.mean(0), tx_jet0.std(0)
# The leading all-ones column (and any other constant column) has std == 0;
# dividing by it gives 0/0 = NaN plus a RuntimeWarning. Use 1 for those
# columns so they are only centred. These statistics are reused to scale the
# test features later, so the guard carries over to that step as well.
tx_jet0_std[tx_jet0_std == 0] = 1
tx_jet0 -= tx_jet0_mean
tx_jet0 /= tx_jet0_std
# Restore the intercept column, which centring turned into zeros.
tx_jet0[:, 0] = 1

  after removing the cwd from sys.path.


In [25]:
def _standardized_poly(x, degree=15):
    """Expand x with build_poly and standardize the expanded columns.

    Returns:
        Tuple ``(tx, column_mean, column_std)``. Zeros in ``column_std`` are
        replaced by ones, so constant columns (such as the leading all-ones
        column) are only centred instead of producing 0/0 = NaN and a
        RuntimeWarning. Column 0 of ``tx`` is reset to 1 afterwards.
    """
    tx = build_poly(x, degree)
    column_mean, column_std = tx.mean(0), tx.std(0)
    column_std[column_std == 0] = 1
    tx -= column_mean
    tx /= column_std
    tx[:, 0] = 1  # restore the intercept column
    return tx, column_mean, column_std


# Same degree-15 expansion and scaling for the remaining three jet subsets.
# The mean/std of each subset are kept so the test features can be scaled
# with the training statistics later.
tx_jet1, tx_jet1_mean, tx_jet1_std = _standardized_poly(x_jet1)
tx_jet2, tx_jet2_mean, tx_jet2_std = _standardized_poly(x_jet2)
tx_jet3, tx_jet3_mean, tx_jet3_std = _standardized_poly(x_jet3)

  after removing the cwd from sys.path.
  # Remove the CWD from sys.path while we load stuff.
  app.launch_new_instance()


## Model

In [26]:
# Ridge penalty shared by the four per-jet models.
lambda_ = 0.001

# Fit one ridge regression per jet subset. ridge_regression returns a pair;
# only its first element is kept (it is passed to predict_labels below), the
# second goes to the throwaway name `_`.
(w_jet0, _), (w_jet1, _), (w_jet2, _), (w_jet3, _) = (
    ridge_regression(subset_y, subset_tx, lambda_)
    for subset_y, subset_tx in (
        (y_jet0, tx_jet0),
        (y_jet1, tx_jet1),
        (y_jet2, tx_jet2),
        (y_jet3, tx_jet3),
    )
)

In [27]:
# Predict every training sample with the model of its own jet subset and
# write the result back at that sample's original row position.
y_pred = np.zeros_like(y_train_data)
for jet_rows, jet_w, jet_tx in (
    (jet0_index, w_jet0, tx_jet0),
    (jet1_index, w_jet1, tx_jet1),
    (jet2_index, w_jet2, tx_jet2),
    (jet3_index, w_jet3, tx_jet3),
):
    y_pred[jet_rows] = predict_labels(jet_w, jet_tx)

In [28]:
# Accuracy on the training set itself (the same rows the four models were
# fitted on), so this is not a held-out / validation score.
accuracy(y_pred, y_train_data)

0.82316800000000001

## Predict

In [29]:
# Test-set prediction.
#
# Test-specific names are used throughout this cell, and x_test_data itself
# is left untouched, so the training-time variables (jet*_index, x_jet*,
# tx_jet*) are not overwritten and the cell gives the same result when it is
# re-run.

# Rows of the test set for jet value 0, 1, 2, 3, taken from the raw column 22.
test_jet_rows = [np.flatnonzero(x_test_data[:, 22] == jet_value) for jet_value in range(4)]

# Same preprocessing as for the training set, applied to a copy:
# -999 -> NaN in every column except column 0, log1p on the long-tail
# columns, scaling with the TRAINING mean/std, then NaN -> 0.
x_test_proc = x_test_data.copy()
x_test_tail = x_test_proc[:, 1:]  # view: the masking below edits x_test_proc
x_test_tail[x_test_tail == -999] = np.nan
x_test_proc[:, long_tail] = np.log1p(x_test_proc[:, long_tail])
x_test_proc = np.nan_to_num((x_test_proc - mean) / std)

# Per jet subset: columns to drop, fitted weights, and the mean/std of the
# training polynomial features.
test_jet_models = (
    (jet0_drop, w_jet0, tx_jet0_mean, tx_jet0_std),
    (jet1_drop, w_jet1, tx_jet1_mean, tx_jet1_std),
    (jet2_drop, w_jet2, tx_jet2_mean, tx_jet2_std),
    (jet3_drop, w_jet3, tx_jet3_mean, tx_jet3_std),
)

y_pred = np.zeros(len(x_test_data))
for test_rows, (test_drop, test_w, test_tx_mean, test_tx_std) in zip(test_jet_rows, test_jet_models):
    test_tx = build_poly(np.delete(x_test_proc[test_rows, :], test_drop, axis=1), 15)
    # The all-ones column has a training std of 0; divide by 1 there instead
    # of producing 0/0 = NaN and a RuntimeWarning.
    test_tx_scale = np.where(test_tx_std == 0, 1, test_tx_std)
    test_tx = (test_tx - test_tx_mean) / test_tx_scale
    test_tx[:, 0] = 1  # restore the intercept column
    y_pred[test_rows] = predict_labels(test_w, test_tx)

  


In [30]:
# Hand the test-set ids and their predictions to the project helper
# create_csv_submission, under the output name given below.
# NOTE(review): presumably this writes a CSV submission file — confirm in
# proj1_helpers.
create_csv_submission(id_test_data, y_pred, "OCT18_norm_after_poly")