In [1]:
# Imports
import numpy as np
import pandas as pd
import os
from utilities import *
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import gc
import itertools
%matplotlib inline

gc.enable()

# Per-observation light-curve table. Two derived columns are added before
# aggregating: the squared flux / flux_err ratio, and flux weighted by it.
train = pd.read_csv('training_set.csv')
train['flux_ratio_sq'] = np.power(train['flux'] / train['flux_err'], 2.0)
train['flux_by_flux_ratio_sq'] = train['flux'] * train['flux_ratio_sq']

# Keep only the rows flagged as detected. A boolean mask drops the other rows
# outright; `DataFrame.where` would keep every row and blank the non-detected
# ones to NaN, which wastes memory, turns `object_id` into a float and only
# works because groupby discards NaN keys.
train_det = train[train['detected'] == 1]

# Statistics computed per object over all observations.
aggs = {
    'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
    'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
    'detected': ['mean'],
    'flux_ratio_sq': ['sum', 'skew'],
    'flux_by_flux_ratio_sq': ['sum', 'skew'],
}

# Statistics computed per object over detected observations only.
aggs_det = {
    'mjd': ['min', 'max', 'size'],
    'passband': ['mean', 'std', 'var'],
}

agg_train = train.groupby('object_id').agg(aggs)
agg_train_det = train_det.groupby('object_id').agg(aggs_det)

# Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names,
# in the same order in which the dictionaries above list them.
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
new_columns_det = [
    k + '_' + agg for k in aggs_det.keys() for agg in aggs_det[k]
]

agg_train.columns = new_columns
flux_range = agg_train['flux_max'] - agg_train['flux_min']
agg_train['flux_diff'] = flux_range
agg_train['flux_dif2'] = flux_range / agg_train['flux_mean']
agg_train['flux_w_mean'] = agg_train['flux_by_flux_ratio_sq_sum'] / agg_train['flux_ratio_sq_sum']
agg_train['flux_dif3'] = flux_range / agg_train['flux_w_mean']

agg_train_det.columns = new_columns_det
agg_train_det['mjd_diff'] = agg_train_det['mjd_max'] - agg_train_det['mjd_min']

# Suffix the detected-only features so they cannot collide with the others,
# then drop the raw min/max dates (only their difference is kept).
agg_train_det.columns = agg_train_det.columns + "_det"
agg_train_det = agg_train_det.drop(columns=['mjd_max_det', 'mjd_min_det'])

# NOTE(review): the inner join silently drops any object that has no detected
# observation. The result is later concatenated row-by-row with the metadata
# features, so this presumably relies on every object having at least one
# detection and on both tables being ordered by object_id -- confirm.
agg_train = pd.concat([agg_train, agg_train_det], axis=1, join='inner')

del train, train_det, agg_train_det
gc.collect()

# From here on `agg_train` is a plain 2-D NumPy array (one row per object).
agg_train = agg_train.values

In [2]:
# Locations of the pre-computed light-curve features and of the raw data set.
# NOTE(review): these are machine-specific absolute paths; change these two
# values when running on another machine.
RESULTS_DIR = 'F:\\Documents\\Kaggle\\Results2'
DATA_PATH = "F:\\Documents\\Kaggle\\PLAsTiCC"

# Metadata features: the columns at positions 7-10 of the metadata table, with
# missing 'distmod' values replaced by 0. The fill is done through `assign` on
# the selected frame; an in-place `fillna` on `X_feats['distmod']` writes to a
# temporary Series and is not guaranteed to reach the frame.
X_feats = pd.read_csv('training_set_metadata_head.csv')
X_feats = X_feats.iloc[:, [7, 8, 9, 10]]
X_feats = X_feats.assign(distmod=X_feats['distmod'].fillna(0))
X_feats = X_feats.values

# Pre-computed per-object light-curve features, stored without a header row.
X_period = pd.read_csv(os.path.join(RESULTS_DIR, 'lc_period.csv'), header=None).values
X_power = pd.read_csv(os.path.join(RESULTS_DIR, 'lc_power.csv'), header=None).values
X_range = pd.read_csv(os.path.join(RESULTS_DIR, 'lc_range.csv'), header=None).values
X_skew = pd.read_csv(os.path.join(RESULTS_DIR, 'lc_skewness.csv'), header=None).values

# Column-wise stack of every tabular feature group.
# NOTE(review): assumes all six arrays have one row per object in the same
# object order -- nothing here checks the ordering.
X_feats = np.concatenate((X_feats, X_period, X_power, X_range, X_skew, agg_train), axis = 1)

# Sequence inputs and labels, loaded by helpers from `utilities`.
X_all, labels_all = read_data_all2(data_path=DATA_PATH)
X_all_p, labels_all_p = read_data_all2_p(data_path=DATA_PATH)

  X[:,:,i_ch] = dat_.as_matrix()
100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:31<00:00,  5.21s/it]
  X[:,:,i_ch] = dat_.as_matrix()
100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:14<00:00,  2.40s/it]


In [3]:
# Standardize the three model inputs with helpers from `utilities`.
# NOTE(review): this runs on the full data set, before the train/test split in
# the next cell. If the helpers derive their statistics from the array they are
# given, the held-out samples influence the scaling -- confirm.
# Only positions 1..1998 of the second axis are standardized; positions 0 and
# 1999 are left as loaded. NOTE(review): confirm this is intentional.
X_all[:,1:1999,:] = standardize_full(X_all[:,1:1999,:])
X_all_p = standardize_full(X_all_p)
X_feats = standardize_feats_full(X_feats)

In [4]:
# Hold out 20% of the objects as the test set. All three inputs and the labels
# go through ONE call, so they are split with the same indices by construction
# (and their lengths are checked against each other) instead of relying on
# three separate calls sharing a seed.
(X_train, X_test,
 X_train_feats, X_test_feats,
 X_train_p, X_test_p,
 labels_train, labels_test) = train_test_split(
    X_all, X_feats, X_all_p, labels_all,
    stratify=labels_all, random_state=111, test_size=0.2)

# Split the remaining 80% again into training and validation parts (80/20).
(X_tr, X_vld,
 X_tr_f, X_vld_f,
 X_tr_p, X_vld_p,
 lab_tr, lab_vld) = train_test_split(
    X_train, X_train_feats, X_train_p, labels_train,
    stratify=labels_train, random_state=222, test_size=0.2)

# The per-input label names are kept for backward compatibility; the labels
# are the same for every input.
lab_tr_f = lab_tr_p = lab_tr
lab_vld_f = lab_vld_p = lab_vld

X_tr_f

array([[-0.65604672, -0.51838063, -1.53349927, ...,  1.43768042,
         1.60602404,  1.3282128 ],
       [-0.65604672, -0.51838063, -1.53349927, ...,  0.17531013,
        -0.01337359, -0.62363898],
       [-0.65604672, -0.51838063, -1.53349927, ..., -0.02197492,
        -0.21165453,  1.61488196],
       ...,
       [ 4.07982946,  3.38275786,  0.92914195, ...,  0.67226942,
         0.55175057, -0.66359923],
       [ 0.30872651, -0.03760535,  0.7052746 , ...,  0.37909655,
         0.20699545, -0.71820108],
       [-0.65604672, -0.51838063, -1.53349927, ...,  1.00470792,
         0.98225836,  1.73802519]])

In [5]:
# Encode the label vectors of the three splits with `one_hot` (imported from
# `utilities`). The results are fed to the `labels_` placeholder later on.
# NOTE(review): presumably yields one column per class (n_classes = 14) --
# confirm against the helper.
y_tr = one_hot(lab_tr)
y_vld = one_hot(lab_vld)
y_test = one_hot(labels_test)

# Displays the training feature matrix (same array as shown by the split cell).
X_tr_f

array([[-0.65604672, -0.51838063, -1.53349927, ...,  1.43768042,
         1.60602404,  1.3282128 ],
       [-0.65604672, -0.51838063, -1.53349927, ...,  0.17531013,
        -0.01337359, -0.62363898],
       [-0.65604672, -0.51838063, -1.53349927, ..., -0.02197492,
        -0.21165453,  1.61488196],
       ...,
       [ 4.07982946,  3.38275786,  0.92914195, ...,  0.67226942,
         0.55175057, -0.66359923],
       [ 0.30872651, -0.03760535,  0.7052746 , ...,  0.37909655,
         0.20699545, -0.71820108],
       [-0.65604672, -0.51838063, -1.53349927, ...,  1.00470792,
         0.98225836,  1.73802519]])

In [6]:
# Imports
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [7]:
# Training hyper-parameters and input dimensions used by the cells below.
batch_size = 100       # Samples per training / validation batch
seq_len = 2000          # Length of the main sequence input (second axis of `inputs_`)
learning_rate = 0.00002  # Fed to the Adam optimizer through `learning_rate_`
# NOTE(review): the saved training output shows "/600", so it was produced with
# a different value than the one set here.
epochs = 1000

n_classes = 14  # Width of the label placeholder and of the logits layer
n_channels = 6  # Size of the last axis of both sequence inputs

In [8]:
graph = tf.Graph()

# Construct placeholders
with graph.as_default():
    # Main sequence input: (batch, seq_len, n_channels).
    inputs_ = tf.placeholder(tf.float32, [None, seq_len, n_channels], name = 'inputs')
    # Tabular feature input: 34 columns per object.
    # NOTE(review): 34 must equal the number of columns of X_feats.
    inputs2_ = tf.placeholder(tf.float32, [None, 34], name = 'inputs2')
    # Second sequence input with half the length. Integer division keeps the
    # dimension an int (`seq_len/2` is a float in Python 3, which is not a valid
    # static dimension), and the placeholder gets its own name instead of
    # reusing 'inputs'.
    inputs3_ = tf.placeholder(tf.float32, [None, seq_len // 2, n_channels], name = 'inputs3')
    # One-hot targets.
    labels_ = tf.placeholder(tf.float32, [None, n_classes], name = 'labels')
    # Dropout keep probability (0.5 while training, 1.0 for evaluation).
    keep_prob_ = tf.placeholder(tf.float32, name = 'keep')
    learning_rate_ = tf.placeholder(tf.float32, name = 'learning_rate')

In [9]:
# Convolutional branch for the main sequence input `inputs_`.
# Every conv and pooling layer uses stride 2 with 'same' padding, so each one
# halves the sequence length (rounding up).
with graph.as_default():
    # (batch, 2000, 6) --> conv (batch, 1000, 32) --> pool (batch, 500, 32)
    conv1 = tf.layers.conv1d(inputs=inputs_, filters=32, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=20, strides=2, padding='same')
    
    # (batch, 500, 32) --> conv (batch, 250, 32) --> pool (batch, 125, 32)
    conv2 = tf.layers.conv1d(inputs=max_pool_1, filters=32, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_2 = tf.layers.max_pooling1d(inputs=conv2, pool_size=16, strides=2, padding='same')
    
    # (batch, 125, 32) --> conv (batch, 63, 64) --> pool (batch, 32, 64)
    conv3 = tf.layers.conv1d(inputs=max_pool_2, filters=64, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_3 = tf.layers.max_pooling1d(inputs=conv3, pool_size=12, strides=2, padding='same')
    
    # (batch, 32, 64) --> conv (batch, 16, 64) --> pool (batch, 8, 64)
    conv4 = tf.layers.conv1d(inputs=max_pool_3, filters=64, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_4 = tf.layers.max_pooling1d(inputs=conv4, pool_size=8, strides=2, padding='same')
    
# Convolutional branch for the half-length sequence input `inputs3_`: same
# layout as above, with pooling windows half as wide.
with graph.as_default():
    # (batch, 1000, 6) --> conv (batch, 500, 32) --> pool (batch, 250, 32)
    conv1_p = tf.layers.conv1d(inputs=inputs3_, filters=32, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_1_p = tf.layers.max_pooling1d(inputs=conv1_p, pool_size=10, strides=2, padding='same')
    
    # (batch, 250, 32) --> conv (batch, 125, 32) --> pool (batch, 63, 32)
    conv2_p = tf.layers.conv1d(inputs=max_pool_1_p, filters=32, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_2_p = tf.layers.max_pooling1d(inputs=conv2_p, pool_size=8, strides=2, padding='same')
    
    # (batch, 63, 32) --> conv (batch, 32, 64) --> pool (batch, 16, 64)
    conv3_p = tf.layers.conv1d(inputs=max_pool_2_p, filters=64, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_3_p = tf.layers.max_pooling1d(inputs=conv3_p, pool_size=6, strides=2, padding='same')
    
    # (batch, 16, 64) --> conv (batch, 8, 64) --> pool (batch, 4, 64)
    conv4_p = tf.layers.conv1d(inputs=max_pool_3_p, filters=64, kernel_size=10, strides=2, 
                             padding='same', activation = tf.nn.relu)
    max_pool_4_p = tf.layers.max_pooling1d(inputs=conv4_p, pool_size=4, strides=2, padding='same')
    
# Sanity check of the final static shapes: [None, 8, 64] and [None, 4, 64].
print(max_pool_4.get_shape().as_list())
print(max_pool_4_p.get_shape().as_list())

[None, 8, 64]
[None, 4, 64]


In [10]:
with graph.as_default():
    # Fully connected branch for the tabular features `inputs2_`:
    # 512 -> 256 -> 128 -> 64 tanh units, with dropout after every layer.
    nnfeats = tf.layers.dense(inputs=inputs2_, units=512, activation=tf.nn.tanh)
    nnfeats2 = tf.nn.dropout(nnfeats, keep_prob=keep_prob_)
    nnfeats3 = tf.layers.dense(inputs=nnfeats2, units=256, activation=tf.nn.tanh)
    nnfeats4 = tf.nn.dropout(nnfeats3, keep_prob=keep_prob_)
    nnfeats5 = tf.layers.dense(inputs=nnfeats4, units=128, activation=tf.nn.tanh)
    nnfeats6 = tf.nn.dropout(nnfeats5, keep_prob=keep_prob_)
    nnfeats7 = tf.layers.dense(inputs=nnfeats6, units=64, activation=tf.nn.tanh)
    nnfeats8 = tf.nn.dropout(nnfeats7, keep_prob=keep_prob_)
    
    # Merge the three branches: both conv outputs are flattened (8*64 and 4*64
    # values) and concatenated with the 64 tabular activations -> 832 columns.
    # The reshape sizes are hard-coded and must match the conv output shapes.
    # NOTE(review): `nnfeats8` has already been through dropout and is dropped
    # out again as part of `flat` -- confirm this is intended.
    flat = tf.concat([tf.reshape(max_pool_4, (-1, 8*64)), nnfeats8, tf.reshape(max_pool_4_p, (-1, 4*64))], 1)
    flat2 = tf.nn.dropout(flat, keep_prob=keep_prob_)
    flat3 = tf.layers.dense(inputs=flat2, units=512, activation=tf.nn.tanh)
    flat4 = tf.nn.dropout(flat3, keep_prob=keep_prob_)
    
    # Predictions: one unnormalized score per class.
    logits = tf.layers.dense(flat4, n_classes)
    
    # Cost function and optimizer: mean softmax cross-entropy over the batch,
    # minimized with Adam at the learning rate fed through `learning_rate_`.
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels_))
    optimizer = tf.train.AdamOptimizer(learning_rate_).minimize(cost)
    
    # Accuracy: fraction of samples whose highest-scoring class matches the
    # position of the maximum in the one-hot label.
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')
    
# Static shape of the merged feature vector: [None, 832].
print(flat.get_shape().as_list())

[None, 832]


In [11]:
# Create the checkpoint directory if it does not exist yet. Done in Python
# rather than through a shell escape so the cell behaves the same on every
# platform and is safe to re-run.
os.makedirs('checkpoints-cnn', exist_ok=True)

In [12]:
# Per-validation-pass averages (one entry every 10 iterations).
validation_acc = []
validation_loss = []

# Per-iteration training metrics (one entry per batch).
train_acc = []
train_loss = []

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1  # 1-based count of training batches processed

    try:
        # Loop over epochs
        for e in range(epochs):

            # Loop over batches
            for x, y, x_f, x_p in get_batches(X_tr, y_tr, X_tr_f, X_tr_p, batch_size):

                # Feed dictionary (dropout active while training)
                feed = {inputs_ : x, labels_ : y, inputs2_ : x_f, inputs3_ : x_p,
                        keep_prob_ : 0.5, learning_rate_ : learning_rate}

                # One optimization step; loss and accuracy are for this batch
                loss, _ , acc = sess.run([cost, optimizer, accuracy], feed_dict = feed)
                train_acc.append(acc)
                train_loss.append(loss)

                # Print at each 5 iters
                if (iteration % 5 == 0):
                    print("Epoch: {}/{}".format(e, epochs),
                          "Iteration: {:d}".format(iteration),
                          "Train loss: {:6f}".format(loss),
                          "Train acc: {:.6f}".format(acc))

                # Compute validation loss at every 10 iterations
                if (iteration % 10 == 0):
                    val_acc_ = []
                    val_loss_ = []

                    for x_v, y_v, x_v_f, x_v_p in get_batches(X_vld, y_vld, X_vld_f, X_vld_p, batch_size):
                        # Feed (dropout disabled; no learning rate needed
                        # because the optimizer is not run)
                        feed = {inputs_ : x_v, inputs2_ : x_v_f, inputs3_ : x_v_p,
                                labels_ : y_v, keep_prob_ : 1.0}

                        # Loss
                        loss_v, acc_v = sess.run([cost, accuracy], feed_dict = feed)
                        val_acc_.append(acc_v)
                        val_loss_.append(loss_v)

                    # Print info
                    print("Epoch: {}/{}".format(e, epochs),
                          "Iteration: {:d}".format(iteration),
                          "Validation loss: {:6f}".format(np.mean(val_loss_)),
                          "Validation acc: {:.6f}".format(np.mean(val_acc_)))

                    # Store
                    validation_acc.append(np.mean(val_acc_))
                    validation_loss.append(np.mean(val_loss_))

                # Iterate
                iteration += 1
    finally:
        # Save the weights whether training finished or was stopped early
        # (e.g. by interrupting the kernel), so the evaluation cell always has
        # a checkpoint of this run to restore. The exception, if any, still
        # propagates after the save.
        saver.save(sess,"checkpoints-cnn/har.ckpt")

Epoch: 0/600 Iteration: 5 Train loss: 2.775682 Train acc: 0.100000
Epoch: 0/600 Iteration: 10 Train loss: 2.766908 Train acc: 0.110000
Epoch: 0/600 Iteration: 10 Validation loss: 2.604243 Validation acc: 0.070833
Epoch: 0/600 Iteration: 15 Train loss: 2.777820 Train acc: 0.110000
Epoch: 0/600 Iteration: 20 Train loss: 2.659730 Train acc: 0.140000
Epoch: 0/600 Iteration: 20 Validation loss: 2.573689 Validation acc: 0.088333
Epoch: 0/600 Iteration: 25 Train loss: 2.723391 Train acc: 0.080000
Epoch: 0/600 Iteration: 30 Train loss: 2.765467 Train acc: 0.070000
Epoch: 0/600 Iteration: 30 Validation loss: 2.545670 Validation acc: 0.115833
Epoch: 0/600 Iteration: 35 Train loss: 2.665018 Train acc: 0.100000
Epoch: 0/600 Iteration: 40 Train loss: 2.798372 Train acc: 0.130000
Epoch: 0/600 Iteration: 40 Validation loss: 2.519348 Validation acc: 0.162500
Epoch: 0/600 Iteration: 45 Train loss: 2.684702 Train acc: 0.100000
Epoch: 0/600 Iteration: 50 Train loss: 2.760364 Train acc: 0.090000
Epoch: 0/

Epoch: 7/600 Iteration: 380 Validation loss: 1.709171 Validation acc: 0.543333
Epoch: 7/600 Iteration: 385 Train loss: 2.065768 Train acc: 0.360000
Epoch: 7/600 Iteration: 390 Train loss: 2.214712 Train acc: 0.360000
Epoch: 7/600 Iteration: 390 Validation loss: 1.687847 Validation acc: 0.543333
Epoch: 7/600 Iteration: 395 Train loss: 1.991967 Train acc: 0.430000
Epoch: 7/600 Iteration: 400 Train loss: 2.063411 Train acc: 0.420000
Epoch: 7/600 Iteration: 400 Validation loss: 1.668612 Validation acc: 0.542500
Epoch: 8/600 Iteration: 405 Train loss: 2.004390 Train acc: 0.420000
Epoch: 8/600 Iteration: 410 Train loss: 2.156096 Train acc: 0.390000
Epoch: 8/600 Iteration: 410 Validation loss: 1.651346 Validation acc: 0.541667
Epoch: 8/600 Iteration: 415 Train loss: 1.876970 Train acc: 0.470000
Epoch: 8/600 Iteration: 420 Train loss: 2.075485 Train acc: 0.310000
Epoch: 8/600 Iteration: 420 Validation loss: 1.634457 Validation acc: 0.540833
Epoch: 8/600 Iteration: 425 Train loss: 2.009559 Trai

Epoch: 15/600 Iteration: 755 Train loss: 1.703597 Train acc: 0.450000
Epoch: 15/600 Iteration: 760 Train loss: 1.822989 Train acc: 0.420000
Epoch: 15/600 Iteration: 760 Validation loss: 1.369974 Validation acc: 0.550000
Epoch: 15/600 Iteration: 765 Train loss: 1.619920 Train acc: 0.490000
Epoch: 15/600 Iteration: 770 Train loss: 1.645536 Train acc: 0.450000
Epoch: 15/600 Iteration: 770 Validation loss: 1.365642 Validation acc: 0.550000
Epoch: 15/600 Iteration: 775 Train loss: 1.676790 Train acc: 0.500000
Epoch: 15/600 Iteration: 780 Train loss: 1.891579 Train acc: 0.380000
Epoch: 15/600 Iteration: 780 Validation loss: 1.361409 Validation acc: 0.550000
Epoch: 15/600 Iteration: 785 Train loss: 1.601577 Train acc: 0.540000
Epoch: 15/600 Iteration: 790 Train loss: 1.650536 Train acc: 0.490000
Epoch: 15/600 Iteration: 790 Validation loss: 1.356931 Validation acc: 0.551667
Epoch: 15/600 Iteration: 795 Train loss: 1.644554 Train acc: 0.500000
Epoch: 15/600 Iteration: 800 Train loss: 1.438960 

Epoch: 22/600 Iteration: 1125 Train loss: 1.472625 Train acc: 0.560000
Epoch: 22/600 Iteration: 1130 Train loss: 1.629639 Train acc: 0.480000
Epoch: 22/600 Iteration: 1130 Validation loss: 1.258265 Validation acc: 0.579167
Epoch: 22/600 Iteration: 1135 Train loss: 1.495597 Train acc: 0.550000
Epoch: 22/600 Iteration: 1140 Train loss: 1.461803 Train acc: 0.540000
Epoch: 22/600 Iteration: 1140 Validation loss: 1.256047 Validation acc: 0.580000
Epoch: 22/600 Iteration: 1145 Train loss: 1.541895 Train acc: 0.490000
Epoch: 22/600 Iteration: 1150 Train loss: 1.481341 Train acc: 0.490000
Epoch: 22/600 Iteration: 1150 Validation loss: 1.253653 Validation acc: 0.581667
Epoch: 23/600 Iteration: 1155 Train loss: 1.452740 Train acc: 0.510000
Epoch: 23/600 Iteration: 1160 Train loss: 1.683107 Train acc: 0.460000
Epoch: 23/600 Iteration: 1160 Validation loss: 1.250184 Validation acc: 0.581667
Epoch: 23/600 Iteration: 1165 Train loss: 1.401970 Train acc: 0.570000
Epoch: 23/600 Iteration: 1170 Train l

KeyboardInterrupt: 

In [None]:
# Plot training and validation loss
# Iterations are counted from 1 and validation is recorded at iterations
# 10, 20, 30, ... Both x-axes are derived from the lengths of the recorded
# lists, so the plot also works when training stopped at an iteration that is
# not a multiple of 10 (a 0-based axis filtered with `t % 10 == 0` has one
# point too many in that case and places validation points one step early).
t = np.arange(1, len(train_loss) + 1)
t_val = 10 * np.arange(1, len(validation_loss) + 1)

plt.figure(figsize = (6,6))
plt.plot(t, np.array(train_loss), 'r-', t_val, np.array(validation_loss), 'b*')
plt.xlabel("iteration")
plt.ylabel("Loss")
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Plot Accuracies
# The x-axes are built here from the recorded lists (1-based iterations,
# validation every 10th iteration), so this cell does not depend on a variable
# left behind by the loss-plot cell and the point counts always match.
t_acc = np.arange(1, len(train_acc) + 1)
t_acc_val = 10 * np.arange(1, len(validation_acc) + 1)

plt.figure(figsize = (6,6))

plt.plot(t_acc, np.array(train_acc), 'r-', t_acc_val, np.array(validation_acc), 'b*')
plt.xlabel("iteration")
plt.ylabel("Accuracy")
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
test_acc = []
test_loss = []
test_pred = []

# Build the prediction op once. Creating it inside the batch loop would add a
# new node to the graph for every batch.
with graph.as_default():
    predict = tf.argmax(logits, 1)

with tf.Session(graph=graph) as sess:
    # Restore the most recent checkpoint written by the training cell
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints-cnn'))

    # NOTE(review): the batch size of 157 is presumably chosen so that the test
    # set splits into whole batches; confirm how `get_batches` treats a
    # remainder, since `oof_preds` is later compared against all of labels_test.
    for x_t, y_t, x_t_f, x_t_p in get_batches(X_test, y_test, X_test_feats, X_test_p, 157):
        feed = {inputs_: x_t,
                labels_: y_t,
                inputs2_: x_t_f,
                inputs3_: x_t_p,
                keep_prob_: 1}

        # Loss, accuracy and predicted class indices from a single forward pass
        batch_loss, batch_acc, best = sess.run([cost, accuracy, predict], feed_dict=feed)
        test_acc.append(batch_acc)
        test_loss.append(batch_loss)
        test_pred.append(best)

    print("Test accuracy: {:.6f}".format(np.mean(test_acc)))
    print("Test loss: {:.6f}".format(np.mean(test_loss)))

    # 0-based predicted class index for every evaluated test sample
    oof_preds = np.concatenate(test_pred, axis=0)

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix, true labels along the rows and predicted labels
        along the columns.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool
        If True, every row is divided by its sum so that it shows fractions of
        the true class. Rows that sum to zero (a class without any true
        sample) are left as zeros instead of producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None. The plot is drawn with pyplot; the caller creates and shows the figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide empty rows by 1 rather than 0: they stay all-zero and no NaN
        # reaches the colour threshold computed below.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each value into its cell, in white on dark cells and black on
    # light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
print(np.asarray(labels_test))

# Predictions are 0-based argmax indices; shifting by one puts them on the
# same scale as the labels they are compared with below.
# NOTE(review): assumes labels_test holds class numbers starting at 1 -- confirm.
print(oof_preds+1)

cnf_matrix = confusion_matrix(np.asarray(labels_test), oof_preds+1)

# Only the header of the sample submission is needed for the class names, so
# no data rows are read. The first and the last column are excluded.
sample_sub = pd.read_csv('sample_submission.csv', nrows=0)
class_names = list(sample_sub.columns[1:-1])
del sample_sub;gc.collect()

plt.figure(figsize=(12,12))
# The function draws on the current figure and returns nothing.
plot_confusion_matrix(cnf_matrix, classes=class_names,normalize=True,
                      title='Confusion matrix')