In [1]:
# Destination file for the tab-separated test-set predictions written by this notebook.
storage_name = "./prediction_06062021.txt"

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tempfile

import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import metrics
from tensorflow.keras.layers import Dense

In [3]:
class ArielChallengeMetricNumerator(metrics.Metric):
    """Streaming Keras metric built from the Ariel challenge score expression.

    On every batch the metric computes
    ``mean(sample_weight * y_true * |y_pred - y_true|)`` and adds it to a single
    scalar state variable, so the reported value is the running sum of the
    per-batch means seen since the last reset.

    The expression is adapted from
    https://github.com/ucl-exoplanets/ML-challenge-baseline/blob/main/utils.py
    """

    def __init__(self, name="ariel_challenge_metric_num", **kwargs):
        super().__init__(name=name, **kwargs)
        # Scalar accumulator holding the running sum of per-batch terms.
        self.weight = self.add_weight(name="ctp", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold one batch into the running total.

        :param y_true: The true label output; cast to float32.
        :param y_pred: The predicted output; cast to float32.
        :param sample_weight: Optional weights, cast to float32. When omitted,
            a tensor of ones shaped like ``y_true`` is used instead.
        """
        truth = tf.cast(y_true, "float32")
        prediction = tf.cast(y_pred, "float32")

        if sample_weight is None:
            weights = tf.ones_like(truth, dtype="float32")
        else:
            weights = tf.cast(sample_weight, "float32")

        absolute_error = tf.math.abs(prediction - truth)
        batch_term = tf.reduce_mean(weights * truth * absolute_error)

        self.weight.assign_add(batch_term)

    def result(self):
        """Return the accumulated scalar state."""
        return self.weight

    def reset_state(self):
        """Zero the accumulator (the state is reset at the start of each epoch)."""
        self.weight.assign(0.0)

In [5]:
# Mini-batch size used when batching each table's training dataset.
batch_size = 157  # 1256 is divisible by 157 (8 batches, matching the 8 steps per epoch in the log).
# Batch size passed to model.predict for each table's test set.
test_batch_size = 77  # 539 is divisible by 77.

In [7]:
# Train one small dense regressor per table and collect its test-set
# predictions as one column of `required_test_df`. The partial result is
# written to `storage_name` after every table, so an interrupted run still
# leaves the columns finished so far on disk.
N_TABLES = 55          # flatten_train_0.csv ... flatten_train_54.csv
TARGET_COLUMN = 30000  # final column of each training CSV is the target/label
N_EPOCHS = 40

required_test_df = pd.DataFrame()
prediction_columns = []  # one single-column DataFrame of predictions per table

for table_number in range(N_TABLES):
    df = pd.read_csv(f"./csv_files/flatten_train_{table_number}.csv", header=None)

    target = df.pop(TARGET_COLUMN)

    # Normalisation statistics are taken from the raw training features only
    # and reused unchanged for the matching test table below.
    train_mean = df.mean()
    # An all-zero column has a maximum absolute value of 0; dividing by it
    # would turn the whole feature into NaN, so such columns are scaled by 1.
    mod = df.abs().max().replace(0, 1)

    df -= train_mean
    df /= mod

    dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))

    train_dataset = dataset.cache().shuffle(len(df)).batch(batch_size)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Drop the previous table's graph state and model before building a new one.
    tf.keras.backend.clear_session()
    try:
        del model
    except NameError:
        # First iteration: no model exists yet.
        pass

    model = tf.keras.Sequential(
        [
            # Input width follows the number of feature columns in this table.
            Dense(4, input_shape=(len(df.columns),), activation=tf.nn.relu),
            Dense(1024, activation=tf.nn.relu),
            Dense(256, activation=tf.nn.relu),
            Dense(1, activation=tf.nn.sigmoid),
        ]
    )

    model.compile(
        optimizer="adam",
        loss="mean_squared_error",
        metrics=[ArielChallengeMetricNumerator()],
    )

    # `train_dataset` is already batched, so `batch_size` must not be passed
    # to fit(); Keras takes the batching from the dataset itself.
    model.fit(train_dataset, epochs=N_EPOCHS, verbose=2)

    df_test = pd.read_csv(f"./csv_files_test/flatten_test_{table_number}.csv", header=None)

    df_test -= train_mean
    df_test /= mod

    predictions = model.predict(df_test, batch_size=test_batch_size)

    # Rebuild the result from the list of columns instead of repeatedly
    # concatenating onto a growing (and initially empty) DataFrame.
    prediction_columns.append(pd.DataFrame(predictions))
    required_test_df = pd.concat(prediction_columns, axis=1)
    required_test_df.to_csv(storage_name, header=False, index=False, sep="\t")

    print(f"\nTable {table_number} completed.\n")

    # Release the per-table frames before loading the next table.
    del df, df_test, target, predictions

Epoch 1/40
8/8 - 1s - loss: 0.1622 - ariel_challenge_metric_num: 0.1467
Epoch 2/40
8/8 - 0s - loss: 0.0324 - ariel_challenge_metric_num: 0.0601
Epoch 3/40
8/8 - 0s - loss: 0.0030 - ariel_challenge_metric_num: 0.0266
Epoch 4/40
8/8 - 0s - loss: 0.0036 - ariel_challenge_metric_num: 0.0303
Epoch 5/40
8/8 - 0s - loss: 0.0038 - ariel_challenge_metric_num: 0.0313
Epoch 6/40
8/8 - 0s - loss: 0.0039 - ariel_challenge_metric_num: 0.0316
Epoch 7/40
8/8 - 0s - loss: 0.0039 - ariel_challenge_metric_num: 0.0316
Epoch 8/40
8/8 - 0s - loss: 0.0039 - ariel_challenge_metric_num: 0.0316
Epoch 9/40
8/8 - 0s - loss: 0.0039 - ariel_challenge_metric_num: 0.0316
Epoch 10/40
8/8 - 0s - loss: 0.0038 - ariel_challenge_metric_num: 0.0315
Epoch 11/40
8/8 - 0s - loss: 0.0038 - ariel_challenge_metric_num: 0.0314
Epoch 12/40
8/8 - 0s - loss: 0.0038 - ariel_challenge_metric_num: 0.0312
Epoch 13/40
8/8 - 0s - loss: 0.0038 - ariel_challenge_metric_num: 0.0311
Epoch 14/40
8/8 - 0s - loss: 0.0037 - ariel_challenge_metric

It seems that training for the original number of epochs (30) had not yet brought the loss down to its minimum, so the number of epochs was increased to 40. This could be tuned further if required. However, for fear of overfitting, we will stop there and see how the model performs.

In [8]:
# Write the accumulated predictions to disk as tab-separated values, without header or index.
required_test_df.to_csv(storage_name, header=False, index=False, sep="\t")