In [None]:
# Install the extra packages this notebook needs (nothing is imported in this cell).
# NOTE(review): the installs are unpinned, so a re-run may pick up a keras-tuner
# release whose internals differ from the `keras_tuner.engine` modules used further
# down — consider pinning versions.
!pip install keras-tuner
# NOTE(review): the import cell uses the module name `kaggle_datasets`; confirm that
# the `kaggledatasets` distribution is really what provides it in this environment.
!pip install kaggledatasets
# !pip install tensorflow_datasets

In [None]:
import os
import seaborn as sns
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.layers.experimental import preprocessing
from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient
from tensorflow.keras import layers


In [None]:
import collections
import copy
from keras_tuner.engine import tuner_utils
import numpy as np
import keras_tuner as kt

# Reimplement bayesianOptimization to run with tpu
class TpuBayesianOptimizationOracle(kt.oracles.BayesianOptimizationOracle):
    """Bayesian-optimization oracle whose ``_save_trial`` hook does nothing.

    Everything else is inherited unchanged from
    ``kt.oracles.BayesianOptimizationOracle``.
    """

    def _save_trial(self, trial):
        """Intentionally skip persisting ``trial``.

        NOTE(review): the inherited hook is expected to write the trial status
        (a ``trial.json`` in the trial directory); it is presumably disabled
        here because that write is a problem when tuning on TPU — confirm.

        @param trial: the trial that would have been saved; it is not used.
        @return: None
        """

class TpuBayesianOptimization(kt.engine.multi_execution_tuner.MultiExecutionTuner):
    """Multi-execution tuner that is driven by ``TpuBayesianOptimizationOracle``.

    ``run_trial`` trains the hypermodel ``executions_per_trial`` times, keeps
    the best per-epoch value of every metric for each execution, averages those
    values across executions and reports the averages to the oracle. No model
    checkpoint callback is attached during a trial.
    """

    def __init__(
        self,
        hypermodel,
        objective,
        max_trials,
        num_initial_points=2,
        alpha=1e-4,
        beta=2.6,
        seed=None,
        hyperparameters=None,
        tune_new_entries=True,
        allow_new_entries=True,
        **kwargs
    ):
        """Build the oracle from the search settings and initialise the tuner.

        ``objective`` through ``allow_new_entries`` are forwarded unchanged to
        ``TpuBayesianOptimizationOracle``; ``hypermodel`` and any remaining
        keyword arguments go to the parent tuner's constructor.
        """
        oracle_settings = dict(
            objective=objective,
            max_trials=max_trials,
            num_initial_points=num_initial_points,
            alpha=alpha,
            beta=beta,
            seed=seed,
            hyperparameters=hyperparameters,
            tune_new_entries=tune_new_entries,
            allow_new_entries=allow_new_entries,
        )
        super().__init__(
            oracle=TpuBayesianOptimizationOracle(**oracle_settings),
            hypermodel=hypermodel,
            **kwargs
        )

    def run_trial(self, trial, *fit_args, **fit_kwargs):
        """Run every execution of ``trial`` and report averaged metrics.

        @param trial: the trial being evaluated.
        @param fit_args: positional arguments passed on to the fit call.
        @param fit_kwargs: keyword arguments passed on to the fit call; a
            ``callbacks`` entry is removed from it and deep-copied per execution.
        @return: None; results are sent through ``self.oracle.update_trial``.
        """
        base_callbacks = fit_kwargs.pop("callbacks", [])
        best_per_execution = collections.defaultdict(list)

        for execution_index in range(self.executions_per_trial):
            # Each execution gets its own copy of the callbacks and fit kwargs.
            run_callbacks = self._deepcopy_callbacks(base_callbacks)
            self._configure_tensorboard_dir(run_callbacks, trial, execution_index)
            run_callbacks.append(tuner_utils.TunerCallback(self, trial))

            run_kwargs = copy.copy(fit_kwargs)
            run_kwargs["callbacks"] = run_callbacks

            history = self._build_and_fit_model(trial, fit_args, run_kwargs)

            # The objective's direction decides what "best" means for *every*
            # recorded metric, not only for the objective itself.
            # NOTE(review): when maximising (e.g. val_auc) this also keeps the
            # maximum of loss-like metrics — confirm that is acceptable.
            pick_best = np.min if self.oracle.objective.direction == "min" else np.max
            for metric_name, epoch_values in history.history.items():
                best_per_execution[metric_name].append(pick_best(epoch_values))

        # Mean of the per-execution best values is what the oracle receives.
        mean_metrics = {
            metric_name: np.mean(values)
            for metric_name, values in best_per_execution.items()
        }
        self.oracle.update_trial(
            trial.trial_id, metrics=mean_metrics, step=self._reported_step
        )

In [None]:
# Choose the distribution strategy: a TPUStrategy when a TPU cluster can be
# resolved, otherwise the strategy TensorFlow currently reports.
strategy = tf.distribute.get_strategy()  # fallback used when no TPU is found

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the cluster and initialise the TPU system before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# If you use private dataset, uncomment it
#user_secrets = UserSecretsClient()
#user_credential = user_secrets.get_gcloud_credential()
#user_secrets.set_tensorflow_credential(user_credential)


# Names of the Kaggle datasets that hold the TFRecord shards.
ds_name = [
    "setitfrecdatasettest02",
    "setitfrecdatasettest34",
    "setitfrecdatasettrain0",
    "setitfrecdatasettrain1",
    "setitfrecdatasettrain2"
]

# Look up the GCS path of every dataset.
ds_path = [KaggleDatasets().get_gcs_path(name) for name in ds_name]

# Every dataset path is searched for both train and test shards; paths without
# a match simply contribute nothing to the resulting file list.
train_filenames = tf.io.gfile.glob(
    [path + "/train*.tfrecords" for path in ds_path]
)
#val_filenames = tf.io.gfile.glob(GCS_PATH + '/val/*.tfrec')
test_filenames = tf.io.gfile.glob(
    [path + "/test*.tfrecords" for path in ds_path]
)


# List dir with real regex
# [x for x in os.listdir('.') if re.match('index_[0-9]*.csv', x)]

In [None]:
# Read train data: a dataset over the records of all training shards. Records are
# still serialized at this point; they are decoded later with
# tf.io.parse_single_example in the parse_* functions.
train_tfrec = tf.data.TFRecordDataset(train_filenames)

# Read val data (disabled: validation data is split off the training set instead)
#val_tfrec = tf.data.TFRecordDataset(val_filenames)

# Read test dataset: same as above, over the test shards.
test_tfrec = tf.data.TFRecordDataset(test_filenames)

In [None]:
# Decode one training record into a tf.train.Example so that its feature keys
# and types can be inspected when writing feature_description.
for raw_record in train_tfrec.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    # print(example)

In [None]:
def split_dataset(dataset: "tf.data.Dataset", validation_data_percent: int):
    """
    Splits a dataset into a training and a validation dataset by element index.

    The elements are enumerated; within every run of 100 consecutive indices the
    first ``validation_data_percent`` elements go to the validation dataset and
    the remaining ones go to the training dataset. The split is therefore
    deterministic and follows the order of ``dataset``.

    @param dataset: the input dataset to split.
    @param validation_data_percent: share of the data to use for validation, as
        an integer percentage between 0 and 100 (inclusive).
    @return: a tuple of two datasets as (training, validation)
    @raise ValueError: if ``validation_data_percent`` is outside [0, 100].
    """

    if not (0 <= validation_data_percent <= 100):
        raise ValueError("validation data percent must be ∈ [0,100]")

    indexed = dataset.enumerate()

    # Strict `<` on purpose: `index % 100` takes the values 0..99, so `<=` would
    # put percent + 1 out of every 100 elements into validation (and one element
    # even when 0 percent is requested).
    validation_dataset = indexed.filter(
        lambda index, data: index % 100 < validation_data_percent
    )
    train_dataset = indexed.filter(
        lambda index, data: index % 100 >= validation_data_percent
    )

    # remove enumeration
    train_dataset = train_dataset.map(lambda index, data: data)
    validation_dataset = validation_dataset.map(lambda index, data: data)

    return train_dataset, validation_dataset

In [None]:
# parse tfrecord to get feature and label
# Schema handed to tf.io.parse_single_example for both labelled and unlabelled
# records. Every feature is a scalar with a default value.
# NOTE(review): the defaults presumably let records without a "target" (the test
# shards) parse with the same schema — confirm against the shard contents.
feature_description = {
    "image": tf.io.FixedLenFeature([], tf.string, default_value=""),  # raw bytes, decoded as float16 by the parse functions
    "image_id": tf.io.FixedLenFeature([], tf.string, default_value=""),  # returned for unlabelled data
    "target": tf.io.FixedLenFeature([], tf.int64, default_value=0),  # returned as the label for labelled data
}

def _decode_image(parsed):
    """Decode the raw ``image`` bytes of a parsed example into an image tensor.

    @param parsed: dict returned by ``tf.io.parse_single_example``.
    @return: float16 tensor of shape (273*6, 256, 1).
    """
    image = tf.io.decode_raw(parsed["image"], tf.float16)
    image = tf.reshape(image, [6, 273, 256])
    # image = tf.transpose(image, [1, 0, 2])
    # Lay the 6 leading slices out one below the other and add a trailing
    # channel axis: shape (273*6, 256, 1).
    return tf.reshape(image, (273*6, 256, 1))


def parse_labeled_data(example_proto):
    """Parse one serialized example into an ``(image, target)`` pair.

    @param example_proto: a serialized ``tf.train.Example``.
    @return: tuple of the (273*6, 256, 1) image tensor and the int64 target.
    """
    # Parse the input `tf.train.Example` proto using `feature_description`.
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    return _decode_image(parsed), parsed["target"]


def parse_unlabeled_data(example_proto):
    """Parse one serialized example into an ``(image, image_id)`` pair.

    @param example_proto: a serialized ``tf.train.Example``.
    @return: tuple of the (273*6, 256, 1) image tensor and the string image id.
    """
    # Parse the input `tf.train.Example` proto using `feature_description`.
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    return _decode_image(parsed), parsed["image_id"]

# Decode the labelled records, then split them with 20 as the validation percentage.
dataset = train_tfrec.map(parse_labeled_data, num_parallel_calls=10)

train_dataset, val_dataset = split_dataset(dataset, 20)

# Training data: shuffle individual examples before batching.
train_dataset = train_dataset.shuffle(60000).batch(128)
train_dataset = train_dataset.prefetch(10)

# Validation data: not shuffled. Loss and AUC do not depend on example order,
# so a shuffle buffer here would only cost memory and start-up time. The
# batches are cached after the first pass.
val_dataset = val_dataset.batch(128).cache()
val_dataset = val_dataset.prefetch(10)

# Test data: order must stay stable because predictions and image ids are read
# in two separate passes; map keeps the input order and the batches are cached.
test_dataset = test_tfrec.map(parse_unlabeled_data, num_parallel_calls=10).batch(32)
test_dataset = test_dataset.cache()
test_dataset = test_dataset.prefetch(10)

In [None]:
# Create model
def create_model():
    """Build and compile the binary classifier inside the distribution strategy scope.

    Architecture: a 1x1 convolution that maps the single-channel input of shape
    (273*6, 256, 1) to 3 channels, an ImageNet-pretrained EfficientNetB2 without
    its top, global average pooling, two dropout + dense(relu) stages and a
    final single-unit sigmoid layer.

    The model outputs probabilities, so the loss is configured with
    ``from_logits=False``; with ``from_logits=True`` the sigmoid would be applied
    a second time inside the loss.

    @return: the compiled ``tf.keras.Sequential`` model.
    """
    with strategy.scope():
        # Search space of the (currently disabled) tuner run, kept for reference:
        # dense = hp.Choice("dense", values=[128, 256, 384, 512, 768, 1024])
        # dropout = hp.Float("dropout", 0.2, 0.7)
        # lr = hp.Float("lr", 1e-4, 1e-7)
        # wd = hp.Float("wd", 1e-5, 1e-9)

        # Fixed hyperparameters used instead of the search space above.
        dense = 1408
        dropout = 0.30989813859771875
        lr = 0.0001
        wd = 1e-05

        pretrained_model = tf.keras.applications.efficientnet.EfficientNetB2(
            include_top=False, weights="imagenet"
        )

        model = tf.keras.Sequential([
            # 1 -> 3 channels so the input matches the pretrained backbone.
            layers.Conv2D(3, (1, 1), input_shape=(273*6, 256, 1)),
            pretrained_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(dropout),
            layers.Dense(dense, activation='relu'),
            layers.Dropout(dropout),
            layers.Dense(dense, activation='relu'),
            layers.Dropout(dropout),
            layers.Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),
            # The last layer already applies a sigmoid, so the loss receives
            # probabilities, not logits.
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
            # Explicit name so the logged keys are always "auc" / "val_auc",
            # even when the model is built more than once in a session.
            metrics=[tf.keras.metrics.AUC(name="auc")]
        )

    return model

# Hyperparameter search, currently disabled: the code below is wrapped in a
# string literal, so it is evaluated as a string and discarded.
# NOTE(review): create_model() takes no arguments, while the tuner appears to
# build models from hyperparameters (see `tuner.hypermodel.build(best_hp)`);
# re-enabling the search presumably needs an `hp` parameter on create_model — confirm.
'''
tuner = TpuBayesianOptimization(
    create_model,
    objective=kt.Objective("val_auc", direction="max"),
    max_trials=15,
    overwrite=True,
    directory="tuner",
    distribution_strategy=strategy,
    project_name="seti",
)
tuner.search_space_summary()
tuner.search(
    train_dataset,
    epochs=3, 
    validation_data=val_dataset
)
best_hp = tuner.get_best_hyperparameters()[0]
model = tuner.hypermodel.build(best_hp)
tuner.results_summary(num_trials=15)
'''

# Build the model with the fixed hyperparameters and print its layer summary.
model = create_model()
model.summary()

In [None]:
# Train model
# Only the weights of the epoch with the highest monitored value are written,
# always to the same checkpoint path, which the prediction cell loads again.
checkpoint_filepath = 'best_checkpoint'
# NOTE(review): experimental_io_device="/job:localhost" presumably makes the
# checkpoint I/O run on the local host instead of the TPU workers — confirm
# against the tf.train.CheckpointOptions documentation.
options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    # NOTE(review): 'val_auc' only exists if the AUC metric is logged under the
    # name "auc"; Keras may suffix default metric names when several are
    # created in one session — confirm the key in the training log.
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    options=options
)

# 20 epochs over the training split, validating on the held-out split each epoch.
model.fit(
    train_dataset, 
    epochs=20, 
    validation_data=val_dataset,
    callbacks=[model_checkpoint_callback]
)
# save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
# model.save('saved-model', options=save_locally)

In [None]:
# Load model, predict and write submission file
# Restore the best weights written by the checkpoint callback during training.
model.load_weights(checkpoint_filepath, options=options)

# First pass over the test set: images only, for prediction.
test_images_ds = test_dataset.map(lambda image, idnum: image)
# The model ends in a single-unit layer, so predict() returns shape
# (num_examples, 1); flatten it to one score per example before it becomes a
# DataFrame column.
predictions = model.predict(test_images_ds).ravel()
print(predictions)

print('Generating submission.csv file...')
# Second pass over the test set: ids only, in the same order as the images.
test_ids_ds = test_dataset.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(np.size(predictions)))).numpy().astype('U') # all in one batch

# Guard against the two passes getting out of step before anything is written.
if len(test_ids) != len(predictions):
    raise ValueError(
        f"got {len(test_ids)} ids but {len(predictions)} predictions"
    )

data = {
    "id": test_ids,
    "target": predictions,
}
submission = pd.DataFrame(data)
submission.to_csv("submission.csv", index=False)