## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import math

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence
import tensorflow.keras.layers as L

## Use GPU

In [None]:
# Enable on-demand GPU memory growth so TensorFlow allocates device memory as
# needed instead of reserving it all up front.
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
try:
    # The memory-growth setting has to be identical on every visible GPU, so
    # apply it to all of them rather than only the first one.
    for device in gpu:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Raised when the GPUs were already initialised (e.g. this cell is re-run
    # in a live kernel); the setting can only be changed before that point.
    print(err)

## Data File Path

In [None]:
# Root of the competition dataset; every other location is derived from it.
data_dir = Path('../input/seti-breakthrough-listen/')

# Directories that hold the per-sample .npy files for each split.
train_data_dir, test_data_dir = (data_dir / split for split in ('train', 'test'))

# CSV files: training labels and the sample submission.
train_label_file = data_dir.joinpath('train_labels.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

In [None]:
# Display the training directory path as a quick sanity check.
train_data_dir

## Data ID

In [None]:
# Column names shared by the label file and the sample submission.
id_col, target_col = 'id', 'target'

# Both CSVs are indexed by sample id.
label, sub = (
    pd.read_csv(csv_path, index_col=id_col)
    for csv_path in (train_label_file, sample_file)
)

# Show (rows, columns) of the sample submission.
sub.shape

## Convert Data ID to File Path

In [None]:
def id_to_path(s, train=True):
    """Return the ``.npy`` file path for sample id ``s``.

    Files live in a sub-directory named after the first character of the id,
    under the training directory when ``train`` is true and under the test
    directory otherwise.
    """
    root = train_data_dir if train else test_data_dir
    shard = s[0]
    return root / shard / f'{s}.npy'

### Visualization


In [None]:
# Preview the first ten training samples on a 5 x 2 grid.
fig = plt.figure(figsize=(24, 8))
for i in range(10):
    sample_path = id_to_path(label.index[i])
    cadence = np.load(sample_path).astype(np.float32)  # (6, 273, 256)
    # Join the six arrays along the first axis, then swap the two axes:
    # (1638, 256) -> (256, 1638)
    image = np.concatenate(cadence, axis=0).T
    ax = fig.add_subplot(5, 2, i + 1)
    ax.imshow(image)
plt.show()

### Data augmentation - Work in progress

In [None]:
# Example of vertical-shift image augmentation (work in progress).
# NOTE: `image` is the last array produced by the visualisation cell, so that
# cell has to be run before this one.
from numpy import expand_dims
# Use the Keras bundled with TensorFlow, consistent with the rest of the notebook.
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# convert to a numpy array with an explicit channel axis
data = img_to_array(image)
# expand dimension to one sample
samples = expand_dims(data, 0)
# create image data augmentation generator (random shift along the row axis)
datagen = ImageDataGenerator(height_shift_range=0.5)
# prepare iterator
it = datagen.flow(samples, batch_size=1)
# generate samples and plot
plt.figure(figsize=(24, 8))
for i in range(9):
    # define subplot
    plt.subplot(3, 3, i + 1)
    # generate batch of images; the built-in next() uses the iterator protocol
    # and does not depend on the iterator exposing a .next() method
    batch = next(it)
    # Keep the augmented sample under its own name so `image` is not
    # overwritten and the cell gives the same result when re-run.
    # No uint8 cast: it would discard the fractional part of the float data,
    # and imshow colour-maps a 2-D scalar array on its own.
    augmented = batch[0][:, :, 0]
    # plot the augmented sample (not the unshifted source array)
    plt.imshow(augmented)
# show the figure
plt.show()

## Building a model

### Parameters

In [None]:
# Shape of one model input: each sample is stored as six 273 x 256 arrays that
# the data sequence moves to the last (channel) axis, which is also the shape
# the Conv2D input in create_model expects.
input_size = (273, 256, 6)
BATCH_SIZE = 32  # samples per batch
n_epoch = 3      # training epochs per fold
seed = 42        # random seed
VERBOSE = 0      # verbosity level

### Sequence of Data

[tf.keras.utils.Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence)

In [None]:
class SETISequence(Sequence):
    """Keras ``Sequence`` that loads batches of ``.npy`` samples by id.

    Args:
        x_set: Sliceable collection of sample ids.
        y_set: Optional labels aligned with ``x_set``. When given, ids are
            resolved in the training directory and ``__getitem__`` returns
            ``(batch_x, batch_y)``. When ``None``, ids are resolved in the
            test directory and only ``batch_x`` is returned.
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to the base-class constructor.
    """

    def __init__(self, x_set, y_set=None, batch_size=32, **kwargs):
        # Newer Keras versions expect subclasses to initialise the base class
        # and emit a warning when this call is missing.
        super().__init__(**kwargs)
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.is_train = y_set is not None

    def __len__(self):
        """Return the number of batches; the last one may be partial."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Load batch ``idx`` from disk and return it channels-last."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_ids = self.x[window]

        # Stack the per-sample arrays into one batch array, then move axis 1
        # to the end so it becomes the trailing (channel) axis.
        stacked = np.stack([np.load(id_to_path(x, self.is_train)) for x in batch_ids])
        batch_x = np.moveaxis(stacked, 1, -1)

        if self.is_train:
            return batch_x, self.y[window]
        return batch_x
        

In [None]:
def create_model(input_shape=(273, 256, 6), learning_rate=5e-4):
    """Build and compile a small CNN binary classifier.

    Architecture: one 3x3 convolution with 3 filters (ReLU, same padding),
    global average pooling, and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input sample, channels last.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model using binary cross-entropy
        loss and an AUC metric named ``'auc'``.
    """
    model = tf.keras.Sequential([
        # Declare the input explicitly instead of passing input_shape to the
        # first layer, which newer Keras versions warn about.
        keras.Input(shape=input_shape),
        L.Conv2D(3, (3, 3), strides=(1, 1), padding="same", activation='relu'),
        L.GlobalAveragePooling2D(),
        L.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        # A fixed name keeps the metric key stable ('auc' / 'val_auc') when
        # the model is rebuilt several times in one session.
        metrics=[keras.metrics.AUC(name='auc')],
    )

    return model

## Train & Inference with StratifiedKFold

In [None]:
# Accumulator for the fold-averaged test prediction. Start from a float column
# so the additions below never change the column dtype.
sub[target_col] = 0.0

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)

# Training ids and labels.
x0 = label.index.values
y0 = label[target_col].values

# Test ids; the same test sequence is reused for every fold.
x1 = sub.index.values
test = SETISequence(x1, batch_size=BATCH_SIZE)

for train_index, val_index in skf.split(x0, y0):
    x_train, x_val = x0[train_index], x0[val_index]
    y_train, y_val = y0[train_index], y0[val_index]

    train = SETISequence(x_train, y_train, batch_size=BATCH_SIZE)
    val = SETISequence(x_val, y_val, batch_size=BATCH_SIZE)

    # A fresh model is built for every fold; clear Keras' global state first
    # so the previous folds' models do not accumulate in memory.
    keras.backend.clear_session()
    model = create_model()

    print('training')
    model.fit(train, validation_data=val, epochs=n_epoch)

    # Each fold contributes an equal share to the final test prediction.
    prediction = model.predict(test).flatten()
    sub[target_col] += prediction / n_splits

In [None]:
# Write the predictions accumulated over the folds to the submission file,
# then show the frame's shape as a quick sanity check.

sub.to_csv('submission.csv')
sub.shape

### References

[🛸 Signal Search 👽 - Exploratory Data Analysis](https://www.kaggle.com/ihelon/signal-search-exploratory-data-analysis)

[SETI Simple Code for Beginners(TensorFlow)](https://www.kaggle.com/kenjirokiyono/seti-simple-code-for-beginners-tensorflow)