# Train 1DCNN on TPU

Looking at the shared notebooks, I see that many people use LightGBM.
I want to try a neural-network approach instead, because I like it.

To save time and memory, I converted train.csv to a numpy array beforehand. ([dataset link](https://www.kaggle.com/takamichitoda/ump-npy-dataset))

This dataset was made from [this notebook](https://www.kaggle.com/takamichitoda/ump-train-csv-to-npy).  

The model architecture was based on Mr. @sishihara's notebook.  
https://www.kaggle.com/sishihara/1dcnn-for-tabular-from-moa-2nd-place

Thanks:)



`update`
- Version 4: baseline, CV=0.9105 / LB=0.135
- Version 6: add dropout, CV=0.9101 / LB=0.132
- Version 8: dropout ratio 0.2 -> 0.1, CV=0.9135 / LB=0.117
- Version 9: dropout ratio 0.1 -> 0.4, CV=0.9142 / LB=0.125
- Version 11: remove dropout & [add lag feature](https://www.kaggle.com/takamichitoda/ump-lag-freatures)
- Version 15: remove lag feature & use small batch, StratifiedKFold, ReduceLROnPlateau
- Version 16: batch=4096, use correlationLoss
- Version 17: use MSE loss, small model(param 1/4)
- Version 20: TimeSeriesSplit
- Version 21: MC Dropout(0.75), large model
- Version 22: MC Dropout(0.75), small model, correlationLoss
- Version 22: skip connect model

In [None]:
import os
import gc
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K


# Resolve the TPU cluster, connect to it, initialise the TPU system, and build
# the distribution strategy whose scope is used when models are created.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# `tf.distribute.experimental.TPUStrategy` is deprecated; the stable
# `tf.distribute.TPUStrategy` is its drop-in replacement.
strategy = tf.distribute.TPUStrategy(tpu)

print('Running on TPU ', tpu.master())
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
class GCF:
    """Global configuration: input locations, CV layout and training settings."""

    # --- input files ---
    INPUT_ROOT = "/kaggle/input/ump-npy-dataset/"
    LAG_FEATURES = "/kaggle/input/ump-lag-freatures/target_shift_1.npy"
    # TIME_ID_LIMIT = 500  # disabled: only used by the commented-out subsetting

    # --- cross-validation ---
    SEED = 0
    N_FOLDS = 5
    N_TRAIN = 1_500_000  # upper bound on the number of training rows per fold

    # --- optimisation ---
    BATCH_SIZE = 4096
    N_EPOCHS = 1000  # upper bound; early stopping normally ends training sooner
    EARLY_STOPPING_MIN_DELTA = 1e-3
    EARLY_STOPPING_PATIENCE = 10

In [None]:
def seed_everything(seed=GCF.SEED):
    """Seed the `random`, NumPy and TensorFlow generators with `seed`.

    Also stores `seed` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
%%time

# Load the pre-converted arrays: feature matrix, targets and per-row time_id.
X, y, time_id = (
    np.load(f"{GCF.INPUT_ROOT}/{stem}.npy")
    for stem in ("features_std_scaled", "targets", "time_id")
)
#investment_id = np.load(f"{GCF.INPUT_ROOT}/investment_id.npy")

# Optional (currently disabled): keep only newer data to save memory.
#X = X[time_id > GCF.TIME_ID_LIMIT, :]
#y = y[time_id > GCF.TIME_ID_LIMIT]
#investment_id = investment_id[time_id > GCF.TIME_ID_LIMIT]
#time_id = time_id[time_id > GCF.TIME_ID_LIMIT]
#gc.collect()

In [None]:
# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302977

def correlationMetric(x, y, axis=-2):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of `x` and `y` along `axis`.

    Lower is better: 0 means perfectly correlated, 1 uncorrelated and 2
    perfectly anti-correlated. When either input has zero variance along
    `axis` the correlation is treated as 0 (so the result is 1) instead of
    producing NaN.

    Args:
        x: First tensor (or tensor-like); its dtype is used for the computation.
        y: Second tensor; cast to the dtype of `x`.
        axis: Axis to reduce over. The default of -2 presumably targets inputs
            shaped ``(batch, 1)`` -- TODO confirm against the caller.

    Returns:
        A tensor of dtype ``x.dtype`` with `axis` reduced away.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    # keepdims=True lets the means broadcast against the inputs for any `axis`,
    # not only for the leading one.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xvar = tf.reduce_sum(tf.math.squared_difference(x, xmean), axis=axis)
    yvar = tf.reduce_sum(tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum((x - xmean) * (y - ymean), axis=axis)
    # divide_no_nan yields 0 where the denominator is 0 (constant input),
    # so a single degenerate batch cannot turn the epoch metric into NaN.
    corr = tf.math.divide_no_nan(cov, tf.sqrt(xvar * yvar))
    return tf.constant(1.0, dtype=x.dtype) - corr

def correlationLoss(x, y, axis=-2):
    """Loss combining ``1 - Pearson correlation`` with a small scaled squared-error term.

    The value is ``mean(1 - r + 0.01 * sqdif)``, where ``r`` is the Pearson
    correlation of `x` and `y` along `axis` and ``sqdif`` is the mean squared
    difference between `x` and `y` divided by the standard deviation of `y`.
    The second term nudges the two tensors towards the same mean and scale.

    Args:
        x: First tensor (or tensor-like); its dtype is used for the computation.
        y: Second tensor; cast to the dtype of `x`.
        axis: Axis to reduce over (default -2).

    Returns:
        A scalar float32 tensor.

    Note:
        If either input has zero variance along `axis` the result is NaN/inf.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    # keepdims=True lets the means broadcast against the inputs for any `axis`.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xsqsum = tf.reduce_sum(tf.math.squared_difference(x, xmean), axis=axis)
    ysqsum = tf.reduce_sum(tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum((x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xsqsum * ysqsum)
    # Mean squared difference, normalised by the standard deviation of `y`.
    # NOTE(review): Keras calls losses as fn(y_true, y_pred), so `y` would be the
    # prediction here -- confirm that normalising by its spread is intended.
    sqdif = tf.reduce_sum(tf.math.squared_difference(x, y), axis=axis) / n / tf.sqrt(ysqsum / n)
    loss = tf.reduce_mean(tf.constant(1.0, dtype=x.dtype) - corr + (0.01 * sqdif))
    # tf.cast converts any float dtype to float32, whereas
    # tf.convert_to_tensor(..., dtype=tf.float32) raises on a non-float32 tensor.
    return tf.cast(loss, tf.float32)


# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301987
def pearson_coef(data):
    """Return the correlation between the 'target' and 'preds' columns of `data`."""
    corr_matrix = data.corr()
    return corr_matrix.loc['preds', 'target']

def comp_metric(time_id, y, pred):
    """Return the mean, over time_id groups, of the target/prediction correlation.

    The three equal-length sequences are stacked into a table with columns
    'time_id', 'target' and 'preds'; `pearson_coef` is applied to each
    time_id group and the per-group values are averaged.
    """
    table = np.stack([time_id, y, pred]).T
    frame = pd.DataFrame(table, columns=['time_id', 'target', 'preds'])
    per_time_id = frame.groupby('time_id').apply(pearson_coef)
    return np.mean(per_time_id)

In [None]:
# https://www.kaggle.com/sishihara/1dcnn-for-tabular-from-moa-2nd-place
def create_model():
    """Build and compile the 1D-CNN regressor.

    A dense layer expands the 300 input features to 1024 units, which are
    reshaped to a (64, 16) sequence, passed through dropout, a 1D convolution
    and max-pooling, then flattened into a small dense head with one linear
    output. The model is compiled with Adam (lr 1e-4) and MSE loss, and tracks
    RMSE and `correlationMetric`.
    """
    hidden_units = 4096 // 4
    seq_len = 256 // 4  # hidden_units == seq_len * 16

    stack = [
        layers.Dense(hidden_units, activation='relu', input_shape=(300,)),
        layers.Reshape((seq_len, 16)),
        layers.Dropout(0.75),
        layers.Conv1D(filters=16, kernel_size=5, strides=1, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='linear'),
    ]
    model = keras.Sequential(stack)

    tracked_metrics = [keras.metrics.RootMeanSquaredError(), correlationMetric]
    # Alternative objective tried in earlier versions: loss=correlationLoss
    model.compile(
        optimizer=tf.optimizers.Adam(1e-4),
        loss='mse',
        metrics=tracked_metrics,
    )
    return model

In [None]:
# Cross-validated training: one freshly built model per fold, trained with
# early stopping and learning-rate reduction, then scored, plotted and saved.
seed_everything()

# TimeSeriesSplit cuts by row position, each fold validating on rows that come
# after its training rows.
# NOTE(review): this assumes the rows of X are ordered by time_id -- confirm.
#kf = StratifiedKFold(5, shuffle=True, random_state=GCF.SEED)
kf = TimeSeriesSplit(n_splits=GCF.N_FOLDS, max_train_size=GCF.N_TRAIN)

rmse_lst, score_lst = [], []
#oof = np.zeros((len(y),))
#for fold, (train_idx, valid_idx) in enumerate(kf.split(X, investment_id)):
for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    # Slice the validation fold once; it is used for fit, predict and scoring.
    X_valid, y_valid = X[valid_idx, :], y[valid_idx]

    with strategy.scope():
        model = create_model()

    # correlationMetric is 1 - correlation, so lower is better. mode='min' is
    # what 'auto' resolves to for this metric name; it is spelled out for clarity.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_correlationMetric',
        mode='min',
        patience=GCF.EARLY_STOPPING_PATIENCE,
        min_delta=GCF.EARLY_STOPPING_MIN_DELTA,
        restore_best_weights=True,
    )
    reduce_lr = ReduceLROnPlateau(
        #monitor='val_loss',
        monitor='val_correlationMetric',
        mode='min',
        factor=0.5,
        patience=3,
        min_lr=1e-5,
        verbose=1,
    )

    history = model.fit(
        X[train_idx, :], y[train_idx],
        validation_data=(X_valid, y_valid),
        batch_size=GCF.BATCH_SIZE,
        epochs=GCF.N_EPOCHS,
        callbacks=[early_stopping, reduce_lr],
    )

    #oof[valid_idx] = model.predict(X[valid_idx, :]).reshape(1, -1)[0]
    # Flatten the model output to a 1-D vector.
    valid_pred = model.predict(X_valid).reshape(-1)

    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    score = comp_metric(time_id[valid_idx], y_valid, valid_pred)
    print(f'Fold-{fold}: RMSE={rmse}, SCORE={score}')

    # One training-vs-validation curve per tracked quantity.
    history_df = pd.DataFrame(history.history)
    curves = (
        ("loss", "loss"),
        ("rmse", "root_mean_squared_error"),
        ("correlation", "correlationMetric"),
    )
    for title, key in curves:
        history_df[[key, f"val_{key}"]].plot()
        plt.title(title)
        plt.show()

    # NOTE(review): the model is compiled with the custom `correlationMetric`;
    # reloading this file presumably needs `custom_objects` or `compile=False`.
    model.save(f"ump_1dcnn_f{fold}.h5")
    rmse_lst.append(rmse)
    score_lst.append(score)

In [None]:
# Per-fold values followed by their mean: RMSE first, then the competition score.
for fold_values in (rmse_lst, score_lst):
    print(fold_values, np.mean(fold_values))

In [None]:
!ls