# UMP PyTorch ResNet Training & Inference

This notebook treats the challenge as a tabular data problem and uses the [rtdl](https://github.com/Yura52/rtdl) package to implement the ResNet model described in the paper [Yury Gorishniy, Ivan Rubachev, Valentin Khrulkov, Artem Babenko, "Revisiting Deep Learning Models for Tabular Data", 2021](https://github.com/Yura52/tabular-dl-revisiting-models).

The preprocessing is taken from [columbia2131](https://www.kaggle.com/columbia2131)'s [Speed Up Reading (csv-to-pickle)](https://www.kaggle.com/code/columbia2131/speed-up-reading-csv-to-pickle/notebook) and [Takamichi Toda](https://www.kaggle.com/takamichitoda)'s [UMP Train Transformer on TPU](https://www.kaggle.com/code/takamichitoda/ump-train-transformer-on-tpu/) and [UMP train.csv to npy](https://www.kaggle.com/takamichitoda/ump-train-csv-to-npy).


In [None]:
# Optimiser and batching hyper-parameters read by the training and
# evaluation cells below.
lr = 1e-3  # other values noted previously: 1e-4, 0.001
weight_decay = 1e-4
batch_size = 6 * 1024

class CFG:
    """Run switches and model hyper-parameters used throughout the notebook."""

    # Which stages of the notebook are executed.
    TRAIN = False
    INFER = True

    # Prefix of the per-fold checkpoint file names.
    CHECKPOINT = 'resnet_chkpt_20220418a'
    # Number of GroupKFold splits; one model is trained/loaded per split.
    SPLITS = 7

    # Keyword arguments forwarded to rtdl.ResNet.make_baseline.
    D_IN = 300
    D_OUT = 1
    D_MAIN = 256
    D_HIDDEN = 256
    N_BLOCKS = 8
    DROPOUT_FIRST = 0.4
    DROPOUT_SECOND = 0.4



In [None]:
import datetime
import gc
import logging.handlers
import os
import pickle
import random
import time

import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import torch
import torch.nn.functional as F
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from torch.utils.tensorboard import SummaryWriter

import ubiquant

if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') != 'Localhost':
    # !pip install rtdl
    !pip install ../input/transwork003data/rtdl-0.0.13-py3-none-any.whl
import rtdl
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') == 'Localhost':
    import zero
    zero.hardware.free_memory()

device = torch.device('cuda')
loss_fn = F.mse_loss
print(f'Num GPUs Available: {torch.cuda.device_count()}')


In [None]:
# Based on code in the book Deep Learning with PyTorch by Eli Stevens,
# Luca Antiga, and Thomas Viehmann, published by Manning Publications.
# https://www.manning.com/books/deep-learning-with-pytorch
def enumerateWithEstimate(_iter, desc_str, print_ndx=4,):
    """Enumerate ``_iter`` while logging an estimated completion time.

    Behaves like ``enumerate(_iter)`` but logs a "starting" message, a
    progress line with the projected finish time whenever the current index
    reaches ``print_ndx`` (which is then multiplied by a back-off factor so
    the reports become progressively rarer), and a final "done" message.

    Args:
        _iter: A sized iterable (``len(_iter)`` must work).
        desc_str: Label prepended to every log message.
        print_ndx: Zero-based index at which the first estimate is logged.

    Yields:
        ``(index, item)`` tuples, exactly as ``enumerate`` would.
    """
    # Resolve the logger here so the helper does not depend on a global
    # ``log`` having been created by another notebook cell.
    logger = logging.getLogger(__name__)

    iter_len = len(_iter)
    # Grow the back-off factor with the iterable's length so that long loops
    # do not emit an excessive number of progress lines.
    backoff = 2
    while backoff ** 7 < iter_len:
        backoff *= 2

    logger.warning("{} ----/{}, starting".format(
        desc_str, iter_len,
    ))
    start_ts = time.time()
    for (current_ndx, item) in enumerate(_iter):
        yield (current_ndx, item)
        if current_ndx == print_ndx:
            # current_ndx is zero-based, so current_ndx + 1 items have been
            # processed; scale the elapsed time up to the full length.
            duration_sec = ((time.time() - start_ts)
                            / (current_ndx + 1) * iter_len
                            )
            done_dt = datetime.datetime.fromtimestamp(start_ts + duration_sec)
            done_td = datetime.timedelta(seconds=duration_sec)
            logger.info("{} {:-4}/{}, done at {}, {}".format(
                desc_str,
                current_ndx,
                iter_len,
                str(done_dt).rsplit('.', 1)[0],
                str(done_td).rsplit('.', 1)[0],
            ))
            print_ndx *= backoff

    logger.warning("{} ----/{}, done at {}".format(
        desc_str, iter_len,
        str(datetime.datetime.now()).rsplit('.', 1)[0],
    ))

# Names of the 300 feature columns, 'f_0' through 'f_299'.
features = list(map('f_{}'.format, range(300)))

def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so one loop suffices.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

# https://www.kaggle.com/competitions/ubiquant-market-prediction/discussion/302480

def apply_model(model, device, x_num):
    """Run ``model`` on ``x_num`` after moving the input to ``device``.

    ``rtdl.FTTransformer`` is called with an additional ``None`` second
    argument; ``rtdl.MLP`` and ``rtdl.ResNet`` receive only the numerical
    features.

    Raises:
        NotImplementedError: If ``model`` is none of the supported types.
    """
    if isinstance(model, rtdl.FTTransformer):
        extra_args = (None,)
    elif isinstance(model, (rtdl.MLP, rtdl.ResNet)):
        extra_args = ()
    else:
        raise NotImplementedError(
            f'{type(model)} not implemented.'
        )
    # The input is only transferred once the model type is known to be valid.
    return model(x_num.to(device), *extra_args)

@torch.no_grad()
def evaluate(model, part):
    """Return the root-mean-squared error of ``model`` on split ``part``.

    Reads the module-level ``X``/``y`` dictionaries, ``batch_size`` and
    ``device``. The model is switched to eval mode and left in that mode.

    Args:
        model: Model accepted by ``apply_model``.
        part: Key into ``X`` and ``y`` (for example ``'val'``).
    """
    model.eval()
    batch_outputs = [
        apply_model(model, device, chunk)
        for chunk in zero.iter_batches(X[part], batch_size)
    ]
    predicted = torch.cat(batch_outputs).squeeze(1).cpu().numpy()
    actual = y[part].cpu().numpy()
    mse = sklearn.metrics.mean_squared_error(actual, predicted)
    return mse ** 0.5 * 1.0

@torch.no_grad()
def evaluate_pearson(model, part):
    """Return the Pearson correlation between predictions and targets.

    Reads the module-level ``X``/``y`` dictionaries, ``batch_size`` and
    ``device``. The model is switched to eval mode and left in that mode.

    Args:
        model: Model accepted by ``apply_model``.
        part: Key into ``X`` and ``y`` (for example ``'val'``).
    """
    model.eval()
    batch_outputs = [
        apply_model(model, device, chunk)
        for chunk in zero.iter_batches(X[part], batch_size)
    ]
    predicted = torch.cat(batch_outputs).squeeze(1).cpu().numpy()
    actual = y[part].cpu().numpy()
    # pearsonr returns (statistic, p-value); only the statistic is needed.
    correlation = pearsonr(actual, predicted)[0]
    return correlation


In [None]:
# https://www.kaggle.com/columbia2131/speed-up-reading-csv-to-pickle

if CFG.TRAIN and not os.path.isfile('train.pkl'):
    def transform_csv2pickle(path, usecols, dtypes):
        """Read the selected CSV columns with explicit dtypes and write them to train.pkl."""
        pd.read_csv(path, usecols=usecols, dtype=dtypes).to_pickle('train.pkl')

    path = '../input/ubiquant-market-prediction/train.csv'
    basecols = ['row_id', 'time_id', 'investment_id', 'target']
    # Explicit dtypes for the id/target columns, in the same order as basecols.
    dtypes = dict(zip(basecols, ('str', 'uint16', 'uint16', 'float32')))
    # Every feature column is read as float32.
    dtypes.update(dict.fromkeys(features, 'float32'))

    transform_csv2pickle(path, basecols+features, dtypes)


In [None]:
# https://www.kaggle.com/takamichitoda/ump-train-csv-to-npy

# Artifacts produced by this cell; if any is missing, all are regenerated.
preprocessed_files = (
    'targets.npy',
    'time_id.npy',
    'investment_id.npy',
    'std_scaler.pkl',
    'robust_scaler.pkl',
    'quantile_transformer.pkl',
    'features_std_scaled.npy',
    'features_robust_scaled.npy',
    'features_quantile_transformer.npy',
)

if CFG.TRAIN and not all(os.path.isfile(name) for name in preprocessed_files):
    def _fit_and_save(transformer, data, transformer_path, features_path):
        """Fit ``transformer`` on ``data``; persist it and the transformed features."""
        transformed = transformer.fit_transform(data)
        # A context manager guarantees the pickle file is flushed and closed.
        with open(transformer_path, "wb") as transformer_file:
            pickle.dump(transformer, transformer_file)
        np.save(features_path, transformed)

    train_df = pd.read_pickle('train.pkl')

    y = train_df['target'].values
    time_id = train_df['time_id'].values
    investment_id = train_df['investment_id'].values
    # Drop the non-feature columns before the feature matrix is extracted.
    del train_df['row_id'], train_df['time_id'], train_df['investment_id'], train_df['target']
    gc.collect()

    np.save('targets.npy', y)
    np.save('time_id.npy', time_id)
    np.save('investment_id.npy', investment_id)

    X = train_df[features].values

    del train_df
    gc.collect()

    _fit_and_save(StandardScaler(), X, 'std_scaler.pkl', 'features_std_scaled.npy')
    _fit_and_save(RobustScaler(), X, 'robust_scaler.pkl', 'features_robust_scaled.npy')
    _fit_and_save(
        QuantileTransformer(
            output_distribution='normal',
            n_quantiles=max(min(X.shape[0] // 30, 1000), 10), # n_quantiles=100, # 1000
            # scikit-learn documents `subsample` as an int; the original
            # float literal 1e9 is converted explicitly.
            subsample=int(1e9),
            random_state=42,
        ),
        X,
        'quantile_transformer.pkl',
        'features_quantile_transformer.npy',
    )

    gc.collect()



In [None]:
if CFG.TRAIN:
    # Load the cached arrays. The standard-scaled features are used here;
    # the other cached variants are 'features_robust_scaled.npy' and
    # 'features_quantile_transformer.npy'.
    X_all, investment_id_all, y_all, time_id_all = map(
        np.load,
        (
            'features_std_scaled.npy',
            'investment_id.npy',
            'targets.npy',
            'time_id.npy',
        ),
    )
    zero.hardware.free_memory()
    seed_everything(42)

In [None]:
if CFG.TRAIN:
    # ---- Logging: replace existing root handlers with console + file output.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        # Close removed handlers so re-running the cell does not leak log.txt handles.
        handler.close()
    logfmt_str = "%(asctime)s %(levelname)-8s pid:%(process)d %(name)s:%(lineno)03d:%(funcName)s %(message)s"
    formatter = logging.Formatter(logfmt_str)
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(formatter)
    streamHandler.setLevel(logging.DEBUG)
    fileHandler = logging.FileHandler("log.txt")
    # Use the same format in log.txt as on the console.
    fileHandler.setFormatter(formatter)
    fileHandler.setLevel(logging.DEBUG)
    root_logger.addHandler(streamHandler)
    root_logger.addHandler(fileHandler)
    log = logging.getLogger(__name__)
    # Docs: https://yura52.github.io/zero/0.0.4/reference/api/zero.improve_reproducibility.html
    zero.improve_reproducibility(seed=123456)
    # Folds are grouped by investment_id, so an id never appears in both the
    # training and the validation part of the same fold.
    kfold = sklearn.model_selection.GroupKFold(n_splits=CFG.SPLITS)

    fold_scores = []  # best validation RMSE of each fold
    models = []       # model object of each fold, as it was when training stopped

    X = {}
    y = {}
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(X_all, y_all, groups=investment_id_all)):
        X['train'], y['train'] = torch.from_numpy(X_all[trn_idx]).clone(), torch.from_numpy(y_all[trn_idx]).clone()
        X['val'], y['val'] = torch.from_numpy(X_all[val_idx]).clone(), torch.from_numpy(y_all[val_idx]).clone()
        model = rtdl.ResNet.make_baseline(
            d_in=CFG.D_IN,
            d_main=CFG.D_MAIN,
            d_hidden=CFG.D_HIDDEN,
            dropout_first=CFG.DROPOUT_FIRST,
            dropout_second=CFG.DROPOUT_SECOND,
            n_blocks=CFG.N_BLOCKS,
            d_out=CFG.D_OUT,
        )
        model.to(device)
        optimizer = (
            model.make_default_optimizer()
            if isinstance(model, rtdl.FTTransformer)
            else torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        )
        # Create a dataloader for batches of indices. The indices stay on the
        # CPU because the training tensors they index are CPU tensors.
        # Docs: https://yura52.github.io/zero/reference/api/zero.data.IndexLoader.html
        train_loader = zero.data.IndexLoader(len(X['train']), batch_size, device='cpu')

        # Create a progress tracker for early stopping
        # Docs: https://yura52.github.io/zero/reference/api/zero.ProgressTracker.html
        progress = zero.ProgressTracker(patience=30)

        writer = SummaryWriter(flush_secs=30)
        checkpoint_file = CFG.CHECKPOINT + '_fold' + str(fold) + '.pt'
        zero.hardware.free_memory()
        n_epochs = 1000
        best_epoch = 1
        best_val_score = None
        try:
            for epoch in range(1, n_epochs + 1):
                # evaluate() leaves the model in eval mode, so switch back to
                # train mode at the start of every epoch.
                model.train()
                for _, batch_idx in enumerateWithEstimate(train_loader, 'train'):
                    optimizer.zero_grad()
                    x_batch = X['train'][batch_idx]
                    y_batch = y['train'][batch_idx]
                    loss = loss_fn(apply_model(model, device, x_batch).squeeze(1), y_batch.to(device))
                    loss.backward()
                    optimizer.step()
                # Only the loss of the epoch's last batch is recorded.
                writer.add_scalar("train_loss", loss.item(), epoch)

                val_score = evaluate(model, 'val')
                val_pearson = evaluate_pearson(model, 'val')
                print(f'Epoch {epoch:03d} | val_score: {val_score:.4f} | val_pearson: {val_pearson:.4f}', end='')
                writer.add_scalar("val_score", val_score, epoch)
                writer.add_scalar("pearsonr", val_pearson, epoch)
                # The tracker receives the negated RMSE, so a lower RMSE is
                # reported to it as a higher score.
                progress.update(-1 * val_score)
                # progress.update(val_pearson)
                if progress.success:
                    best_epoch = epoch
                    best_val_score = val_score
                    print(' <<< BEST VALIDATION EPOCH', end='')
                    torch.save(
                        { 'model': model.state_dict(),
                          'optimizer': optimizer.state_dict(),
                          'random_state': zero.random.get_state(),
                          },
                        checkpoint_file,
                    )
                print()
                if progress.fail:
                    break
        finally:
            # Flush and release the TensorBoard event file of this fold.
            writer.close()

        # Record the fold whether training stopped early or ran all epochs.
        # NOTE: `model` holds the weights of the last epoch trained; the
        # weights of the best epoch are the ones saved in `checkpoint_file`.
        models.append(model)
        fold_scores.append(best_val_score)
        print(f"best epoch is {best_epoch}")


In [None]:
if CFG.INFER:
    env = ubiquant.make_env()   # initialize the environment
    iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

    # Scaler and checkpoints are read from the attached dataset on Kaggle and
    # from the working directory otherwise.
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') != 'Localhost':
        DATA_PATH = os.path.join('..', 'input', 'transwork003data')
    else:
        DATA_PATH = os.path.join('.')
    # The context manager closes the pickle file once it has been read.
    with open(os.path.join(DATA_PATH, 'std_scaler.pkl'), 'rb') as scaler_file:
        normalizer = pickle.load(scaler_file)
    # To use the quantile transformer instead, load 'quantile_transformer.pkl'.
    models = []
    for fold in range(CFG.SPLITS):
        model = rtdl.ResNet.make_baseline(
            d_in=CFG.D_IN,
            d_main=CFG.D_MAIN,
            d_hidden=CFG.D_HIDDEN,
            dropout_first=CFG.DROPOUT_FIRST,
            dropout_second=CFG.DROPOUT_SECOND,
            n_blocks=CFG.N_BLOCKS,
            d_out=CFG.D_OUT,
        )
        checkpoint_file = CFG.CHECKPOINT + '_fold' + str(fold) + '.pt'
        # map_location places the stored tensors on the device in use, rather
        # than on whichever device they were saved from.
        checkpoint = torch.load(os.path.join(DATA_PATH, checkpoint_file), map_location=device)
        model.load_state_dict(checkpoint['model'])
        model.to(device)
        model.eval()
        models.append(model)
    print('Resuming from the checkpoint.\n')


In [None]:
if CFG.INFER:
    for (test_df, sample_prediction_df) in iter_test:
        # Apply the same scaling that was fitted on the training features.
        x = normalizer.transform(test_df[features].values)
        preds = None
        with torch.no_grad():
            _x = torch.from_numpy(x.astype(np.float32)).to(device)
            # Sum the per-model predictions on the device and copy the result
            # to the host once, instead of once per model.
            for model in models:
                fold_pred = apply_model(model, device, _x).reshape(-1)
                preds = fold_pred if preds is None else preds + fold_pred
        # Average over the models actually loaded; the result is 1-D, one
        # value per test row.
        pred = (preds / len(models)).cpu().numpy()
        sample_prediction_df['target'] = pred  # make your predictions here
        env.predict(sample_prediction_df)   # register your predictions