#### Code to train models

In [None]:
# Enable the autoreload extension; mode 2 reloads imported modules before each execution,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

## Imports

In [None]:
import os
import ast
import glob
import cudf
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Display up to 100 columns when rendering DataFrames.
pd.set_option("display.max_columns", 100)
# Set up pandarallel with progress bars and the in-memory file system both disabled.
pandarallel.initialize(use_memory_fs=False, progress_bar=False)

In [None]:
from params import *

from data.preparation import prepare_train_data
from data.dataset import NLIDataset
from data.tokenization import get_tokenizer
from data.features import nli_features

from model_zoo.models import NLITransformer

from training.main_nli import k_fold

from utils.logger import prepare_log_folder, save_config, create_logger

## Data

In [None]:
# Load the training table, discard the address-related columns,
# then convert the result to a cudf DataFrame.
df = cudf.from_pandas(
    prepare_train_data(root=DATA_PATH).drop(
        columns=['address', 'city', 'state', 'zip', 'country']
    )
)

In [None]:
# Experiment folder (under LOG_PATH) from which the kept-pairs file is read below.
EXP_FOLDER = LOG_PATH + "lvl_2/" + "2022-05-30/1/"
# Interpolated into the pairs file name (kept_pairs_<THRESHOLD>.csv), so the literal
# must match the name of an existing file exactly.
THRESHOLD = 0.01  # 0.001

# NOTE(review): FOLD is not referenced in any of the visible cells.
FOLD = 0

In [None]:
# Read the pairs file written for the selected threshold in the experiment folder.
df_p = cudf.read_csv(EXP_FOLDER + f'kept_pairs_{THRESHOLD}.csv')

# First merge: attach the columns of `df` for the first element of each pair (key id_1).
# Second merge: attach them again for the second element (key id_2); column names present
# on both sides receive the "_1" / "_2" suffixes.
# NOTE(review): after the first merge the frame presumably also carries df's own `id` column,
# which would be suffixed into `id_1` while an `id_1` key column already exists — confirm
# the resulting columns (see the `df_p.columns` cell below).
df_p = df_p.merge(df, how="left", left_on="id_1", right_on="id")
df_p = df_p.merge(df, how="left", left_on="id_2", right_on="id", suffixes=("_1", "_2"))

print(f'Retrieved {numerize(len(df_p))} pairs')

In [None]:
# Run the feature computation on the merged pairs. FEATURES is later passed to the dataset
# and its length sets the model's nb_features; STATS is not used in the visible cells.
# NOTE: df_p is overwritten with the function's output, so re-running this cell feeds the
# already-processed frame back into nli_features.
df_p, FEATURES, STATS = nli_features(df_p)

In [None]:
# Inspect the column names after the merges and the feature computation.
df_p.columns

### Dataset

In [None]:
# Backbone identifier, reused for the tokenizer here and for the test model further down.
name = "xlm-roberta-base"

tokenizer = get_tokenizer(name)

In [None]:
# Build the dataset from the pairs frame, the tokenizer and the feature list.
dataset = NLIDataset(df_p, tokenizer, FEATURES)

In [None]:
# Disabled exploration: plot the distribution of token lengths over the first 10000 samples.
# NOTE(review): the keys read here ('ref_ids', 'pos_ids', 'neg_ids') differ from the
# "ids" / "fts" keys read from dataset items later in this notebook — this snippet looks
# stale; confirm before re-enabling it.
# lens = []
# for idx in tqdm(range(10000)):
#     data = dataset[idx]
#     lens += [len(data['ref_ids']), len(data['pos_ids']), len(data['neg_ids'])]

# sns.distplot(lens)

In [None]:
# Show the first row of the pairs frame.
df_p.head(1)

In [None]:
# Pick a random index and display what the dataset returns for it.
# No seed is set in the visible cells, so the selected sample may differ between runs.
idx = np.random.choice(len(dataset))
data = dataset[idx]

data

## Model

In [None]:
# Small instance of the model (one layer, d=64) used only to check the forward pass;
# it is switched to evaluation mode right after construction.
model = NLITransformer(
    name, nb_layers=1, d=64, num_classes=1, nb_features=len(FEATURES)
)
model = model.eval()

In [None]:
# Add a leading dimension of size 1 to both entries of the sample.
ids = torch.unsqueeze(data["ids"], 0)
fts = torch.unsqueeze(data["fts"], 0)

In [None]:
# Forward pass on the single sample as a sanity check.
# Gradients are not needed here, so autograd tracking is disabled to avoid building a graph.
with torch.no_grad():
    logits = model(ids, fts=fts)

## Training

In [None]:
# Per-backbone training batch size (both backbones share the same value).
BATCH_SIZES = dict.fromkeys(("xlm-roberta-base", "xlm-roberta-large"), 16)

# Per-backbone learning rate.
LRS = dict([
    ("xlm-roberta-base", 4e-5),
    ("xlm-roberta-large", 2e-5),
])

In [None]:
class Config:
    # Plain namespace of hyper-parameters: the class itself (not an instance) is handed
    # to save_config and k_fold.

    # General
    seed = 2222
    device = "cuda"

    # Splits
    k = 5
    random_state = 2222
    selected_folds = [0, 1, 2, 3, 4]
    folds_file = DATA_PATH + f"folds_0_{k}.csv"  # file name depends on k

    # Texts
    features = FEATURES
    max_len = 100
    lower = False  # TODO

    # Architecture
    name = "xlm-roberta-base"  # large ?
    pretrained_weights = None
    no_dropout = False
    nb_features = len(features)
    nb_layers = 1
    d = 768 if nb_layers != 1 else 384  # 384 for a single layer, 768 otherwise
    num_classes = 1

    # Training
    loss_config = dict(name="bce", activation="sigmoid")

    # Batch sizes are looked up from the backbone name; validation uses twice the train size.
    data_config = dict(
        batch_size=BATCH_SIZES[name],
        val_bs=2 * BATCH_SIZES[name],
        use_len_sampler=False,
        pad_token=int("roberta" in name),  # 1 for roberta-style names, 0 otherwise
    )

    # The same per-backbone learning rate is used for both "lr" and "lr_transfo".
    optimizer_config = dict(
        name="AdamW",
        lr=LRS[name],
        lr_transfo=LRS[name],
        lr_decay=1,
        warmup_prop=0.1,
        weight_decay=1,
        betas=(0.5, 0.99),
        max_grad_norm=1.0,
    )

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 10000

In [None]:
# DEBUG=True skips the log-folder creation in the training cell; log_folder stays None
# until that cell assigns a folder.
DEBUG, log_folder = False, None

In [None]:
# Outside of debug mode, create a fresh log folder and store the config, a copy of the
# data and the log file in it before training starts.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH + "nli/")
    print(f'Logging results to {log_folder}')

    save_config(Config, log_folder + 'config')
    df.to_csv(log_folder + 'data.csv', index=False)
    create_logger(directory=log_folder, name="logs.txt")

# Launch the k-fold training on the pairs; log_folder is None when DEBUG is set.
pred_oof = k_fold(Config, df_p, log_folder=log_folder)

Done!