#### Code to train models

- Missing values imputation
- Avg for embed instead of cls
- Keep nan info as token
- Arcface
- Train on address only

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ../src

/home/theo/kaggle/foursquare/src


## Imports

In [3]:
import os
import ast
import glob
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm

In [21]:
from params import *

from data.preparation import prepare_train_data, prepare_triplet_data
from data.dataset import TripletDataset
from data.tokenization import get_tokenizer

from model_zoo.models import SingleTransformer

from training.main import k_fold

from utils.logger import prepare_log_folder, save_config, create_logger

## Data

In [6]:
# Build `df` with the project's prepare_train_data helper, reading from DATA_PATH.
df = prepare_train_data(root=DATA_PATH)

In [7]:
# Build the triplet table from DATA_PATH; per the preview output its columns are
# id, paired_ids, matches, pos_ids, neg_ids and fp_ids.
triplets = prepare_triplet_data(root=DATA_PATH)

In [8]:
triplets.head()

Unnamed: 0,id,paired_ids,matches,pos_ids,neg_ids,fp_ids
0,E_000001272c6c5d,[E_da7fa3963561f8],[True],[E_da7fa3963561f8],[],"[E_c0c118c4e4a8d9, E_1870c55e5a0e0a, E_e36d2b2..."
1,E_000002eae2a589,[E_e80db432029aea],[True],[E_e80db432029aea],[],"[E_1a166b20ee88e8, E_8daa04281f771c, E_f782105..."
2,E_00001d92066153,[E_7e0d8e9138dd56],[True],[E_7e0d8e9138dd56],[],"[E_8855571b3371ac, E_3ff62765233a20, E_17df10c..."
3,E_000023d8f4be44,[E_12453effe251db],[True],[E_12453effe251db],[],"[E_f796550f3d3772, E_d9ae97c75cbf3c, E_07ac928..."
4,E_0000764d65557e,[E_0507e10f6eb526],[True],[E_0507e10f6eb526],[],"[E_79fdd25823bbe7, E_7a484204a0d68e, E_6dcba24..."


### Dataset

In [9]:
# Backbone used for the dataset / model smoke tests in this notebook.
# Note that Config (Training section) sets its own `name` independently of this one.
name = "xlm-roberta-base"

tokenizer = get_tokenizer(name)

In [10]:
# Build the triplet dataset from the dataframe, the triplet table and the tokenizer.
dataset = TripletDataset(df, triplets, tokenizer)

In [11]:
# lens = []
# for idx in tqdm(range(10000)):
#     data = dataset[idx]
#     lens += [len(data['ref_ids']), len(data['pos_ids']), len(data['neg_ids'])]

# sns.distplot(lens)

In [12]:
# Draw one random sample from the dataset for manual inspection.
idx = np.random.choice(len(dataset))
data = dataset[idx]

# Showing the sample's "df" entry is best-effort: a failure must not stop the
# notebook, but it is reported instead of being silently discarded. Catching
# Exception (rather than a bare `except:`) lets KeyboardInterrupt / SystemExit
# propagate so the cell can still be interrupted.
try:
    display(data['df'])
except Exception as err:
    print(f"Could not display sample {idx}: {err!r}")

## Model

In [13]:
# Small instance used only to smoke-test the forward pass in the cells below.
model = SingleTransformer(name, nb_layers=1, embed_dim=128, nb_features=0)
# Rebind to the return value of .eval(), exactly as the chained call would.
model = model.eval()

In [14]:
# Smoke-test inputs: add a leading dimension to the positive sample's token ids.
ids = data["pos_ids"].unsqueeze(0)
# NOTE(review): this takes column 0 of the "fts" entry - confirm that column belongs
# to the same (positive) sample as `ids`, and that passing fts is meaningful given
# the model above was built with nb_features=0.
fts = data["fts"][:, 0]

In [15]:
# Forward pass on the single example; the model call returns two values.
representation, projection = model(ids, fts=fts)

In [16]:
representation.size(), projection.size()

(torch.Size([1, 128]), torch.Size([1, 128]))

## Training

In [17]:
# One row per supported backbone: (model name, batch size, learning rate).
# Both lookup tables below are derived from it, so the key sets cannot drift apart.
_BACKBONE_DEFAULTS = (
    ("xlm-roberta-base", 8, 2e-5),
    ("xlm-roberta-large", 8, 1e-5),
    ("roberta-large", 8, 1e-5),
)

# Batch size per backbone name.
BATCH_SIZES = {backbone: bs for backbone, bs, _lr in _BACKBONE_DEFAULTS}

# Learning rate per backbone name.
LRS = {backbone: lr for backbone, _bs, lr in _BACKBONE_DEFAULTS}

In [18]:
class Config:
    # Hyper-parameter container for one training run. Everything is a plain class
    # attribute; the class object itself is passed to save_config and k_fold.
    # DATA_PATH, BATCH_SIZES and LRS are read while this class body executes, so
    # they must already be defined when the cell runs.

    # General
    seed = 2222
    device = "cuda"

    # Splits
    k = 2  # presumably the number of folds - confirm in training.main.k_fold
    random_state = 2222
    selected_folds = [0, 1]
    # Plain string concatenation: DATA_PATH must already end with a path separator.
    folds_file = DATA_PATH + "folds_2.csv"

    # Architecture
    # `name` is also the key used below to look up BATCH_SIZES and LRS.
    name = "xlm-roberta-large"  # large ?
    pretrained_weights = None
    no_dropout = False
    nb_features = 0  # 2 for longitude, latitude
    nb_layers = 1
    embed_dim = 256

    # Texts
    max_len = 100
    use_name = True
    use_address = True
    use_url = False
    lower = False  # TODO

    # Training
    # NOTE(review): margin / p look like triplet-margin-loss parameters - confirm
    # against the loss built in the training code.
    loss_config = {
        "margin": 1.,
        "p": 2.,
    }

    data_config = {
        "batch_size": BATCH_SIZES[name],
        # Validation uses twice the training batch size.
        "val_bs": BATCH_SIZES[name] * 2,
        "use_len_sampler": False,
        # Padding id is 1 for any model whose name contains "roberta", else 0.
        "pad_token": 1 if "roberta" in name else 0,
    }

    optimizer_config = {
        "name": "AdamW",
        # Both learning-rate entries are read from the same LRS value.
        "lr": LRS[name],
        "lr_transfo": LRS[name],
        "lr_decay": 1,
        "warmup_prop": 0.1,
        # NOTE(review): weight_decay=1 and betas=(0.5, 0.99) are far from the usual
        # AdamW settings - confirm they are intentional.
        "weight_decay": 1,
        "betas": (0.5, 0.99),
        "max_grad_norm": 1.,
    }

    gradient_checkpointing = False
    acc_steps = 1
    epochs = 1

    use_fp16 = True

    verbose = 1
    verbose_eval = 1000

In [19]:
# When DEBUG is False, the training cell creates a log folder and saves the config
# and data there; when True, log_folder stays None and is passed to k_fold as-is.
DEBUG = False
log_folder = None

In [None]:
# Outside debug mode: create a fresh log folder, then snapshot the config and the
# dataframe into it and attach the file logger before training starts.
if not DEBUG:
    log_folder = prepare_log_folder(LOG_PATH)
    print(f'Logging results to {log_folder}')
    save_config(Config, f"{log_folder}config")
    df.to_csv(f"{log_folder}data.csv", index=False)
    create_logger(directory=log_folder, name="logs.txt")

# Launch training; log_folder is None when DEBUG is set.
k_fold(Config, df, triplets, log_folder=log_folder)

Logging results to ../logs/2022-06-24/0/

-------------   Fold 1 / 2  -------------

    -> 356407 training triplets
    -> 35730 validation triplets
    -> 560218624 trainable parameters



Done ! 