**NOTE:** This notebook was downloaded from Kaggle and is intended to be run as a Kaggle Kernel (it relies on Kaggle input paths and Kaggle secrets).

# 📦 Packages and Basic Setup

In [1]:
%%capture
# -------- Basic Packages -------- #
import os
import gc
import sys
gc.enable()
import math
import time
import torch
import numpy as np
import pandas as pd
from sklearn import model_selection
!pip install --upgrade -q transformers tokenizers

# -------- Output Prettification ✨ -------- #
import warnings
# Silence Python UserWarnings before transformers is imported so that
# import-time warnings are hidden as well.
warnings.filterwarnings("ignore", category=UserWarning)
from transformers import logging
# Only surface errors from transformers. A preceding
# `logging.set_verbosity_warning()` call was removed: it was immediately
# overridden by this call and therefore had no effect.
logging.set_verbosity_error()

# -------- Custom Library -------- #
# Add the dataset directory to the module search path so that the later
# `from coffee... import ...` statements in this notebook can resolve.
wrapperdir = "../input/d/sauravmaheshkar/coffee"
sys.path.append(wrapperdir)

# -------- Weights and Biases Setup -------- #
import wandb
from kaggle_secrets import UserSecretsClient
# Fetch the W&B API key from the Kaggle secrets client rather than
# hard-coding it in the notebook.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("WANDB_API_KEY")
# The trailing `;` keeps the call's return value out of the cell output.
wandb.login(key=api_key);

2021-10-17 12:46:53.525584: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# 📃 Configuration

In [2]:
# Single source of truth for the run: passed to the coffee helpers as `args`
# and logged to Weights & Biases as the run config.
# NOTE(review): `folds` is 4 while the data cell builds 5 stratified splits,
# so only the first four folds are trained — confirm this is intentional.
CONFIG = {
    # --- Model ---
    "model_type": 'rembert',
    "model_name_or_path": "google/rembert",
    "config_name": "google/rembert",
    "output_head_dropout_prob": 0.0,
    "gradient_accumulation_steps": 2,
    # --- Tokenizer ---
    "tokenizer_name": "google/rembert",
    "max_seq_length": 384,
    "doc_stride": 128,
    # --- Training ---
    "epochs": 1,
    "folds": 4,
    "train_batch_size": 2,
    "eval_batch_size": 8,
    # --- Optimizer ---
    "optimizer_type": 'AdamW',
    "learning_rate": 1.5e-5,
    "weight_decay": 1e-2,
    "epsilon": 1e-8,
    "max_grad_norm": 1.0,
    # --- Scheduler ---
    "decay_name": 'cosine-warmup',
    "warmup_ratio": 0.1,
    "logging_steps": 100,
    # --- Misc ---
    "output_dir": 'output',
    "seed": 21,
    # --- W&B ---
    "competition": 'chaii',
    "_wandb_kernel": 'sauravm',
}

# 💿 Dataset

In [3]:
# External question-answering data (MLQA Hindi file and XQuAD file), stacked
# into one frame that is later appended to the official training set.
external_mlqa = pd.read_csv('../input/d/sauravmaheshkar/coffee/data/external_data/mlqa_hindi.csv')
external_xquad = pd.read_csv('../input/d/sauravmaheshkar/coffee/data/external_data/xquad.csv')
external_train = pd.concat(objs=[external_mlqa, external_xquad])

# Official competition train / test splits.
test = pd.read_csv('../input/d/sauravmaheshkar/coffee/data/official_data/test.csv')
train = pd.read_csv('../input/d/sauravmaheshkar/coffee/data/official_data/train.csv')

def create_folds(data, num_splits):
    """Assign every row of ``data`` to a validation fold, stratified by language.

    A ``kfold`` column is added to ``data`` in place (and the same frame is
    returned) holding, for each row, the index of the fold in which that row
    is part of the validation split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``language`` column used as the stratification target.
    num_splits : int
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying the ``kfold`` column.
    """
    data["kfold"] = -1
    kfold_col = data.columns.get_loc("kfold")
    splitter = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
    for fold_id, (_, valid_positions) in enumerate(splitter.split(X=data, y=data['language'])):
        # `split` yields *positional* indices, so they must be written with
        # `.iloc`. Label-based `.loc` only gives the same result when the
        # frame has a default RangeIndex and would otherwise touch the wrong
        # rows (or add new ones).
        data.iloc[valid_positions, kfold_col] = fold_id
    return data

# Only the official data is split into stratified folds.
train = create_folds(train, num_splits=5)

# External rows are tagged with fold -1 (presumably so they are never picked
# as a validation fold — confirm in `make_loader`) and given sequential
# integer ids starting at 1.
external_train["kfold"] = -1
external_train['id'] = np.arange(1, len(external_train) + 1).tolist()

# Stack official + external rows under a fresh 0..n-1 index.
train = pd.concat([train, external_train], ignore_index=True)

def convert_answers(row):
    """Pack an (answer_start, answer_text) pair into an answers dict.

    Parameters
    ----------
    row : pandas.Series or sequence
        Two-element row whose first item is the answer start offset and whose
        second item is the answer text.

    Returns
    -------
    dict
        ``{'answer_start': [<first item>], 'text': [<second item>]}``.
    """
    # Integer keys on a label-indexed Series are a deprecated positional
    # fallback in pandas, so index positionally via `.iloc` when it exists.
    # Plain tuples/lists keep working through ordinary subscripting.
    values = row.iloc if hasattr(row, "iloc") else row
    return {'answer_start': [values[0]], 'text': [values[1]]}

# Add an `answers` column holding, per row, a dict in the
# `{'answer_start': [...], 'text': [...]}` layout built by `convert_answers`.
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

# ⚙️ Helper Function

In [4]:
%%capture
from coffee.helpers import make_model, make_loader, make_optimizer, make_scheduler
from coffee.utils import set_seed

def init_training(args, data, fold):
    """Build every object needed to train a single fold.

    All settings are read from ``args`` (previously several of them were read
    from the global ``CONFIG`` even though ``args`` was passed in), so the
    model, optimizer and scheduler are always configured from the same source.

    Parameters
    ----------
    args : dict
        Run configuration. This function reads the keys ``seed``,
        ``output_dir``, ``gradient_accumulation_steps``, ``epochs`` and
        ``warmup_ratio``; the whole mapping is also forwarded to the
        ``coffee`` helper factories.
    data : pandas.DataFrame
        Training frame forwarded to ``make_loader``.
    fold : int
        Fold identifier forwarded to ``make_loader``.

    Returns
    -------
    tuple
        ``(model, model_config, tokenizer, optimizer, scheduler,
        train_dataloader, valid_dataloader, result_dict)``, where
        ``result_dict`` is a fresh metrics container with an infinite
        ``best_val_loss``.

    Raises
    ------
    ValueError
        If no CUDA device is visible.
    """
    set_seed(args["seed"])

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args["output_dir"], exist_ok=True)

    # Fail fast: check for a GPU before the (potentially expensive) model
    # construction instead of after it.
    gpu_count = torch.cuda.device_count()
    if gpu_count < 1:
        raise ValueError('CPU training is not supported')

    # model
    model_config, tokenizer, model = make_model(args)
    print('Model pushed to {} GPU(s), type {}.'.format(
        gpu_count,
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    print("✅ Model Initialized")

    # data loaders
    train_dataloader, valid_dataloader = make_loader(args, data, tokenizer, fold)

    print("✅ DataLoaders Initialized")

    # optimizer
    optimizer = make_optimizer(args, model, strategy = 'a')

    print("✅ Optimizer Initialized")

    # scheduler
    # One optimizer step happens every `gradient_accumulation_steps` batches.
    steps_per_epoch = math.ceil(len(train_dataloader) / args["gradient_accumulation_steps"])
    num_training_steps = steps_per_epoch * args["epochs"]
    warmup_ratio = args["warmup_ratio"]
    num_warmup_steps = int(warmup_ratio * num_training_steps) if warmup_ratio > 0 else 0
    print(f"Total Training Steps: {num_training_steps}, Total Warmup Steps: {num_warmup_steps}")

    scheduler = make_scheduler(args, optimizer, num_warmup_steps, num_training_steps)

    print("✅ Scheduler Initialized")

    result_dict = {
        'epoch':[],
        'train_loss': [],
        'val_loss' : [],
        'best_val_loss': np.inf
    }

    return (
        model, model_config, tokenizer, optimizer, scheduler,
        train_dataloader, valid_dataloader, result_dict
    )

# 🔥 Training

In [5]:
from coffee.engine import Trainer, Evaluator

def run(data, fold):
    """Train and validate one fold, checkpointing the best epoch.

    For every epoch the model is trained and then evaluated; whenever the
    validation loss improves on the best value seen so far, the model weights,
    model config and tokenizer are written to
    ``<output_dir>/checkpoint-fold-<fold>``. After the last epoch the metrics
    are handed to ``evaluator.save`` and the GPU objects are released.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame forwarded to ``init_training``.
    fold : int
        Fold to run; also used to name the checkpoint directory.

    Returns
    -------
    None
    """
    args = CONFIG

    # Resolved up front so it is defined even when the epoch loop does not
    # execute (previously it was only bound inside the loop).
    output_dir = os.path.join(args["output_dir"], f"checkpoint-fold-{fold}")

    # The context manager closes the W&B run even if training raises, so a
    # crashed fold does not leave a dangling run behind. The handle is not
    # bound to a name: the original local `run` shadowed this function.
    with wandb.init(project='chaii',
                    entity='sauravmaheshkar',
                    group='stride&seqlen',
                    job_type='train',
                    config=args):

        model, model_config, tokenizer, optimizer, scheduler, train_dataloader, \
            valid_dataloader, result_dict = init_training(args, data, fold)

        wandb.watch(model)

        trainer = Trainer(model, tokenizer, optimizer, scheduler)
        evaluator = Evaluator(model)

        train_time_list = []
        valid_time_list = []

        for epoch in range(args["epochs"]):
            result_dict['epoch'].append(epoch)

            # Train (synchronize so wall-clock timings include queued GPU work)
            torch.cuda.synchronize()
            tic1 = time.time()
            result_dict = trainer.train(
                args, train_dataloader,
                epoch, result_dict
            )
            torch.cuda.synchronize()
            tic2 = time.time()
            train_time_list.append(tic2 - tic1)

            # Evaluate
            torch.cuda.synchronize()
            tic3 = time.time()
            result_dict = evaluator.evaluate(
                valid_dataloader, epoch, result_dict
            )
            torch.cuda.synchronize()
            tic4 = time.time()
            valid_time_list.append(tic4 - tic3)

            if result_dict['val_loss'][-1] < result_dict['best_val_loss']:
                print("{} Epoch, Best epoch was updated! Valid Loss: {: >4.5f}".format(epoch, result_dict['val_loss'][-1]))
                result_dict["best_val_loss"] = result_dict['val_loss'][-1]

                os.makedirs(output_dir, exist_ok=True)
                torch.save(model.state_dict(), f"{output_dir}/pytorch_model.bin")
                model_config.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                print(f"Saving model checkpoint to {output_dir}.")

            print()

        # The directory only exists if a checkpoint was written above; make
        # sure it is there before the metrics are saved into it.
        os.makedirs(output_dir, exist_ok=True)
        evaluator.save(result_dict, output_dir)

        print(f"Total Training Time: {np.sum(train_time_list)}secs, Average Training Time per Epoch: {np.mean(train_time_list)}secs.")
        print(f"Total Validation Time: {np.sum(valid_time_list)}secs, Average Validation Time per Epoch: {np.mean(valid_time_list)}secs.")

        # Drop every reference to GPU-backed objects first, then collect and
        # empty the CUDA cache. Emptying the cache before the objects are
        # released (as was done previously) cannot free their memory.
        del trainer, evaluator
        del model, model_config, tokenizer
        del optimizer, scheduler
        del train_dataloader, valid_dataloader, result_dict
        gc.collect()
        torch.cuda.empty_cache()

In [6]:
# Train the configured number of folds one after another, printing a banner
# (two blank lines, rule, fold number, rule) before each.
for fold in range(CONFIG["folds"]):
    print("\n")
    print('-'*50, f'FOLD: {fold}', '-'*50, sep="\n")
    run(train, fold)

[34m[1mwandb[0m: Currently logged in as: [33msauravmaheshkar[0m (use `wandb login --relogin` to force relogin)




--------------------------------------------------
FOLD: 0
--------------------------------------------------


[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-17 12:47:01.509449: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Downloading:   0%|          | 0.00/686 [00:00<?, ?B/s]


CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Downloading:   0%|          | 0.00/263 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
✅ Model Initialized
Num examples Train= 20511, Num examples Valid=2934
✅ DataLoaders Initialized
✅ Optimizer Initialized
Total Training Steps: 5128, Total Warmup Steps: 512
✅ Scheduler Initialized
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training Loss,0.00034
_runtime,5851.0
_timestamp,1634480670.0
_step,10622.0
Validation Loss,3.26942


0,1
Training Loss,▃▄▃▃▁▁▃▂▇▂▁▁▄▁▁▁▁▃█▁▄▁▁▁▅▁▁▁▁▁▁▇▁▁▁▂▂▁▁▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Validation Loss,▁█▁▁▁▅▁▁▁▂▁▁▁▁▁▁▄▁▁▁▁▅▁▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁




--------------------------------------------------
FOLD: 1
--------------------------------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-17 14:24:51.904598: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
✅ Model Initialized
Num examples Train= 20422, Num examples Valid=3023
✅ DataLoaders Initialized
✅ Optimizer Initialized
Total Training Steps: 5106, Total Warmup Steps: 510
✅ Scheduler Initialized
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training Loss,0.00031
_runtime,5751.0
_timestamp,1634486441.0
_step,10588.0
Validation Loss,0.00096


0,1
Training Loss,▃▁▁▅▄▅▁▁▂▁▄▁▁▅▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▂▁▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Validation Loss,▁▁▁▁▁▁▁▁▁▆▄▁▂▁▁▂▇█▁▁▁▁▂▁▁▃▁▁▁▁▁▄▁▂▅▁▁▁▁▁




--------------------------------------------------
FOLD: 2
--------------------------------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-17 16:01:04.787159: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
✅ Model Initialized
Num examples Train= 20536, Num examples Valid=2909
✅ DataLoaders Initialized
✅ Optimizer Initialized
Total Training Steps: 5134, Total Warmup Steps: 513
✅ Scheduler Initialized
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training Loss,0.21565
_runtime,5788.0
_timestamp,1634492250.0
_step,10631.0
Validation Loss,4.26682


0,1
Training Loss,▃▄▅▆▆▄▁▁▂▅▁▁▃▁▂▁▁▇▁█▁▅▁▁▃▅▁▂▁▂▄▁▁▁▁▃▁▁▂▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Validation Loss,█▁▁▂▁▁▁▁▄▁▁▁▂▃▂▁▂▁▁▁▁▂▁▁▂▂▂▁▂▁▂▁▁▅▂▄▁▁▄▁




--------------------------------------------------
FOLD: 3
--------------------------------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-17 17:37:54.019761: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
✅ Model Initialized
Num examples Train= 20096, Num examples Valid=3349
✅ DataLoaders Initialized
✅ Optimizer Initialized
Total Training Steps: 5024, Total Warmup Steps: 502
✅ Scheduler Initialized
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training Loss,0.1007
_runtime,5691.0
_timestamp,1634497962.0
_step,10466.0
Validation Loss,0.00074


0,1
Training Loss,▂▄▃█▁▁▁▁▁▂▂▁▅▅▂▂▃▃▁▁▃▁▄▃▆▁▂▁▂▁▁▁▁▁▁▁▄▂▄▃
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Validation Loss,▁█▂▃▁▁▆▁▁▁▁▂▁▁▃▃▁▁▁▁▁▁▁▁▂▁▁▂▁▁▃▁▁▁▄▁▄▁▁▁
