In [1]:
# !python3 -m deeppavlov install squad_bert

### Imports and constants

In [3]:
import os
from torch import cuda, device

# --- Paths -----------------------------------------------------------------
# All locations are derived from BASE_DIR, which is given relative to the
# directory the notebook is run from.
BASE_DIR = "./../../../"
SOURCES_DIR = os.path.join(BASE_DIR, "nti_ai_pipeline")  # appended to sys.path in the next cell
DATA_DIR = os.path.join(BASE_DIR, "data/rucos")

TRAIN_FILE_PATH = os.path.join(DATA_DIR, "rucos_train.jsonl")
VAL_FILE_PATH = os.path.join(DATA_DIR, "rucos_val.jsonl")
TEST_FILE_PATH = os.path.join(DATA_DIR, "rucos_test.jsonl")

# --- Model / runtime settings ----------------------------------------------
TRANSFORMER_NAME = "DeepPavlov/rubert-base-cased"
BATCH_SIZE = 2
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = device("cuda") if cuda.is_available() else device("cpu")

# True  -> quick smoke test of the whole notebook (tiny splits, few steps, mock model).
# False -> run the real pipeline.
MOCK = True

In [4]:
import sys
import importlib
import torch

sys.path.append(SOURCES_DIR)

import nti_rucos
import pipeline
from torch.utils import data as torch_data
from nti_rucos import modeling as rucos_modeling
from pipeline.saving import local_saver



# Project modules that are re-executed on every run of this cell, so that edits
# to their source files are picked up without restarting the kernel.
# importlib.reload re-executes only the module object it is given; project
# submodules that are not listed here are not refreshed.
my_modules = [
    nti_rucos,
    pipeline,
    rucos_modeling,
    local_saver,
]

for module in my_modules:
    importlib.reload(module)

### Loading data

In [5]:
# Build one container from the labelled train and validation files.
container = nti_rucos.data.RucosContainer(TRAIN_FILE_PATH, VAL_FILE_PATH)

# Fraction of the container held out for validation; mock mode keeps only a
# tiny slice (presumably so that the evaluation pass stays fast).
test_size = 0.2 if not MOCK else 0.001
train_data, test_data = container.train_test_split(test_size=test_size, shuffle=True)

# The held-out part (`test_data`) is what the validation dataset is built from.
train_dataset = nti_rucos.data.RucosTrainDataset(train_data)
val_dataset = nti_rucos.data.RucosValDataset(test_data)

# NOTE(review): the train loader does not pass shuffle=True, so batches are
# served in dataset order on every pass — confirm this is intended.
train_loader = torch_data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE)
val_loader = torch_data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

### Pretraining model using MLM

In [6]:
# Create the MLM pretrainer used in the next cell; "./pretrains" (relative to
# the notebook's working directory) is passed as its checkpoints directory.
pretrainer = nti_rucos.pretraining.MLMBertPretrainer(checkpoints_dir="./pretrains")

In [None]:
# Evaluation / checkpoint cadence passed to the pretrainer: very frequent in
# mock mode, sparse for a real run.
if MOCK:
    eval_every = save_every = 10
else:
    eval_every = 20 * 1000
    save_every = 2 * 1000

# NOTE(review): the returned path is not used by any later cell in this
# notebook — the classifier below is created from TRANSFORMER_NAME. Confirm
# whether the pretrained weights are supposed to be loaded afterwards.
path_to_best_pretrain_weights = pretrainer.pretrain(
    mname=TRANSFORMER_NAME,
    data_container=container,
    batch_size=BATCH_SIZE,
    eval_every=eval_every,
    save_every=save_every,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




Some weights of BertForMaskedLM were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


### Defining model and trainer

In [5]:
# Input processor built for the same transformer name as the model.
proc = rucos_modeling.RucosProcessor(TRANSFORMER_NAME, use_ner=False)

if not MOCK:
    model = rucos_modeling.SentPairBinaryClassifier(
        TRANSFORMER_NAME,
        droprate=0.2,
        use_ner=False,
    )
else:
    # SentPairBinaryClassifier is not used in mock mode because the author's
    # laptop (12G of RAM) cannot handle it.
    model = rucos_modeling.mock_model.MockModel(use_ner=False)

# Trained models are saved under ./saved_models.
saver = local_saver.LocalSaver(save_dir='./saved_models')

# The manager receives the model, its processor, the target device and the saver.
manager = pipeline.modeling.ModelManager(model, proc, DEVICE, saver)

In [6]:
# Validator for the RuCoS task; it is handed to pipeline.training.Trainer in the next cell.
validator = nti_rucos.evaluating.RucosValidator()
# Optional manual check (disabled): evaluate the current manager on the validation loader.
# eval_val = validator.eval(manager, val_loader)

In [7]:
# Optimisation settings for fine-tuning.
# NOTE (kept from the original author): "had set to 0 from 1e-2". The note
# sat at the end of this call and does not match the current weight_decay of
# 0.04 — TODO confirm which setting it refers to.
weights_updater = pipeline.WeightsUpdater(
    lr=7e-6,
    warmup=200,
    lr_end=1.7e-6,
    accum_iters=6,
    weight_decay=0.04,
    optimizer_class=torch.optim.AdamW,
)

# The trainer is given the validator and the weights updater.
trainer = pipeline.training.Trainer(validator, weights_updater)

### Running fit

In [8]:
import gc
import torch

# Free memory left over from earlier runs before training starts.
# (Uncomment the next line to also drop the current model and manager.)
# del model, manager
gc.collect()
torch.cuda.empty_cache()

# Step budget as (steps between evaluations, last training step): a short run
# in mock mode, the full schedule otherwise.
steps_betw_evals, max_step = (50, 90) if MOCK else (5000, 50001)

trainer.fit(
    train_loader,
    val_loader,
    manager,
    max_step=max_step,
    steps_betw_evals=steps_betw_evals,
    stop_patience=3,
)

HBox(children=(FloatProgress(value=0.0, max=344154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='eval', max=328.0, style=ProgressStyle(description_width='…


_eval. Eval_value: 0.12328767031431198
Saved model. Eval value: 0.12328767031431198 Name: 30277002-4edc-4201-b6b6-fc5e86464376


### Pseudo labeling

In [9]:
# Pseudo labeler configured with the proportion of samples to choose and the
# positive-to-negative proportion (exact semantics are defined in
# nti_rucos.sources.pseudo_labeling).
pseudo_labeler = nti_rucos.sources.pseudo_labeling.RegressionPseudoLabeler(
    chosen_proportion=0.15,
    pos_to_neg_proportion=0.5,
)

In [10]:
# Load the unlabelled test file. The path comes from the config cell
# (TEST_FILE_PATH) instead of being rebuilt here.
test_container = nti_rucos.data.RucosContainer(
    path=TEST_FILE_PATH,
    has_labels=False,
    query_placeholder_union_mode="concatenate",
)
test_dataset = nti_rucos.data.RucosSubmDataset(test_container.get_data(), switch_texts=True)

if MOCK:
    # Keep at most the first 1000 samples in mock mode. The cap is clamped to
    # the dataset length so a smaller test set does not produce out-of-range
    # indices when the loader is iterated.
    mock_size = min(1000, len(test_dataset))
    test_dataset = torch_data.Subset(test_dataset, list(range(mock_size)))

# Same batch size as the train/validation loaders (BATCH_SIZE from the config cell).
test_loader = torch_data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [11]:
# Run the pseudo labeler with the current model manager over the test loader.
# The result is wrapped into a RawSamplesDataset in the next cell.
pseudo_label_samples = pseudo_labeler.run(manager, test_loader)

In [12]:
# Wrap the pseudo-labelled samples in a dataset and append it to the original
# training dataset.
pseudo_label_dataset = nti_rucos.data.RawSamplesDataset(pseudo_label_samples, switch_texts=True)

# NOTE(review): `unioned_train_dataset` is not consumed by any later cell in
# this notebook; build a DataLoader from it to retrain on the pseudo labels.
unioned_train_dataset = torch_data.ConcatDataset(
    [train_dataset, pseudo_label_dataset]
)

#### At this point the model has produced pseudo labels for the test set; they can be used to retrain the model if needed

### Submitting

In [13]:
# Alternatives for choosing the manager to submit with (disabled):
# best_manager = trainer.load_best_manager()
# best_manager = managing_model.ModelManager.load(saver, '9ab8822d-1d50-4d8d-904b-2e7a70dca78f')
best_manager = manager

submitter = nti_rucos.rucos_submitting.RucosSubmitter(subm_dir=os.path.join(BASE_DIR, 'submissions'))

# In mock mode the predictions come from the mock model on a truncated test
# set, so they are written under a separate file name. This keeps a smoke-test
# run from overwriting the real submission file.
subm_file_name = "subm_mock.jsonl" if MOCK else "subm_single_bert_50000_steps.jsonl"

# test_loader is the loader built in the "Pseudo labeling" section.
submitter.create_submission(
    best_manager,
    test_loader,
    subm_file_name=subm_file_name,
)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))


Some preds [SubmPred(text_id=tensor(0), probs=-0.4128408041101899, start=tensor(54), end=tensor(62), placeholder='Приштины'), SubmPred(text_id=tensor(0), probs=0.369328829334397, start=tensor(99), end=tensor(105), placeholder='Косово'), SubmPred(text_id=tensor(0), probs=-0.07764485496361115, start=tensor(332), end=tensor(353), placeholder='Югославской Федерации'), SubmPred(text_id=tensor(0), probs=0.05128829235378058, start=tensor(412), end=tensor(424), placeholder='Косовом поле'), SubmPred(text_id=tensor(0), probs=0.44465991394674786, start=tensor(479), end=tensor(497), placeholder='Слободан Милошевич'), SubmPred(text_id=tensor(0), probs=0.21535269173066704, start=tensor(706), end=tensor(713), placeholder='Белград'), SubmPred(text_id=tensor(0), probs=-0.525530045817756, start=tensor(723), end=tensor(725), placeholder='ЕС'), SubmPred(text_id=tensor(0), probs=-0.7779352258294525, start=tensor(816), end=tensor(822), placeholder='Сербии'), SubmPred(text_id=tensor(0), probs=0.230155216215