### Imports and constants

In [1]:
import os
from torch import cuda, device

# --- Filesystem layout (relative to the notebook's working directory) ---
BASE_DIR = "./../"
SOURCES_DIR = os.path.join(BASE_DIR, "nti_ai_pipeline")  # appended to sys.path in the next cell
DATA_DIR = os.path.join(BASE_DIR, "data/ru_squad")
TRAIN_FILE_PATH = os.path.join(DATA_DIR, "sbersquad_train.json")
TEST_FILE_PATH = os.path.join(DATA_DIR, "sbersquad_test.json")
CACHE_DIR = "./cache_models/"  # handed to the model constructor as cache_dir

# --- Model / training hyper-parameters ---
TRANSFORMER_NAME = "DeepPavlov/rubert-base-cased"
DROPRATE = 0.7
MAX_ANSWER_LEN_IN_TOKENS = 27
BATCH_SIZE = 2

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    DEVICE = device("cuda")
else:
    DEVICE = device("cpu")

# Smoke-test switch. True: shrink the validation split and the test subset so
# the notebook can be checked end to end. False: run the real pipeline.
MOCK = True

In [2]:
import sys
import importlib
import torch

# Make the project sources importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
if SOURCES_DIR not in sys.path:
    sys.path.append(SOURCES_DIR)

import nti_squad
import pipeline
from torch.utils import data as torch_data
from nti_squad import data as squad_data
from nti_squad import modeling as squad_modeling
from pipeline.saving import local_saver


# Project modules that get reloaded on every run of this cell, so that edits
# to their source are picked up without restarting the kernel.
# NOTE(review): the packages are reloaded before their submodules; names a
# package re-exports from a submodule may therefore stay stale — TODO confirm
# whether the order should be reversed.
my_modules = [
    nti_squad,
    pipeline,
    squad_modeling,
    squad_data,
    local_saver,
]

for module in my_modules:
    importlib.reload(module)

In [3]:
import gc, torch

def free_memory() -> None:
    """Run the garbage collector, then empty torch's CUDA memory cache.

    Takes no arguments and returns nothing.
    """
    # Collect first so unreachable objects are dropped before the CUDA cache
    # is emptied.
    gc.collect()
    torch.cuda.empty_cache()

### Load data

In [4]:
# Read the training file once; the validation split is carved out of it below.
train_val_container = squad_data.QADataContainer(TRAIN_FILE_PATH)

# In MOCK mode only a tiny fraction is held out; otherwise 20 % is held out.
if MOCK:
    test_size = 0.001
else:
    test_size = 0.2

train_data, test_data = train_val_container.train_test_split(
    test_size=test_size,
    shuffle=True,
)

# Training part of the split.
train_dataset = squad_data.SQuADDataset(train_data)
train_loader = torch_data.DataLoader(train_dataset, batch_size=BATCH_SIZE)

# `test_data` is the held-out part of the training file; it is used for validation.
val_dataset = squad_data.SQuADDataset(test_data)
val_loader = torch_data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

### Defining model and trainer

In [5]:
# The model and the data processor are both constructed from TRANSFORMER_NAME.
model = squad_modeling.TransformerQA(
    TRANSFORMER_NAME,
    droprate=DROPRATE,
    cache_dir=CACHE_DIR,
)
proc = squad_modeling.QADataProcessor(
    TRANSFORMER_NAME,
    max_answer_token_len=MAX_ANSWER_LEN_IN_TOKENS,
)
saver = local_saver.LocalSaver(save_dir="./saved_models")

# The manager bundles model, processor, target device and saver; it is the
# object later handed to trainer.fit and pseudo_labeler.run.
manager = pipeline.modeling.ModelManager(
    model,
    proc,
    saver=saver,
    device=DEVICE,
)

In [6]:
validator = nti_squad.squad_evaluating.SQuADValidator()

# NOTE (kept from the original cell): "had set to 0 from 1e-2". The argument
# this remark refers to is not visible in the call below — TODO confirm.
weights_updater = nti_squad.QAWeightsUpdater(
    lr=5e-5,
    lr_end=8e-6,
    warmup=400,
    accum_iters=5,
    optimizer_class=torch.optim.AdamW,
)

# The trainer is built from the validator and the weights updater.
trainer = pipeline.training.Trainer(validator, weights_updater)

### Fit

In [7]:
# Run training on the train/validation loaders with the configured manager.
# NOTE(review): steps_betw_evals and stop_patience presumably set the
# evaluation interval and the early-stopping patience — TODO confirm in
# pipeline.training.Trainer.
trainer.fit(
    train_loader,
    val_loader,
    manager,
    max_epoch=1,
    steps_betw_evals=1100,
    stop_patience=2,
)

HBox(children=(FloatProgress(value=0.0, max=5343.0), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/gldsn/.local/share/virtualenvs/nti-ai-pipeline-LA9ij60T/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-c5bba239d1c0>", line 1, in <module>
    trainer.fit(train_loader, val_loader, manager, max_epoch=1, steps_betw_evals=1100, stop_patience=2)
  File "./../nti_ai_pipeline/pipeline/training/train.py", line 40, in fit
    loss_val = self.weights_updater.fit_with_batch(model_manager, batch)
  File "./../nti_ai_pipeline/pipeline/training/weights_updater.py", line 55, in fit_with_batch
    loss = self._calc_loss(manager, inputs, labels)
  File "./../nti_ai_pipeline/pipeline/training/weights_updater.py", line 66, in _calc_loss
    preds, labels = manager.preproc_forward(inputs, labels)
  File "./../nti_ai_pipeline/pipeline/modeling/managing_model.py", line 46, in preproc_forward
    preds_raw = self.model(out.features)
  File "/

TypeError: object of type 'NoneType' has no len()

In [8]:
# Run garbage collection and empty the CUDA cache after the training cell.
free_memory()




### Pseudo-labeling: merging the original training samples with pseudo-labeled ones

In [9]:
# Display the first training sample to inspect the record structure
# (the output begins with a text passage followed by a list of (question, dict) pairs).
train_data[0]

('Лицензия оффшорного банка ( OBU ) позволяет осуществлять все виды банковской деятельности, но только в иностранной валюте и с нерезидентами (за некоторыми исключениями). Первая лицензия офшорному банку в Бахрейне была выдана в 1975 году. Первыми оффшорными банками в Бахрейне стали отделения Ситибанка и Альгемайне Банка. Количество оффшорных банков к 1984 г. возросло до 76, но затем снизилось до 48 к 2002 г. вследствие развития консолидации внутри банковских групп и между ними. К 2005 г. более 80 % суммарных активов банковского сектора Бахрейна приходилось на офшорные банковские структуры (offshore banking units — OBU). В офшорном режиме работают многие банки Бахрейна, а также отделения ближневосточных банков из Турции, Саудовской Аравии, ОАЭ и других стран, а также подразделения западных банковских групп (Citigroup, HSBC, J.P.Morgan Chase, Standard Chartered, BNP Paribas, Bank of Tokyo-Mitsubishi).',
 [('К какому году количество оффшорных банков снизилось до 48?',
   {'id': 'eac59768

In [10]:
# Build the loader over the test file; it is what the pseudo-labeler predicts on.
test_container = nti_squad.data.QADataContainer(path=TEST_FILE_PATH)
test_dataset = squad_data.subm_dataset.SubmDataset(test_container.get_data())
if MOCK:
    # Smoke-test mode: keep at most the first 1000 samples. The size is capped
    # at the dataset length because fixed indices 0..999 would raise an
    # IndexError during iteration when the test set is smaller than that.
    mock_size = min(1000, len(test_dataset))
    test_dataset = torch_data.Subset(test_dataset, list(range(mock_size)))

# Use the shared BATCH_SIZE constant (same value as the former literal 2) so
# all loaders in the notebook are configured from one place.
test_loader = torch_data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# NOTE(review): manager_call_kwargs is presumably forwarded to the manager on
# each prediction call, with return_probs=True asking post-processing to also
# return probabilities — TODO confirm in pipeline.ensembling.DataPredictor.
predictor = pipeline.ensembling.DataPredictor(
    manager_call_kwargs={
        "postproc_kwargs": {"return_probs": True},
    },
)

# The pseudo-labeler wraps the predictor and is configured with num_samples=3000.
pseudo_labeler = nti_squad.pseudo_labeling.QAPseudoLabeler(
    predictor,
    num_samples=3000,
)

In [None]:
# Run the pseudo-labeler with the model manager over the test loader and keep the returned samples.
pseudo_label_samples = pseudo_labeler.run(manager, test_loader)

In [None]:
# Wrap the pseudo-labeled samples in a dataset.
# NOTE(review): the meaning of switch_texts=True is not visible here — TODO confirm.
pseudo_label_dataset = nti_squad.data.RawSamplesDataset(
    pseudo_label_samples,
    switch_texts=True,
)

# Concatenate: original training samples first, pseudo-labeled samples after them.
unioned_train_dataset = torch_data.ConcatDataset(
    [train_dataset, pseudo_label_dataset]
)

### Submitting