### Imports and constants

In [1]:
import os
from torch import cuda, device

# --- Filesystem layout (relative paths) ---
BASE_DIR = "./../"
CACHE_DIR = "./cache_models/"  # handed to the model constructor as cache_dir
DATA_DIR = os.path.join(BASE_DIR, "data/ru_squad")
SOURCES_DIR = os.path.join(BASE_DIR, "nti_ai_pipeline")  # appended to sys.path in the next cell
TRAIN_FILE_PATH = os.path.join(DATA_DIR, "sbersquad_train.json")
TEST_FILE_PATH = os.path.join(DATA_DIR, "sbersquad_test.json")

# --- Model and batching settings ---
TRANSFORMER_NAME = "DeepPavlov/rubert-base-cased"
MAX_ANSWER_LEN_IN_TOKENS = 27
DROPRATE = 0.7
BATCH_SIZE = 2
# Use CUDA when torch reports it as available, otherwise the CPU.
DEVICE = device("cuda") if cuda.is_available() else device("cpu")

# --- Run switches ---
# MOCK: True for a quick smoke test (tiny validation split, 10 test samples);
# set to False to run the real pipeline.
MOCK = True
# DO_FIT: when False, the trainer is neither built nor fitted.
DO_FIT = False

In [2]:
import sys
import importlib
import torch

# Make the project sources importable. The membership check keeps repeated runs
# of this cell from appending duplicate entries to sys.path.
if SOURCES_DIR not in sys.path:
    sys.path.append(SOURCES_DIR)

import nti_squad
import pipeline
from torch.utils import data as torch_data
from nti_squad import data as squad_data
from nti_squad import modeling as squad_modeling
from pipeline.saving import local_saver


# Project modules that are re-executed every time this cell runs
# (presumably so that edits to the sources are picked up without a kernel restart).
my_modules = [
    nti_squad,
    pipeline,
    squad_modeling,
    squad_data,
    local_saver,
]

# Reload in the listed order.
for module in my_modules:
    importlib.reload(module)

In [3]:
import gc, torch

def free_memory() -> None:
    """Run a garbage-collection pass, then empty torch's CUDA cache.

    Returns:
        None. The function is called only for its side effects.
    """
    # Collect first, then ask torch to empty its cache.
    gc.collect()
    torch.cuda.empty_cache()

### Load data

In [4]:
# Load the training file into a data container.
train_val_container = squad_data.QADataContainer(TRAIN_FILE_PATH)

# Size of the held-out part: a tiny slice for MOCK runs, 0.2 otherwise
# (presumably a fraction of the data -- confirm against train_test_split).
if MOCK:
    test_size = 0.001
else:
    test_size = 0.2

train_data, test_data = train_val_container.train_test_split(
    shuffle=True,
    test_size=test_size,
)

# The held-out part of the split is used as the validation set.
train_dataset = squad_data.SQuADDataset(train_data)
val_dataset = squad_data.SQuADDataset(test_data)

# Both loaders share the batch size from the config cell.
train_loader = torch_data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader = torch_data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

### Defining model and trainer

In [5]:
# QA model on top of the configured transformer.
model = squad_modeling.TransformerQA(
    TRANSFORMER_NAME,
    droprate=DROPRATE,
    cache_dir=CACHE_DIR,
)

# Data processor built for the same transformer name, with a cap on answer length in tokens.
proc = squad_modeling.QADataProcessor(
    TRANSFORMER_NAME,
    max_answer_token_len=MAX_ANSWER_LEN_IN_TOKENS,
)

# Saver configured with a local directory.
saver = local_saver.LocalSaver(save_dir='./saved_models')

# The manager bundles model, processor, target device and saver; it is what the
# trainer, pseudo labeler and submitter cells receive.
manager = pipeline.modeling.ModelManager(
    model,
    proc,
    device=DEVICE,
    saver=saver,
)

In [6]:
if DO_FIT:
    # Built only when training is enabled; the fit cell is gated by the same flag.
    validator = nti_squad.squad_evaluating.SQuADValidator()

    # NOTE(review): the original inline note here read "had set to 0 from 1e-2";
    # it is unclear which setting it refers to, since no argument below has
    # either value -- confirm or drop.
    weights_updater = nti_squad.QAWeightsUpdater(
        lr=5e-5,
        accum_iters=5,
        lr_end=8e-6,
        warmup=400,
        optimizer_class=torch.optim.AdamW,
    )

    trainer = pipeline.training.Trainer(validator, weights_updater)

### Fit

In [7]:
if DO_FIT:
    # Fit the managed model on the train loader, passing the validation loader
    # and the evaluation / early-stopping settings to the trainer.
    trainer.fit(
        train_loader,
        val_loader,
        manager,
        max_epoch=1,
        steps_betw_evals=1100,
        stop_patience=2,
    )

In [8]:
# Run garbage collection and empty torch's CUDA cache before the pseudo-labeling step.
free_memory()

### Pseudo-labeling: combining the original training samples with pseudo-labeled ones

In [9]:
# Load the test file and wrap its data in the submission dataset.
# Uses the same `squad_data` alias as the training-data cell.
test_container = squad_data.QADataContainer(path=TEST_FILE_PATH)
test_dataset = squad_data.subm_dataset.SubmDataset(test_container.get_data())
if MOCK:
    # Smoke-test on at most 10 samples. The size is clamped to the dataset
    # length so a short test file cannot yield out-of-range indices.
    test_dataset = torch_data.Subset(
        test_dataset,
        list(range(min(10, len(test_dataset)))),
    )

# Reuse BATCH_SIZE so that every loader in the notebook shares one setting.
test_loader = torch_data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
# The predictor is handed to the pseudo labeler, which is configured with num_samples=3.
predictor = nti_squad.pseudo_labeling.SquadDataPredictor()
pseudo_labeler = nti_squad.pseudo_labeling.SQuADPseudoLabeler(
    predictor,
    num_samples=3,
)

In [11]:
# Run the pseudo labeler with the model manager over the test loader; the result
# is later wrapped in a dataset and concatenated with the training dataset.
pseudo_label_samples = pseudo_labeler.run(manager, test_loader)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

  start_preds[text_idx, question_start_idx:] = 0
  end_preds[text_idx, question_start_idx:] = 0



Preds in pseudo labeler from data predictor: [SubmPredWithProbs(probs=3.11226224899292, answer_start=4.0, answer_end=8.0, preds='м годам жилищное строительство распространилось'), SubmPredWithProbs(probs=3.2413578033447266, answer_start=0.0, answer_end=3.0, preds='К 1960-'), SubmPredWithProbs(probs=3.2815892696380615, answer_start=3.0, answer_end=25.0, preds='-м годам жилищное строительство распространилось на район восточнее Батарейной горы, ранее застроенный, преимущественно, мало'), SubmPredWithProbs(probs=3.3006906509399414, answer_start=4.0, answer_end=13.0, preds='м годам жилищное строительство распространилось на район восточнее Батарей'), SubmPredWithProbs(probs=3.0960874557495117, answer_start=6.0, answer_end=30.0, preds='жилищное строительство распространилось на район восточнее Батарейной горы, ранее застроенный, преимущественно, малоэтажными домами, значительная'), SubmPredWithProbs(probs=3.219414234161377, answer_start=25.0, answer_end=25.0, preds='мало'), SubmPredWithPro

In [13]:
# Wrap the pseudo-labeled samples in a dataset and append it to the original
# training dataset.
pseudo_label_dataset = squad_data.RawSamplesDataset(pseudo_label_samples)
unioned_train_dataset = torch_data.ConcatDataset(
    [train_dataset, pseudo_label_dataset]
)

#### The model can now be retrained with the pseudo-labeled data

### Submitting

In [14]:
# The submitter is pointed at the "submissions" directory under BASE_DIR.
submitter = nti_squad.qa_submitter.QASubmitter(
    subm_dir=os.path.join(BASE_DIR, 'submissions'),
)

# test_loader is the one built in the pseudo-labeling section.
# NOTE(review): with MOCK=True that loader covers only the mock subset, so the
# file produced here is not a full submission.
submitter.create_submission(
    manager,
    test_loader,
    subm_file_name="subm_single_bert_50000_steps.jsonl",
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




# The End