In [1]:
!wget -O rucos.zip https://onti2020.ai-academy.ru/task/rucos.zip
!unzip rucos.zip
!pip install -q transformers seaborn

--2021-03-03 09:09:17--  https://onti2020.ai-academy.ru/task/rucos.zip
Resolving onti2020.ai-academy.ru (onti2020.ai-academy.ru)... 213.159.215.214
Connecting to onti2020.ai-academy.ru (onti2020.ai-academy.ru)|213.159.215.214|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56208690 (54M) [application/zip]
Saving to: ‘rucos.zip’


2021-03-03 09:09:23 (11.7 MB/s) - ‘rucos.zip’ saved [56208690/56208690]

Archive:  rucos.zip
   creating: RuCoS/
  inflating: __MACOSX/._RuCoS        
  inflating: RuCoS/.DS_Store         
  inflating: __MACOSX/RuCoS/._.DS_Store  
  inflating: RuCoS/rucos_test.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_test.jsonl  
  inflating: RuCoS/rucos_val.jsonl   
  inflating: __MACOSX/RuCoS/._rucos_val.jsonl  
  inflating: RuCoS/rucos_train.jsonl  
  inflating: __MACOSX/RuCoS/._rucos_train.jsonl  
[K     |████████████████████████████████| 1.9MB 8.1MB/s 
[K     |████████████████████████████████| 890kB 68.2MB/s 
[K     |███████████████████████

# Data processing

In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
import re
import warnings
import seaborn as sns

from nltk import sent_tokenize
import nltk
nltk.download('punkt')
tqdm.pandas()

warnings.simplefilter("ignore")

SEED = 1337
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 512
SEQ_LEN = 220
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
device = 'cuda:0'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from pandas import Panel


In [2]:
with open('RuCoS/rucos_train.jsonl', 'r', encoding='utf-8') as f:
    train_content = f.read()

with open('RuCoS/rucos_test.jsonl', 'r', encoding='utf-8') as f:
    test_content = f.read()

with open('RuCoS/rucos_val.jsonl', 'r', encoding='utf-8') as f:
    val_content = f.read()

train_dict = [json.loads(jline) for jline in train_content.splitlines()]
test_dict = [json.loads(jline) for jline in test_content.splitlines()]
val_dict = [json.loads(jline) for jline in val_content.splitlines()]

In [3]:
train_prettified = {
    'text_id': [],
    'q_id': [],
    'a_id': [],
    'text': [],
    'query': [],
    'end': [],
    'start': [],
    'qtext': [],
    'label': []
}

val_prettified = {
    'text_id': [],
    'q_id': [],
    'a_id': [],
    'text': [],
    'query': [],
    'end': [],
    'start': [],
    'qtext': [],
    'label': []
}

test_prettified = {
    'text_id': [],
    'q_id': [],
    'a_id': [],
    'text': [],
    'query': [],
    'end': [],
    'start': [],
    'qtext': []
}

# --- TRAIN COLLECTION ---
for i, el in enumerate(tqdm(train_dict)):
    passage = el['passage']

    text_id = el['idx']
    q_id = el['qas'][0]['idx']
    text = passage['text']
    query = el['qas'][0]['query']
    
    a_id = 0
    for ent in passage['entities']:
        
        label = 0
        for ans in el['qas'][0]['answers']:
            if ans['end'] == ent['end'] and ans['start'] == ent['start']:
                label = 1
                break

        qtext = text[ent['start']:ent['end']]
        train_prettified['text_id'].append(text_id)
        train_prettified['q_id'].append(q_id)
        train_prettified['a_id'].append(a_id)
        train_prettified['text'].append(text)
        train_prettified['query'].append(query)
        train_prettified['end'].append(ent['end'])
        train_prettified['start'].append(ent['start'])
        train_prettified['qtext'].append(qtext)
        train_prettified['label'].append(label)

        a_id += 1
        
print('Number of train samples:', len(train_prettified['text_id']), '| Positive labels ratio:', sum(train_prettified['label'])/len(train_prettified['text_id']))

# --- VAL COLLECTION ---
for i, el in enumerate(tqdm(val_dict)):
    passage = el['passage']

    text_id = el['idx']
    q_id = el['qas'][0]['idx']
    text = passage['text']
    query = el['qas'][0]['query']
    
    a_id = 0
    for ent in passage['entities']:
        
        label = 0
        for ans in el['qas'][0]['answers']:
            if ans['end'] == ent['end'] and ans['start'] == ent['start']:
                label = 1
                break
        qtext = text[ent['start']:ent['end']]
        val_prettified['text_id'].append(text_id)
        val_prettified['q_id'].append(q_id)
        val_prettified['a_id'].append(a_id)
        val_prettified['text'].append(text)
        val_prettified['query'].append(query)
        val_prettified['end'].append(ent['end'])
        val_prettified['start'].append(ent['start'])
        val_prettified['qtext'].append(qtext)
        val_prettified['label'].append(label)

        a_id += 1

print('Number of train samples:', len(val_prettified['text_id']), '| Positive labels ratio:', sum(val_prettified['label'])/len(val_prettified['text_id']))

# --- TEST COLLECTION ---
for i, el in enumerate(tqdm(test_dict)):
    passage = el['passage']

    text_id = el['idx']
    q_id = el['qas'][0]['idx']
    text = passage['text']
    query = el['qas'][0]['query']
    
    a_id = 0
    for ent in passage['entities']:    
        qtext = text[ent['start']:ent['end']]    
        test_prettified['text_id'].append(text_id)
        test_prettified['q_id'].append(q_id)
        test_prettified['a_id'].append(a_id)
        test_prettified['text'].append(text)
        test_prettified['query'].append(query)
        test_prettified['end'].append(ent['end'])
        test_prettified['start'].append(ent['start'])
        test_prettified['qtext'].append(qtext)

        a_id += 1
print('Number of train samples:', len(test_prettified['text_id']))

HBox(children=(FloatProgress(value=0.0, max=72193.0), HTML(value='')))


Number of train samples: 918625 | Positive labels ratio: 0.1898794393795074


HBox(children=(FloatProgress(value=0.0, max=7577.0), HTML(value='')))


Number of train samples: 108749 | Positive labels ratio: 0.22910555499360913


HBox(children=(FloatProgress(value=0.0, max=7257.0), HTML(value='')))


Number of train samples: 96996


In [4]:
def prep(x):
    x =  re.split(r'\n@highlight\n', x)[0].replace('\n', ' ').replace('@header', '')
    return x

In [5]:
%%time
train = pd.DataFrame(train_prettified)
test = pd.DataFrame(test_prettified)
val = pd.DataFrame(val_prettified)

train = train[train['query'].apply(lambda x: x.count('@placeholder') == 1)].reset_index(drop=True)

train['text'] = train['text'].progress_apply(prep)
val['text'] = val['text'].progress_apply(prep)
test['text'] = test['text'].progress_apply(prep)

train['query_prep'] = train[['query', 'qtext']].apply(lambda x: x['query'].replace('@placeholder', x['qtext']), axis=1)
test['query_prep'] = test[['query', 'qtext']].apply(lambda x: x['query'].replace('@placeholder', x['qtext']), axis=1)
val['query_prep'] = val[['query', 'qtext']].apply(lambda x: x['query'].replace('@placeholder', x['qtext']), axis=1)

train = train.drop_duplicates(['text', 'query_prep'])
test = test.drop_duplicates(['text', 'query_prep'])
val = val.drop_duplicates(['text', 'query_prep'])

HBox(children=(FloatProgress(value=0.0, max=905279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=108749.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=96996.0), HTML(value='')))


CPU times: user 24.9 s, sys: 1.44 s, total: 26.3 s
Wall time: 26.2 s


# SBERT preparation

In [6]:
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
import os
from sklearn.metrics import f1_score
from collections import defaultdict

from torch.cuda.amp import autocast, GradScaler

In [7]:
def set_seed(seed = 42, set_torch=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    if set_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False 
set_seed(SEED)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

In [9]:
class CQDataset(Dataset):
    def __init__(self, texts, queries, targets, tokenizer, max_len=512):
        self.texts = texts
        self.queries = queries
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
  
    def __len__(self):
        return len(self.texts)
  
    def __getitem__(self, item):
        text = str(self.texts[item])
        query = str(self.queries[item])
        target = self.targets[item]

        tt = self.tokenizer.tokenize(text)
        qt = self.tokenizer.tokenize(query)

        tokens = tt + ['[SEP]'] + qt
        tokens = tokens[-self.max_len+2:]
        tokens = ['[CLS]']+tokens+['[SEP]']+['[PAD]']*(self.max_len-2-len(tokens))
        attention_mask = [1*(token!='[PAD]') for token in tokens]
        fe = tokens.index('[SEP]')
        token_type_ids = [(i>=fe)*1 for i in range(len(tokens))]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)
        target = torch.tensor(target, dtype=torch.float16)

        return {
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'token_type_ids': token_type_ids,
          'target': target
        }

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    ds = CQDataset(
        df['text'].values,
        df['query_prep'].values,
        df['label'].values,
        tokenizer,
        max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2,
        shuffle=shuffle
    )

In [10]:
train_dl = create_data_loader(train, tokenizer, SEQ_LEN, TRAIN_BATCH_SIZE)
val_dl = create_data_loader(val, tokenizer, SEQ_LEN, VAL_BATCH_SIZE, shuffle=False)

In [11]:
class CQClassifier(nn.Module):
    def __init__(self, base_model, units=768):
        super(CQClassifier, self).__init__()
        self.bert = base_model
        self.mlp_head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(units, 1)
        )
    def forward(self, input_ids, attention_mask, token_type_ids):
        # INPUT
        bert_out = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        x = bert_out['pooler_output']
        x = self.mlp_head(x).view(-1)
        return x

In [12]:
model = CQClassifier(bert_model, units=768)
model = model.to(device)

print('Total Number of parametres:', sum(p.numel() for p in model.parameters()))
print('Total Nuber of trainable parametres:', sum(p.numel() for p in model.parameters() if p.requires_grad))

Total Number of parametres: 177854209
Total Nuber of trainable parametres: 177854209


In [13]:
EPOCHS = 3
num_train_steps = len(train_dl)*EPOCHS

def configure_optimizers(model, lr):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay_rate": 0.01
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay_rate": 0.0
        },]
    optimizer = optim.AdamW(
                optimizer_grouped_parameters,
                lr=lr,
    )
    return optimizer

scaler = GradScaler()
optimizer = configure_optimizers(model, 2e-5)
loss_fn = nn.BCEWithLogitsLoss().to(device)
scheduler = get_linear_schedule_with_warmup(optimizer, int(num_train_steps*.1), num_train_steps)

In [14]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()

    losses = []
    pred = []
    true = []
    
    pbar = tqdm(data_loader)
    for d in pbar:
        optimizer.zero_grad()

        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        token_type_ids = d['token_type_ids'].to(device)
        targets = d["target"].to(device)
        
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

            true += list(targets.cpu().numpy())
            pred += list(torch.round(F.sigmoid(outputs.detach())).cpu().numpy())

            loss = loss_fn(outputs, targets)
            losses.append(loss.detach().item())

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar.set_postfix({'loss': loss.detach().item()})
    
    return np.mean(losses), f1_score(true, pred, average='macro')

def eval_epoch(model, data_loader, loss_fn, device):
    model = model.eval()

    losses = []
    pred = []
    true = []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            token_type_ids = d['token_type_ids'].to(device)
            targets = d["target"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

            true += list(targets.cpu().numpy())
            pred += list(torch.round(F.sigmoid(outputs.detach())).cpu().numpy())

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

    return np.mean(losses), f1_score(true, pred, average='macro')

In [15]:
%%time

print('Number of train steps:', num_train_steps)
for epoch in range(EPOCHS-1):
    print(f'Epoch {epoch + 1}/{EPOCHS}:')
    print('-' * 10)

    train_loss, train_f1 = train_epoch(
        model,
        train_dl,    
        loss_fn, 
        optimizer, 
        device, 
        scheduler
    )
    print(f'    loss: {train_loss:.4f}, f1: {train_f1:.4f} |', end=' ')

    val_loss, val_f1 = eval_epoch(
        model,
        val_dl,
        loss_fn, 
        device
    )
    print(f'val_loss: {val_loss:.4f}, val_f1: {val_f1:.4f}')
    torch.save(model.state_dict(), f'bert_model_{epoch+1}.bin')


Number of train steps: 21232
Epoch 1/2:
----------


HBox(children=(FloatProgress(value=0.0, max=10616.0), HTML(value='')))


    loss: 0.2629, f1: 0.7435 | 

HBox(children=(FloatProgress(value=0.0, max=148.0), HTML(value='')))


val_loss: 0.2534, val_f1: 0.7934
Epoch 2/2:
----------


HBox(children=(FloatProgress(value=0.0, max=10616.0), HTML(value='')))


    loss: 0.1519, f1: 0.8737 | 

HBox(children=(FloatProgress(value=0.0, max=148.0), HTML(value='')))


val_loss: 0.2614, val_f1: 0.8247
CPU times: user 4h 46min 43s, sys: 3h 31min 19s, total: 8h 18min 3s
Wall time: 8h 18min 21s


# Test prediction

In [16]:
test['label'] = 0

In [17]:
def predict(dl, model):
    model = model.eval()
    preds = []
    with torch.no_grad():
        for d in tqdm(dl):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            token_type_ids = d['token_type_ids'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask, token_type_ids=token_type_ids
            )
            outputs = (F.sigmoid(outputs).cpu()).tolist()
            preds += outputs
    return preds

In [18]:
test_dl = create_data_loader(test, tokenizer, SEQ_LEN, VAL_BATCH_SIZE, shuffle=False)
test['pred'] = predict(test_dl, model)

In [20]:
submission = []

for text_idx in tqdm(test['text_id'].unique()):
    pdict = {'idx':int(text_idx)}
    tmp = test[test['text_id'] == text_idx]
    tmp = tmp.iloc[np.argmax(tmp['pred'])]
    pdict['start'] = int(tmp['start'])
    pdict['end'] = int(tmp['end'])
    pdict['text'] = str(tmp['qtext'])

    submission.append(pdict)
submission = [json.dumps(el) for el in submission]
submission = '\n'.join(submission)

HBox(children=(FloatProgress(value=0.0, max=7257.0), HTML(value='')))




In [22]:
with open('submission.jsonl', 'w', encoding='utf-8') as f:
    f.write(submission)