In [1]:
import os
import pandas as pd
import json
from tqdm.auto import tqdm
tqdm.pandas()
from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import DataLoader
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import DataCollatorWithPadding
from scipy.stats import pearsonr, spearmanr
import math
from sklearn.metrics import *



In [None]:
# Hugging Face access token used by the from_pretrained calls in this notebook.
# It is read from the HF_TOKEN environment variable so a real secret never has to be
# pasted into (and saved with) the notebook; when the variable is not set, the
# original placeholder value is kept.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "insert_your_huggingface_token")

In [3]:
# Load the tokenizer published with the 'nguyenvulebinh/vi-mrc-base' checkpoint;
# the same checkpoint name is used for the encoder further down.
tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)
# Sanity check: encode a sample sentence and decode it back to see which special
# tokens the tokenizer wraps around the text.
print(tokenizer.decode(tokenizer.encode("sinh viên đại học bách khoa hà nội")))

<s> sinh viên đại học bách khoa hà nội</s>


In [4]:
import json 
from glob import glob 
import re 
from nltk import word_tokenize as lib_tokenizer 
 
# Memo of raw whitespace-delimited chunk -> normalised form, shared by every call.
dict_map = {}


def word_tokenize(text):
    """Split ``text`` on whitespace and normalise each chunk with ``lib_tokenizer``.

    Every chunk is passed through ``lib_tokenizer``; the resulting pieces are joined
    with single spaces and the `` / '' quote markers are mapped back to a plain
    double quote. Results are memoised in the module-level ``dict_map``.

    Returns a list with one normalised string per whitespace-delimited chunk.
    """
    normalized = []
    for raw in text.split():
        cached = dict_map.get(raw)
        if cached is None:
            pieces = lib_tokenizer(raw)
            cached = ' '.join(pieces).replace('``', '"').replace("''", '"')
            dict_map[raw] = cached
        normalized.append(cached)
    return normalized
 
def strip_answer_string(text):
    """Trim surrounding whitespace and punctuation from an answer string.

    Punctuation is peeled off the end first, then off the start, one character at
    a time. Two kinds of delimiters are deliberately kept:

    * a closing ``)`` when the text contains a ``(`` but does not start with one
      (the parenthesis closes something inside the answer);
    * a double quote on one end when the other end is not a quote and the text
      contains more than one quote (the quote belongs to a quoted span inside
      the answer).

    Empty input, or input made only of punctuation/whitespace, yields ``""``
    instead of raising ``IndexError``.
    """
    punctuation = '.,/><;:\'"[]{}+=-_)(*&^!~`'
    text = text.strip()

    # Trailing side. The `text and` guard stops the loop once nothing is left.
    while text and text[-1] in punctuation:
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()

    # Leading side.
    while text and text[0] in punctuation:
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()

    return text.strip()
 
def strip_context(text):
    """Flatten ``text`` to one line: newlines become spaces, every run of
    whitespace collapses to a single space, and both ends are trimmed."""
    single_line = text.replace('\n', ' ')
    return re.sub(r'\s+', ' ', single_line).strip()

In [5]:
# Load the stage-1 ranking training table; the cells below read its
# `text`, `question` and `label` columns.
df1 = pd.read_csv("./processed/train_stage1_ranking.csv")
# Normalise both text columns in place: collapse whitespace with strip_context,
# then tokenise with word_tokenize and re-join the pieces with single spaces.
df1.text = df1.text.apply(lambda x: " ".join(word_tokenize(strip_context(x))))
df1.question = df1.question.apply(lambda x: " ".join(word_tokenize(strip_context(x))))
# `df` is an alias of `df1` (same object, not a copy).
df = df1

In [7]:
import torch.nn as nn
from transformers import AutoModel, AutoConfig

class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        ``attention_mask`` is broadcast over the trailing dimension of
        ``last_hidden_state``; the per-row token count is clamped to at least
        1e-9 so a fully masked row yields zeros rather than dividing by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(1)
        counts = mask.sum(1).clamp(min=1e-9)
        return summed / counts

class PairwiseModel(nn.Module):
    """Pretrained encoder followed by dropout and a one-logit linear head.

    The head reads the embedding at sequence position 0 of the encoder's last
    hidden state and maps it to a single raw logit per input row (no sigmoid is
    applied here).
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and build the scoring head.

        The head's input width is taken from the encoder's own config, so
        checkpoints whose hidden size is not 768 work as well.
        """
        super(PairwiseModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)
        # Reuse the config that was loaded with the encoder instead of fetching it again.
        self.config = self.model.config
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(self.config.hidden_size, 1)

    def forward(self, ids, masks):
        """Return one logit per row of ``ids``.

        ``ids`` and ``masks`` are forwarded to the encoder as ``input_ids`` and
        ``attention_mask``.
        """
        out = self.model(input_ids=ids,
                         attention_mask=masks,
                         output_hidden_states=False).last_hidden_state
        # Keep only the embedding at the first sequence position.
        out = out[:, 0]
        # Dropout was declared in __init__ but never applied; it is a no-op in
        # eval mode, so inference results are unaffected.
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs


In [8]:
from torch.utils.data import Dataset

class SiameseDataset(Dataset):
    """Pre-tokenised (question, text, label) examples taken from a DataFrame.

    Questions and texts are encoded separately, each truncated to ``max_length``
    token ids, after replacing underscores with spaces. Items are addressed by
    position (0 .. len-1), independent of the frame's index labels.
    """

    def __init__(self, df, tokenizer, max_length):
        """Tokenise the ``question`` and ``text`` columns of ``df`` up front.

        ``df`` must provide ``question``, ``text`` and ``label`` columns.
        """
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer
        questions = df.question.apply(lambda x: x.replace("_", " ")).tolist()
        texts = df.text.apply(lambda x: x.replace("_", " ")).tolist()
        self.content1 = tokenizer.batch_encode_plus(questions, max_length=max_length, truncation=True)["input_ids"]
        self.content2 = tokenizer.batch_encode_plus(texts, max_length=max_length, truncation=True)["input_ids"]
        # Store labels as a plain array: the encoded lists above are positional, and
        # indexing a Series by an integer is label-based, which raises KeyError or
        # misaligns when the frame's index is not 0..n-1 (e.g. after df.iloc[...]
        # without reset_index).
        self.targets = df.label.to_numpy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the token ids of both sides and the float label for row ``index``."""
        return {
            'ids1': torch.tensor(self.content1[index], dtype=torch.long),
            # The first id of the encoded text is dropped so that, once the two
            # sides are concatenated, only one leading special token remains.
            # NOTE(review): the joined pair can be up to 2 * max_length - 1 ids
            # long; confirm this stays within the encoder's position limit.
            'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),
            'target': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Padding id taken from the tokenizer; collate_fn reads this module-level value
# both to fill short sequences and to derive the attention mask.
pad_token_id = tokenizer.pad_token_id
def collate_fn(batch):
    """Merge dataset items into one right-padded batch.

    For every item, ``ids1`` and ``ids2`` are concatenated into a single
    sequence. Shorter sequences are padded on the right with ``pad_token_id`` up
    to the longest sequence in the batch, and the mask marks every position
    whose id differs from ``pad_token_id``.

    Returns a dict with ``ids`` and ``masks`` (one row per item) and a flat
    ``target`` tensor.
    """
    sequences = [torch.cat([item["ids1"], item["ids2"]]) for item in batch]
    longest = max(len(seq) for seq in sequences)

    padded = []
    attention = []
    for seq in sequences:
        missing = longest - len(seq)
        if missing > 0:
            filler = torch.tensor([pad_token_id] * missing, dtype=torch.long)
            seq = torch.cat((seq, filler))
        padded.append(seq)
        attention.append(seq != pad_token_id)

    labels = [item["target"] for item in batch]
    return {
        "ids": torch.vstack(padded),
        "masks": torch.vstack(attention),
        "target": torch.vstack(labels).view(-1),
    }

In [10]:
from sklearn.model_selection import GroupKFold, KFold

In [11]:
def optimizer_scheduler(model, num_train_steps, lr=3e-5, weight_decay=0.001, warmup_ratio=0.05):
    """Build the AdamW optimizer and linear warmup/decay schedule for ``model``.

    Parameters whose name contains ``"bias"`` or ``"LayerNorm.weight"`` are put
    in a group with no weight decay; every other parameter uses ``weight_decay``.

    Args:
        model: module whose ``named_parameters()`` are optimised.
        num_train_steps: total number of scheduler steps.
        lr: learning rate passed to the optimizer (default 3e-5).
        weight_decay: decay applied to the non-exempt parameter group
            (default 0.001).
        warmup_ratio: fraction of ``num_train_steps`` used for warmup
            (default 0.05).

    Returns:
        ``(optimizer, scheduler)``.
    """
    no_decay = ("bias", "LayerNorm.weight")
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        # Substring match on the parameter name decides the group.
        if any(marker in name for marker in no_decay):
            exempt.append(param)
        else:
            decayed.append(param)

    optimizer_parameters = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]

    opt = AdamW(optimizer_parameters, lr=lr)
    sch = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(warmup_ratio * num_train_steps),
        num_training_steps=num_train_steps,
        last_epoch=-1,
    )
    return opt, sch

In [12]:
from sklearn.model_selection import KFold
# 5-fold shuffled splitter with a fixed seed; the training cell below only uses
# the first fold it yields.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
from tqdm.auto import tqdm
# Training / validation driver for the pairwise ranking model.
# The model emits raw logits, so the loss combines sigmoid + binary cross-entropy.
loss_fn = nn.BCEWithLogitsLoss()
epochs = 5
# Gradients from this many consecutive batches are accumulated before each optimizer step.
accumulation_steps = 8
# NOTE(review): `scaler` is created but never used below (the autocast /
# scaler.update lines are commented out), so training runs in full precision.
scaler = torch.cuda.amp.GradScaler()
error_ids = None
for fold, (train_index, test_index) in enumerate(kfold.split(df, df.label)):
    # Only the first fold is processed; the loop exits on the second iteration.
    if fold != 0:
        break
    print(test_index)
    model = PairwiseModel('nguyenvulebinh/vi-mrc-base')
    # model.load_state_dict(torch.load(f"./outputs/pairwise_v2.bin"))
    model.cuda()
    # NOTE(review): training uses the FULL frame while validation uses the fold-0
    # rows of that same frame, so every validation row is also a training row.
    # The F1 printed below is therefore not a held-out estimate. Switch to the
    # commented-out line to train on the fold's training split only.
    train_df = df
    # train_df = df.iloc[train_index].reset_index(drop=True)
    val_df = df.iloc[test_index].reset_index(drop=True)
    
    # 384 is the per-side max token length handed to the tokenizer.
    train_dataset = SiameseDataset(train_df, tokenizer, 384)
    valid_dataset = SiameseDataset(val_df, tokenizer, 384)
    train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn,
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn,
                              num_workers=2, shuffle=False, pin_memory=True)
    
    # Scheduler length: the scheduler is stepped once per accumulated group of batches.
    num_train_steps = len(train_loader) * epochs // accumulation_steps
    optimizer, scheduler = optimizer_scheduler(model, num_train_steps)
    
    for epoch in tqdm(range(epochs)):
        model.train()
        bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
        for step, data in bar:
            ids = data["ids"].cuda()
            # for x in ids:
            #     print(tokenizer.decode(x))
            masks = data["masks"].cuda()
            target = data["target"].cuda()
            # with torch.cuda.amp.autocast():
            preds = model(ids, masks)
            # print(preds.view(-1))
            loss = loss_fn(preds.view(-1), target.view(-1))
            # Scale so the accumulated gradient is the mean over the group of batches.
            loss /= accumulation_steps
            loss.backward()
            # NOTE(review): `step` restarts at 0 each epoch and gradients are only
            # cleared here. When len(train_loader) is not a multiple of
            # accumulation_steps, the trailing batches' gradients are never stepped
            # in their own epoch and leak into the next epoch's first update.
            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                # scaler.update()
                optimizer.zero_grad()
                scheduler.step()
            # The value shown is the loss already divided by accumulation_steps.
            bar.set_postfix(loss=loss.item())

        # Per-epoch evaluation on the validation loader.
        model.eval()
        with torch.no_grad():
            bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)
            targets = []
            all_preds = []
            for step, data in bar:
                ids = data["ids"].cuda()
                masks = data["masks"].cuda()
                target = data["target"].cuda()
                # Convert logits to probabilities before thresholding.
                preds = torch.sigmoid(model(ids, masks))
                all_preds.extend(preds.cpu().view(-1).numpy())
                targets.extend(target.cpu().view(-1).numpy())
            all_preds = np.array(all_preds)
            targets = np.array(targets)
            # A probability above 0.5 counts as a positive prediction.
            print(f"F1 {f1_score(targets, all_preds > 0.5)}")

[    0     3     6 ... 20096 20102 20105]


Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5026 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

F1 0.9051274178692048


  0%|          | 0/5026 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

F1 0.9673033344124311


  0%|          | 0/5026 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

F1 0.9820051413881747


  0%|          | 0/5026 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

F1 0.990593577684074


  0%|          | 0/5026 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

F1 0.9928664072632944


In [14]:
# Recall of the last epoch's validation predictions at the 0.5 threshold.
# The label now names the metric actually computed; it previously read "F1".
print(f"Recall {recall_score(np.array(targets), np.array(all_preds) > 0.5)}")

F1 0.9948018193632229


In [16]:
# Create the output folder first so the save cannot fail on a missing directory
# after training has already finished.
os.makedirs("./outputs", exist_ok=True)
# Persist only the weights (state_dict), not the whole module object.
torch.save(model.state_dict(), f"./outputs/pairwise_v2.bin")