In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration, AdamW

import warnings
warnings.filterwarnings("ignore")

2023-06-21 19:39:53.438083: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
vk = 'big_dev_docs/dev_'

import pandas as pd
import transformers
import evaluate

vk_qrels = pd.read_csv(vk + 'qrels.tsv', names=['id', 'query_id', 'doc_id'],  sep='\t')
vk_docs = pd.read_csv(vk + 'docs.tsv', names=['id', 'doc_id', 'data'],  sep='\t')
vk_queries = pd.read_csv(vk + 'queries.tsv', names=['id', 'query_id', 'data'],  sep='\t')

In [5]:
def create_joined_file(df_docs, df_qrels, df_queries, path_processed_joined=None):
    joined_df = df_qrels.merge(df_queries, on='query_id').merge(df_docs, on='doc_id', how='left')[['query_id', 'data_x', 'doc_id', 'data_y']]
    joined_df.rename(columns={'data_x':'query_data', 'data_y':'doc_data'}, inplace=True)
    if path_processed_joined:
        joined_df.to_csv(path_processed_joined, sep='\t', index=None, header=None)
    return joined_df

vk_joined = create_joined_file(vk_docs, vk_qrels, vk_queries)

In [8]:
vk_joined = vk_joined[['query_data', 'doc_data']]

In [9]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
INPUT_MAX_LEN = 100 # Input length
OUT_MAX_LEN = 128 # Output Length
TRAIN_BATCH_SIZE = 2 # Training Batch Size
VALID_BATCH_SIZE = 2 # Validation Batch Size
EPOCHS = 5 # Number of Iteration

In [10]:
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length= INPUT_MAX_LEN)
print("eos_token: {} and id: {}".format(tokenizer.eos_token, tokenizer.eos_token_id)) # End of token (eos_token)
print("unk_token: {} and id: {}".format(tokenizer.unk_token, tokenizer.eos_token_id)) # Unknown token (unk_token)
print("pad_token: {} and id: {}".format(tokenizer.pad_token, tokenizer.eos_token_id)) # Pad token (pad_token)

eos_token: </s> and id: 1
unk_token: <unk> and id: 1
pad_token: <pad> and id: 1


In [28]:
from torch.utils.data import Dataset, DataLoader

class T5Dataset(Dataset):
    def __init__(self, documents, queries, transforms=None):
        self.documents = documents
        self.queries = queries
        self.tokenizer = tokenizer
        self.input_max_length = INPUT_MAX_LEN
        self.out_max_length = OUT_MAX_LEN
        self.transforms = transforms
        
        
    def __len__(self):
        return len(self.documents)
    
    def __getitem__(self, idx):
        document = str(self.documents[idx])
        query = str(self.queries[idx])
        
        inputs_encoding = self.tokenizer(
            document, 
            add_special_tokens=True, 
            max_length=self.input_max_length,
            padding='max_length', 
            truncation='only_first',
            return_attention_mask=True,
            return_tensors='pt'
        )
        output_encoding = self.tokenizer(
            query, 
            None, 
            add_special_tokens=True, 
            max_length=self.out_max_length,
            padding='max_length', 
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'        
        )
        inputs_ids = inputs_encoding['input_ids'].flatten()
        attention_mask = inputs_encoding['attention_mask'].flatten()
        labels = output_encoding['input_ids']
        
        labels[labels == 0] = -100
        
        labels = labels.flatten()
        
        out = {
            "document": document,
            "query": query,
            "inputs_ids": inputs_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }
        return out    

In [35]:
dataset = vk_joined[['doc_data', 'query_data']]

dataset = dataset.sample(frac=1).reset_index(drop=True)
train_size = int(len(dataset) * 0.8)

train_dataset = dataset[:train_size]
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = dataset[train_size:]
test_dataset = test_dataset.reset_index(drop=True)

num_workers = 4

train_dataset = T5Dataset(train_dataset.doc_data, train_dataset.query_data)
val_dataset = T5Dataset(test_dataset.doc_data, test_dataset.query_data)
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, drop_last=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, drop_last=False, num_workers=num_workers)

In [36]:
for batch in train_dataloader:
    print(batch.keys())
    print(batch['inputs_ids'], batch['labels'])
    break

dict_keys(['document', 'query', 'inputs_ids', 'attention_mask', 'labels'])
tensor([[    3,     2,  2795,     2, 14142,  5345,  5345,     2,  7948,  6609,
         12095,     3,  2533, 10458,  7948,     2, 27616,     3,  2533, 10458,
          7948,     2, 25873,     2,     3,  6469,  9890,     2,  2044,     2,
         10338,     2,  2795,  6588,     2,  7184,     2,  1757,     3,  2795,
             3,  5814,  2795,  2533,     2, 31585,  2795,     2,  1757, 30610,
          1757,     3, 25157,  5814,  2795,     2, 22420, 30610,  1757,     3,
             2, 22123,  6725,  8452,     2,     3,  6469,  9890,     2,  2044,
             2, 10338,     2,  2795,  6588,     2,  7184,     2,     3, 25157,
          5814,  2795,     2, 22420, 30610,     2,     3,     2, 22123,  6725,
          8452,     3, 21044, 12377,     2,  1757,  6588,     2,     3,     1],
        [    3,     2, 26672,  9592, 14982,  6588,     3,  5814,  6588,     2,
             3,  3700, 20000,  6588, 21302,     2,     

In [37]:
class T5Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        
    def forward(self, input_ids, attention_mask, lables=None):
        output = self.model(input_ids=input_ids, 
                            attention_mask=attention_mask,
                           labels=labels)
        
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["inputs_ids"]
        attention_mask = batch["attention_mask"]
        labels= batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)        
        return loss
    
    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)
        
       # result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
metric = evaluate.load('rouge')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels!= - 100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: v * 100 for k, v in result.items()}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result['gen_len'] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


def train(model, train_dataloader):
#     device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
#     model.to(device)
    model.cuda()
    trainer = Seq2SeqTrainer(
        model, 
        args, 
        train_dataset=training_set,
        eval_dataset=validation_set,
        data_collator = data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()