In [9]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,3,4"
import sys
import torch
import pandas as pd
from torch import nn
import numpy as np
from torch.optim.lr_scheduler import ExponentialLR
import wandb
from datasets import load_dataset, Dataset
from trl import SFTTrainer


In [42]:

sys.path.append("../pipeline_src/")


from config.config import TaskConfig
from train import CustomScheduler, train
from logger.logger import WanDBWriter
from trainer.train_epoch import train_epoch, predict
from dataset.dataset import init_data
from logger.logger import WanDBWriter


if torch.cuda.is_available():
    device = "cuda"
    print("GPU")
else:
    device = "cpu"
    print("CPU")


SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
print(torch.cuda.device_count())

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    LlamaTokenizer,
    LlamaForCausalLM,
    Trainer
)

from peft import LoraConfig, get_peft_model, get_peft_model_state_dict

GPU
3


In [36]:
config = TaskConfig()

config.n_epochs = 2
config.batch_size = 32
config.lr = 3e-4
config.min_lr = 3e-6

config.validation = 1
config.save_every = 1
config.compute_metrics_every = 1

config.data_path = '../babel_datasets/wnet_only/train_ru_babel.pickle'
config.gold_path = (
    None  # "SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
)
config.test_data_path = '../babel_datasets/wnet_only/test_ru_babel.pickle'
config.test_gold_path = (
    None  # "SemEval2018-Task9/test/gold/1A.english.test.gold.txt"
)

config.device = device
config.using_peft = True
config.model_type = "Auto"  # Auto or Llama
config.wandb_log_dir = "/raid/rabikov/wandb/"
config.model_checkpoint = "EleutherAI/gpt-neo-125m"
config.exp_name = config.model_checkpoint.replace("/", "-") + '_test'
config.saving_path = "/raid/rabikov/model_checkpoints/" + config.exp_name

In [113]:
import pandas as pd

import torch
import torch.nn as nn

from tqdm import tqdm_notebook as tqdm

from torch.utils.data import Dataset

from dataset.prompt_schemas import (
    hypo_term_hyper,
    predict_child_from_2_parents,
    predict_child_from_parent,
    predict_child_with_parent_and_grandparent,
    predict_children_with_parent_and_brothers,
    predict_parent_from_child_granparent,
)
import pandas as pd
from multiprocessing import cpu_count
from torch.utils.data import DataLoader


class HypernymDataset(Dataset):
    def __init__(
        self,
        data_path,
        tokenizer,
        tokenizer_encode_args={"return_tensors": "pt"},
        semeval_format=False,
        gold_path=None,
        transforms={
            "only_child_leaf": predict_parent_from_child_granparent,
            "only_leafs_all": predict_child_from_parent,
            "only_leafs_divided": predict_children_with_parent_and_brothers,
            "leafs_and_no_leafs": predict_child_from_parent,
            "simple_triplet_grandparent": predict_parent_from_child_granparent,
            "simple_triplet_2parent": predict_child_from_2_parents,
        },
    ):
        self.tokenizer = tokenizer
        # self.transforms = transforms
        # сюда могут идти немного другие аргументы если допустим я использую Dolly а не T5
        self.tokenizer_encode_args = tokenizer_encode_args
        # в формате SemEval дебильные датасеты, мы их тут соединим
        if semeval_format:
            assert gold_path is not None
            train_data_en_data = pd.read_csv(
                data_path, header=None, sep="\t", names=["term", "relation"]
            )
            train_gold_en_data = pd.read_csv(gold_path, header=None, names=["hypernym"])

            self.df = pd.concat([train_data_en_data, train_gold_en_data], axis=1)[
                ["term", "hypernym"]
            ]
        # предположительно в нашем датасете уже все ок, но это опицональная часть
        else:
            # self.df = pd.read_csv(
            #     data_path, header=None, sep="\t", names=["term", "hypernym"]
            # )

            self.data = pd.read_pickle(data_path)

        # self.df.index = list(range(len(self.df)))

        self.case2transform = transforms

    # в данном случае выход под LM модельку с маск токеном -100
    def __getitem__(self, index):
        # row = self.df.loc[index]
        # term = row["term"]
        # target = ", ".join(row["hypernym"].split("\t"))
        elem = self.data[index]
        case = elem["case"]

        # if not "changed" in elem.keys():
        #     for field in ["children", "parents", "grandparents", "brothers"]:
        #         if field in elem.keys():
        #             elem[field] = HypernymDataset.delete_techniqal(elem[field])
        #             elem["changed"] = True

        # заранее пишу более общо, чтобы мы могли разне процессинги пробовать, а в будущем рандомно выбирать и тд
        # это типа мы подаем список трансформаций затравок
        # processed_term = self.transforms[0](term)
        processed_term, target = self.case2transform[case](elem)

        # токенизируем
        encoded_term = self.tokenizer.encode(
            processed_term, **self.tokenizer_encode_args
        )
        encoded_target = self.tokenizer.encode(
            target, add_special_tokens=False, **self.tokenizer_encode_args
        )

        input_seq = torch.concat([encoded_term, encoded_target], dim=1)
        labels = input_seq.clone()
        labels[0, : encoded_term.size()[1]] = -100

        return {
            "encoded_term": encoded_term.squeeze(),  # думаю потребуется при генерации, или для сек 2 сек
            "encoded_target": encoded_target.squeeze(0),  # отдельно токены для таргета
            "input_seq": input_seq.squeeze(),  # полное предложение без масок
            "labels": labels.squeeze(),  # маскированный контекст
        }

    def __len__(self):
        return len(self.data)

    @staticmethod
    def delete_techniqal(elem):
        if isinstance(elem, str):
            if ".n." in elem:
                return elem.split(".")[0].replace("_", " ")
            else:
                return elem.replace("_", " ")

        elif isinstance(elem, list):
            new_words = []
            for word in elem:
                new_words.append(HypernymDataset.delete_techniqal(word))
            return new_words

    # ничего необычного, складываем, паддим


class Collator:
    def __init__(self, pad_token_id, eos_token_id, mask_token_id):
        self.pad_token_id = pad_token_id
        self.eos_token_id = eos_token_id
        self.mask_token_id = mask_token_id

    def __call__(self, batch):
        terms = []
        targets = []
        inputs = []
        labels = []

        print(batch)
        for elem in batch:
            terms.append(elem["encoded_term"].flip(dims=[0]))
            targets.append(elem["encoded_target"])
            inputs.append(elem["input_seq"])
            labels.append(elem["labels"])

        terms = torch.nn.utils.rnn.pad_sequence(
            terms, batch_first=True, padding_value=self.pad_token_id
        ).flip(dims=[1])
        targets = torch.nn.utils.rnn.pad_sequence(
            targets, batch_first=True, padding_value=self.eos_token_id
        )
        inputs = torch.nn.utils.rnn.pad_sequence(
            inputs, batch_first=True, padding_value=self.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=self.mask_token_id
        )

        att_mask_inputs = torch.zeros_like(inputs)
        att_mask_inputs[inputs != self.pad_token_id] = 1

        att_mask_terms = torch.zeros_like(terms)
        att_mask_terms[terms != self.pad_token_id] = 1

        # return {'terms': terms, 
        #         'att_mask_terms': att_mask_terms,
        #         'targets': targets,
        #         'input_ids': inputs,
        #         'attention_mask': att_mask_inputs,
        #         'labels': labels}

        return {#'terms': terms, 
                #'att_mask_terms': att_mask_terms,
                #'targets': targets,
                'input_ids': inputs,
                'attention_mask': att_mask_inputs,
                'labels': labels}


def init_data(tokenizer, config, mask_label_token=-100, semeval_format=False):
    # data
    train_dataset = HypernymDataset(
        data_path=config.data_path,
        tokenizer=tokenizer,
        gold_path=config.gold_path,
        semeval_format=semeval_format,
    )
    test_dataset = HypernymDataset(
        data_path=config.test_data_path,
        tokenizer=tokenizer,
        gold_path=config.test_gold_path,
        semeval_format=semeval_format,
    )

    num_workers = cpu_count()

    collator = Collator(
        tokenizer.eos_token_id, tokenizer.eos_token_id, mask_label_token
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        collate_fn=collator,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
        pin_memory=True,
    )
    val_loader = DataLoader(
        test_dataset,
        batch_size=config.batch_size,
        collate_fn=collator,
        shuffle=False,
        num_workers=num_workers,
        drop_last=False,
        pin_memory=True,
    )

    return train_dataset, test_dataset, train_loader, val_loader


In [114]:
tokenizer = AutoTokenizer.from_pretrained(
    config.model_checkpoint,
    padding_side="left",
)
train_dataset, test_dataset, train_loader, val_loader = init_data(tokenizer, config)

In [74]:
func = model.forward

In [75]:
def superforward(*args, **kwargs):
    return func(*args, **kwargs)

In [77]:
model.extra_forward = model.forward

In [78]:
from functools import partial

In [83]:
lambda x,y: y + x**2

<function __main__.<lambda>(x, y)>

In [85]:
model.forward = lambda self, *args, **kwargs: self.extra_forward(*args, **kwargs)

In [86]:
model.forward

<function __main__.<lambda>(self, *args, **kwargs)>

In [None]:
for batch in train_loader:
    break


batch

In [115]:
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", device_map='auto')
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

collator = Collator(
        tokenizer.eos_token_id, tokenizer.eos_token_id, -100
    )

In [116]:
from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    FSDPOption,
    HPSearchBackend,
    HubStrategy,
    IntervalStrategy,
    PredictionOutput,
    RemoveColumnsCollator,
    ShardedDDPOption,
    TrainerMemoryTracker,
    TrainOutput,
    default_compute_objective,
    denumpify_detensorize,
    enable_full_determinism,
    find_executable_batch_size,
    get_last_checkpoint,
    has_length,
    number_of_arguments,
    seed_worker )

In [117]:


class CustomTrainer(Trainer):

    def get_train_dataloader(self):
        """
        Returns the training [`~torch.utils.data.DataLoader`].

        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
        training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator
       # if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
           # train_dataset = self._remove_unused_columns(train_dataset, description="training")
      #  else:
       #     data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker

        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))

        

trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=train_dataset, 
  #  eval_dataset=test_dataset
    )

In [118]:
trainer.train_dataset[0]

{'encoded_term': tensor([49229, 12114,  3617,  4948,    25, 12466,   119, 18849,   141,   229,
         22177, 15166, 21727, 20375, 45367,    13,    77,    13,    16,    11,
          5328,  5177,    25, 12466,   111, 30143, 16142, 43666, 18849, 16142,
         20375, 15166, 21169,    13,    77,    13,    16,   930,  8718,  3281,
            76,    25]),
 'encoded_target': tensor([  140,   109, 15166, 16843,   141,   228]),
 'input_seq': tensor([49229, 12114,  3617,  4948,    25, 12466,   119, 18849,   141,   229,
         22177, 15166, 21727, 20375, 45367,    13,    77,    13,    16,    11,
          5328,  5177,    25, 12466,   111, 30143, 16142, 43666, 18849, 16142,
         20375, 15166, 21169,    13,    77,    13,    16,   930,  8718,  3281,
            76,    25,   140,   109, 15166, 16843,   141,   228]),
 'labels': tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
        

In [119]:
trainer.train()



[{'encoded_term': tensor([11085,  8718,  3281,    76,    25, 12466,   115, 16142,   140,   109,
        25443,   119, 16843, 38857, 16142, 22177, 18849, 40623, 12466,   111,
        30143, 16142,   140,   115,    13,    77,    13,    16,    11,  1218,
         8718,  3281,    76,    25, 12466,   112, 16843,   140,   111, 16843,
        22177, 16843, 21169, 16142,   136,   223,   141,   228, 18849, 40623,
           13,    77,    13,    16,   930,  5328, 43612,    25]), 'encoded_target': tensor([43108, 16142, 31583, 35072, 30143, 25443,   112, 18849, 21727, 20375,
        21169, 15166,   141,   226, 18849, 40623]), 'input_seq': tensor([11085,  8718,  3281,    76,    25, 12466,   115, 16142,   140,   109,
        25443,   119, 16843, 38857, 16142, 22177, 18849, 40623, 12466,   111,
        30143, 16142,   140,   115,    13,    77,    13,    16,    11,  1218,
         8718,  3281,    76,    25, 12466,   112, 16843,   140,   111, 16843,
        22177, 16843, 21169, 16142,   136,   223,   1

Step,Training Loss


[{'encoded_term': tensor([49229, 12114,  3617,  4948,    25,   220, 21727, 16843, 30143, 18849,
        20375, 16843,   140,   109, 22177, 45035, 16843, 12466,   115, 16843,
        43108, 30143, 18849,    13,    77,    13,    16,    11,  5328,  5177,
           25, 12466,   123, 21169, 16843, 43666, 43108, 16843, 21727, 20375,
        45367, 16843,    13,    77,    13,    16,   930,  8718,  3281,    76,
           25]), 'encoded_target': tensor([  140,   123, 21169, 18849,   140,   111, 15166, 21169, 25443,   112]), 'input_seq': tensor([49229, 12114,  3617,  4948,    25,   220, 21727, 16843, 30143, 18849,
        20375, 16843,   140,   109, 22177, 45035, 16843, 12466,   115, 16843,
        43108, 30143, 18849,    13,    77,    13,    16,    11,  5328,  5177,
           25, 12466,   123, 21169, 16843, 43666, 43108, 16843, 21727, 20375,
        45367, 16843,    13,    77,    13,    16,   930,  8718,  3281,    76,
           25,   140,   123, 21169, 18849,   140,   111, 15166, 21169, 254

KeyboardInterrupt: 

In [19]:
data = Dataset.from_dict({'question': ['Predict hypernyms for word cat', 'Predict hyponyms for word dog'],
                          'answer': ['animal', 'wild cat']})

In [20]:
data[0]

{'question': 'Predict hypernyms for word cat', 'answer': 'animal'}

In [29]:
def formatting_func(example):
    text = f"### Question: {example['question']}\n ### Answer: {example['answer']}"
    return text

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts

trainer = SFTTrainer(
    "facebook/opt-350m",
    train_dataset=data,
    #dataset_text_field="response",
    #packing=True,
    formatting_func=formatting_prompts_func,
    max_seq_length=512,
)


                                                 

In [33]:
trainer.train_dataloader

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2
})

In [31]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=3, training_loss=4.3858642578125, metrics={'train_runtime': 1.5037, 'train_samples_per_second': 3.99, 'train_steps_per_second': 1.995, 'total_flos': 174734180352.0, 'train_loss': 4.3858642578125, 'epoch': 3.0})

In [17]:
trainer.evaluate()



{'eval_loss': 0.5869371891021729,
 'eval_runtime': 0.1766,
 'eval_samples_per_second': 11.327,
 'eval_steps_per_second': 5.664,
 'epoch': 3.0}

In [18]:
trainer.eval_dataset[0]

{'input_ids': [2, 41204], 'attention_mask': [1, 1]}