In [None]:
# Install the notebook's runtime dependencies into the kernel's environment.
# The extras spec is quoted so that shells which treat [...] as a glob pattern
# (e.g. zsh) pass it to pip unchanged; "transformers[torch]" already covers the
# plain "transformers" requirement, so the duplicate entry is dropped.
%pip install "transformers[torch]" tqdm torch datasets wandb accelerate

In [None]:
# %pip freeze --local | grep -v '^\-e' | cut -d = -f 1  | xargs -n1 pip install -U

In [None]:
import os

# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment. This cell runs before
# torch is imported below. NOTE(review): this flag is normally a debugging aid that
# makes CUDA kernel launches synchronous and slows training -- confirm it is still wanted.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [None]:
from transformers import GPT2Tokenizer, T5ForConditionalGeneration
from transformers import get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import wandb

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset

from torch.amp import autocast

In [None]:
def load_embs(dataset_path, tokenizer, max_size = -1):
    """Stream a dataset's ``train`` split and tokenize its prompt/response pairs.

    Every row is read through the keys ``system_prompt``, ``question`` and
    ``response``. The prompt text is ``"<LM>" + system_prompt + " " + question``.
    Prompt and response are encoded separately without special tokens (the
    prompt is additionally truncated to 1024 tokens by the tokenizer call).
    A pair is kept only when both sides are shorter than 768 tokens.

    Args:
        dataset_path: Dataset identifier forwarded to ``load_dataset``.
        tokenizer: Object whose ``encode(text, add_special_tokens=..., ...)``
            returns a list of token ids.
        max_size: Maximum number of pairs to collect. Any negative value
            (default ``-1``) means "no limit"; ``0`` yields an empty list.

    Returns:
        list[dict]: One ``{"input_ids": [...], "output_ids": [...]}`` dict per
        kept pair, in dataset order.
    """
    max_tokens = 768  # exclusive upper bound on both prompt and response length
    embs = []

    # Honour an explicit limit of zero up-front: the in-loop check only fires
    # after a sample has been appended, so it could never stop at zero.
    if max_size == 0:
        return embs

    train_dataset = load_dataset(dataset_path, split="train", streaming=True)

    for data in tqdm(train_dataset):
        prompt = "<LM>" + data["system_prompt"] + " " + data["question"]
        input_ids = tokenizer.encode(prompt, add_special_tokens=False, truncation=True, max_length=1024)
        if len(input_ids) >= max_tokens:
            # Prompt already disqualifies the row; skip tokenizing the response.
            continue

        output_ids = tokenizer.encode(data["response"], add_special_tokens=False)
        if len(output_ids) >= max_tokens:
            continue

        embs.append({"input_ids": input_ids, "output_ids": output_ids})
        # A negative max_size never equals len(embs), i.e. the stream is read to the end.
        if len(embs) == max_size:
            break

    return embs
    

In [None]:
class OrcaDataset(Dataset):
    """Map-style dataset of pre-tokenized prompt/response pairs padded to fixed lengths.

    Every item is padded to the longest prompt and the longest target seen at
    construction time, so all items share the same shape.

    Args:
        tokenizer: Object exposing ``bos_token_id``, ``eos_token_id`` and
            ``pad_token_id``. ``pad_token_id`` is used to fill prompts that are
            shorter than the longest one.
        embs: Iterable of dicts with ``"input_ids"`` and ``"output_ids"``
            sequences of token ids. ``eos_token_id`` is appended to every
            ``output_ids`` sequence.
    """

    def __init__(self, tokenizer, embs):
        self._data = []
        self.tokenizer = tokenizer
        self.max_input_len = 0
        self.max_output_len = 0

        self.bos_token_id = tokenizer.bos_token_id
        self.eos_token_id = tokenizer.eos_token_id
        self.pad_token_id = tokenizer.pad_token_id

        for emb in embs:
            input_ids = list(emb["input_ids"])
            # Targets always end with EOS so the model learns where to stop.
            output_ids = list(emb["output_ids"]) + [self.eos_token_id]
            self._data.append((input_ids, output_ids))
            self.max_input_len = max(self.max_input_len, len(input_ids))
            self.max_output_len = max(self.max_output_len, len(output_ids))

    def __len__(self):
        """Return the number of stored prompt/response pairs."""
        return len(self._data)

    def __getitem__(self, item: int):
        """Return the padded sample at index ``item``.

        Returns:
            dict: ``input_ids`` (prompt padded with ``pad_token_id``),
            ``attention_mask`` (1 for real prompt tokens, 0 for padding) and
            ``labels`` (target padded with ``-100``), each a 1-D ``torch.long``
            tensor.
        """
        input_ids, output_ids = self._data[item]

        input_npad = self.max_input_len - len(input_ids)
        output_npad = self.max_output_len - len(output_ids)

        padded_input = input_ids + [self.pad_token_id] * input_npad
        attention_mask = [1] * len(input_ids) + [0] * input_npad
        # Label padding uses -100 rather than a real token id.
        labels = output_ids + [-100] * output_npad

        # All three fields are tensors of the same dtype so samples can be
        # stacked by any collate function without type-specific handling.
        return {'input_ids': torch.tensor(padded_input, dtype=torch.long),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
                'labels': torch.tensor(labels, dtype=torch.long),
                }

In [None]:
# Checkpoint identifier handed to both from_pretrained calls below.
checkpoint = "ai-forever/FRED-T5-1.7B"
# Load the seq2seq model weights, then the tokenizer with '</s>' registered as its EOS token.
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint, eos_token="</s>")
# device='cuda:0' if torch.cuda.is_available() else "cpu"
# model.to(device)

In [None]:
# tokenizer.add_special_tokens({"additional_special_tokens": ["<ins>", "</ins>"]})
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize at most 32 prompt/response pairs from the streamed dataset,
# then wrap them in the padded training dataset.
dataset_path = "d0rj/OpenOrca-ru"
loaded_data = load_embs(dataset_path=dataset_path, tokenizer=tokenizer, max_size=32)
orca_dataset = OrcaDataset(tokenizer=tokenizer, embs=loaded_data)

In [None]:
# Show a meaningful summary of the dataset; the bare object would only render
# the default "<... object at 0x...>" repr.
{
    "num_samples": len(orca_dataset),
    "max_input_len": orca_dataset.max_input_len,
    "max_output_len": orca_dataset.max_output_len,
}

In [None]:
# Credentials must not live in the notebook: the W&B API key is read from the
# WANDB_API_KEY environment variable. When the variable is unset, `key` is None
# and wandb falls back to its own credential lookup / interactive prompt.
# NOTE: the key that used to be hardcoded in this cell must be treated as
# leaked and revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)
wandb.init(sync_tensorboard=True, name='train', project="hse-project", entity="aid_")

In [None]:
# Tunable run parameters, named once so the step arithmetic below and the
# TrainingArguments cannot drift apart.
batch_size = 1
num_train_epochs = 5
gradient_accumulation_steps = 10

# Batches per epoch (name and value kept as in the earlier version of this cell).
num_training_steps = len(orca_dataset)//batch_size

# Approximate number of optimizer updates over the whole run, assuming a single
# device: one update happens every `gradient_accumulation_steps` batches.
updates_per_epoch = max(1, num_training_steps // gradient_accumulation_steps)
total_update_steps = updates_per_epoch * num_train_epochs

# Warm up for at most 1,000 updates, but never longer than ~10% of the run.
# A fixed 1,000-step warm-up on a run with only a handful of updates would keep
# the learning rate at a tiny fraction of `learning_rate` for all of training.
warmup_steps = min(1_000, max(1, total_update_steps // 10))

training_args = TrainingArguments(
    output_dir="./FRED-T5-tune",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.05,
    # 'adamw_hf' selects the deprecated AdamW shipped with transformers;
    # 'adamw_torch' uses torch.optim.AdamW instead.
    optim='adamw_torch',
    lr_scheduler_type="linear",
    warmup_steps=warmup_steps,
    report_to="wandb",
    run_name="train",
    gradient_accumulation_steps=gradient_accumulation_steps,
    # use_cpu=True
)

# model.to(training_args.device)

In [None]:
# Assemble the Trainer from the model, tokenizer, dataset and arguments
# prepared in the preceding cells. No evaluation dataset is passed.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=orca_dataset,
)

# Start fine-tuning; as the cell's last expression, the value returned by
# train() is shown as the cell output.
trainer.train()