Установка необходимых библиотек

In [None]:
# Install the libraries used by the cells below (IPython shell magics).
!pip install transformers
!pip install huggingface-hub
!pip install accelerate
# Presumably tokenizer backends (Llama tokenizer / translation pipeline) — TODO confirm.
!pip install sentencepiece
!pip install sacremoses
!pip install datasets
# NOTE(review): bitsandbytes is pulled from the TestPyPI index rather than the
# default PyPI index — confirm this is still required.
!pip install -i https://test.pypi.org/simple/ bitsandbytes
# accelerate was already installed above; this line upgrades it to the latest release.
!pip install -U accelerate
!pip install peft
!pip install addict

Авторизуемся через Hugging Face, чтобы скачать Llama-2.

In [None]:
import os
from getpass import getpass

import huggingface_hub

# Never store an access token in the notebook itself: anything committed here
# is readable by everyone with access to the file and must be revoked.
# Read the token from the HF_TOKEN environment variable and fall back to an
# interactive prompt that does not echo the input.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
huggingface_hub.login(token=token)

#huggingface_hub.notebook_login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Импортируем необходимые модули

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from datasets import load_dataset, Dataset
from torch.utils.data import IterableDataset
from tqdm import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Загрузим базовую модель и её токенизатор

In [None]:
base_model_name = "meta-llama/Llama-2-7b-chat-hf"

# Tokenizer and model are both fetched from the same Hub repository.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the weights 4-bit quantized, with float16 as the torch dtype, and let
# `device_map='auto'` decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def infer(model, tokenizer, text, max_new_tokens=1024, seed=20):
    """Sample a continuation of ``text`` from ``model`` and return it decoded.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it is called with
            ``return_tensors="pt"`` and must provide ``decode``.
        text: Prompt string to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        seed: Value passed to ``torch.manual_seed`` right before sampling so
            repeated calls with the same inputs are reproducible.

    Returns:
        The first generated sequence (prompt plus continuation) decoded with
        ``tokenizer.decode``; special tokens are not stripped.
    """
    # Put the prompt on whatever device the model lives on instead of
    # assuming CUDA is available.
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    # No separate forward pass is needed before generation: its result was
    # never used and it only cost memory and time.
    torch.manual_seed(seed)
    generation_output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=1,
        do_sample=True,
        temperature=0.95,
        top_p=0.9,
    )
    return tokenizer.decode(generation_output[0])

In [None]:
# Baseline sample: ask the freshly loaded base model to continue a multi-turn
# flight-booking dialogue (turns are separated by the literal "<SEP>" marker).
infer(model, tokenizer, """ User: Are there any flights available for a oneway trip? <SEP> Agent: Where to? Do you have a particular departure date in mind? <SEP> User: I plan to depart from Phoenix, AZ on March 3rd and I'd like to go to Chi-Town.""")

"<s>  User: Are there any flights available for a oneway trip? <SEP> Agent: Where to? Do you have a particular departure date in mind? <SEP> User: I plan to depart from Phoenix, AZ on March 3rd and I'd like to go to Chi-Town. Bedeutung: I am looking for a one-way flight from Phoenix, Arizona to Chicago, Illinois on March 3rd. Can you please check if any flights are available?</s>"

Загрузим датасет для обучения и тестирования

In [None]:
full_dataset = load_dataset('vidhikatkoria/SGD_Flights', split='train')

# Working subset: the first 10% of the train split.
dataset = full_dataset.select(range(len(full_dataset) // 10))

# Evaluation slice: rows in the 20%-30% range of the working subset.
# It gets its own name only; the previous chained assignment also overwrote
# `dataset` with this slice.
# NOTE(review): create_datasets() streams the same split for training
# (everything after the first cfg.size_valid_set rows), so these rows are
# probably also seen during fine-tuning — consider evaluating on rows the
# training stream never touches.
test_dataset = dataset.select(range(len(dataset) * 2 // 10, len(dataset) * 3 // 10))

# Дообучение модели

In [None]:
from torch.utils.data import IterableDataset
from tqdm import tqdm
import addict
from transformers import (
    Trainer,
    TrainingArguments,
    logging,
    set_seed)
import os

In [None]:
# Make the embedding outputs require gradients before the adapters are added
# (the training config further down enables gradient checkpointing).
model.enable_input_require_grads()

# LoRA hyper-parameters for causal language modelling.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# From here on `model` refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
def prepare_sample_text(example):
    """Render one dataset row as a single training string.

    The row's ``context`` and ``response`` fields are placed after the
    ``Context:`` and ``Answer:`` labels, separated by a blank line.
    """
    return "Context: {}\n\nAnswer: {}".format(example['context'], example['response'])


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Return the characters-per-token ratio measured on the first rows of ``dataset``.

    At most ``nb_examples`` rows are formatted with ``prepare_sample_text`` and
    tokenized; the result is total characters divided by total tokens.
    """

    def count_tokens(text):
        # Fast tokenizers expose the token list on the encoding object;
        # slow ones only offer `tokenize`.
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    n_chars = 0
    n_tokens = 0
    # Pairing with a range caps the number of rows pulled from the dataset.
    limited_rows = zip(range(nb_examples), iter(dataset))
    for _, row in tqdm(limited_rows, total=nb_examples):
        sample = prepare_sample_text(row)
        n_chars += len(sample)
        n_tokens += count_tokens(sample)

    return n_chars / n_tokens


def create_datasets(tokenizer, cfg):
    """Load the configured dataset and return packed (train, validation) datasets.

    ``cfg`` supplies ``dataset_name``, ``subset``, ``split``, ``streaming``,
    ``num_workers``, ``size_valid_set``, ``shuffle_buffer``, ``seed`` and
    ``seq_length``. In streaming mode the first ``size_valid_set`` rows form
    the validation set and the shuffled remainder the training set; otherwise
    a 90/10 ``train_test_split`` is used. Both parts are wrapped in
    ``ConstantLengthDataset`` (the training one is infinite).
    """
    raw = load_dataset(
        cfg.dataset_name,
        data_dir=cfg.subset,
        split=cfg.split,
        use_auth_token=True,
        num_proc=None if cfg.streaming else cfg.num_workers,
        streaming=cfg.streaming,
    )

    if not cfg.streaming:
        parts = raw.train_test_split(test_size=0.1, seed=cfg.seed)
        train_data, valid_data = parts["train"], parts["test"]
        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
    else:
        print("Loading the dataset in streaming mode")
        valid_data = raw.take(cfg.size_valid_set)
        remainder = raw.skip(cfg.size_valid_set)
        train_data = remainder.shuffle(buffer_size=cfg.shuffle_buffer, seed=cfg.seed)

    # The ratio is used by ConstantLengthDataset to size its text buffer.
    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    def pack(data, infinite):
        # Both splits share every setting except whether iteration restarts.
        return ConstantLengthDataset(
            tokenizer,
            data,
            infinite=infinite,
            seq_length=cfg.seq_length,
            chars_per_token=chars_per_token,
        )

    return pack(train_data, infinite=True), pack(valid_data, infinite=False)

In [None]:
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that yields fixed-length chunks of token ids.

    Examples are formatted to text, gathered into a character-bounded buffer,
    tokenized in one batch, joined with a separator token and cut into chunks
    of exactly ``seq_length`` ids. A trailing partial chunk of each buffer is
    dropped.

        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Iterable of examples to draw from.
            infinite (bool): If True the iterator is reset after the dataset
                reaches its end, otherwise iteration stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (float): Number of characters per token used to
                estimate number of tokens in text buffer.
            formatting_func (callable, optional): Maps one example to the text
                to tokenize. Defaults to the module-level
                ``prepare_sample_text``.

        Yields:
            dict: ``{"input_ids": LongTensor, "labels": LongTensor}``, both
            holding the same ``seq_length`` ids.
    """

    def __init__(
            self,
            tokenizer,
            dataset,
            infinite=False,
            seq_length=1024,
            num_of_sequences=1024,
            chars_per_token=3.6,
            formatting_func=None,
    ):
        self.tokenizer = tokenizer
        # Echo the EOS id so the separator candidate is visible in the log.
        print(tokenizer.eos_token_id)
        # Separator appended after every example; 0 when the tokenizer has no EOS id.
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id else 0
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Buffer budget in characters (not tokens).
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.formatting_func = formatting_func

    def _format(self, example):
        """Turn one example into text, resolving the default formatter lazily."""
        if self.formatting_func is not None:
            return self.formatting_func(example)
        return prepare_sample_text(example)

    def __iter__(self):
        iterator = iter(self.dataset)
        # Tracks whether the current pass over the dataset produced anything,
        # so an empty dataset cannot trap the infinite mode in an endless
        # restart loop.
        seen_since_restart = False
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.max_buffer_size:
                try:
                    example = next(iterator)
                except StopIteration:
                    if self.infinite and seen_since_restart:
                        iterator = iter(self.dataset)
                        seen_since_restart = False
                        continue
                    more_examples = False
                    break
                seen_since_restart = True
                buffer.append(self._format(example))
                buffer_len += len(buffer[-1])

            # Nothing was collected (dataset exhausted or empty): do not call
            # the tokenizer with an empty batch.
            if not buffer:
                break

            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            for start in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[start: start + self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

In [None]:
# Run configuration, accessed below via attribute syntax (cfg.max_steps, ...).
cfg = addict.Dict(
    # --- optimisation ---
    dataset_name="vidhikatkoria/SGD_Flights",
    max_steps=200,
    batch_size=1,
    learning_rate=1e-4,
    gradient_accumulation_steps=1,
    lr_scheduler_type="cosine",
    fp16=False,
    gradient_checkpointing=True,
    weight_decay=0.05,
    num_warmup_steps=100,
    # --- data loading, bookkeeping and output ---
    subset="",
    split="train",
    size_valid_set=206,
    streaming=True,
    shuffle_buffer=5000,
    seq_length=1024,
    local_rank=0,
    seed=0,
    num_workers=None,
    output_dir="./checkpoints",
    log_freq=1,
    eval_freq=10,
    save_freq=10,
)

train_dataset, eval_dataset = create_datasets(tokenizer, cfg)
train_dataset.start_iteration = 0

print("Starting main loop")

# Step-based schedule: evaluation, checkpointing and logging frequencies all
# come from cfg.
training_args = TrainingArguments(
    output_dir=cfg.output_dir,
    max_steps=cfg.max_steps,
    evaluation_strategy="steps",
    eval_steps=cfg.eval_freq,
    save_steps=cfg.save_freq,
    logging_steps=cfg.log_freq,
    dataloader_drop_last=True,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
    gradient_checkpointing=cfg.gradient_checkpointing,
    learning_rate=cfg.learning_rate,
    lr_scheduler_type=cfg.lr_scheduler_type,
    warmup_steps=cfg.num_warmup_steps,
    weight_decay=cfg.weight_decay,
    fp16=cfg.fp16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Training...")
trainer.train()

print("Saving last checkpoint of the model")
final_checkpoint_dir = os.path.join(cfg.output_dir, "final_checkpoint/")
model.save_pretrained(final_checkpoint_dir)



Loading the dataset in streaming mode


100%|██████████| 400/400 [00:01<00:00, 368.45it/s]


The character to token ratio of the dataset is: 3.39
2
2
Starting main loop
Training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,2.3148,2.331397
20,2.0886,2.195784
30,1.7882,1.928825
40,1.801,1.779259
50,1.8249,1.705979
60,1.6494,1.626067
70,1.4977,1.52731
80,1.3812,1.411614
90,1.4316,1.347975
100,1.412,1.298818




Saving last checkpoint of the model


# Тестирование дообученной модели

In [None]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')

# Reload the base checkpoint (4-bit, float16) and attach the adapter weights
# saved at the end of training on top of it.
new_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        'meta-llama/Llama-2-7b-chat-hf',
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_4bit=True,
    ),
    "./checkpoints/final_checkpoint",
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Same dialogue prompt as the baseline sample, now answered by the model with
# the fine-tuned adapter attached.
# NOTE(review): training samples are built by prepare_sample_text() as
# "Context: ...\n\nAnswer: ...", while this prompt is passed without that
# template — confirm this is intended.
infer(new_model, tokenizer, """ User: Are there any flights available for a oneway trip? <SEP> Agent: Where to? Do you have a particular departure date in mind? <SEP> User: I plan to depart from Phoenix, AZ on March 3rd and I'd like to go to Chi-Town.""")

"<s>  User: Are there any flights available for a oneway trip? <SEP> Agent: Where to? Do you have a particular departure date in mind? <SEP> User: I plan to depart from Phoenix, AZ on March 3rd and I'd like to go to Chi-Town. Context: Searching for economy class flights.\n\nAnswer: Yes, here's an American Airlines flight from Phoenix to Chicago, which departs on the 3rd of March and has 0 layovers. It costs $162.</s>"

In [None]:
def count_perplexity(model, tokenizer, test_dataset, max_length=1024, stride=256):
    """Compute sliding-window perplexity of ``model`` on the dataset contexts.

    All ``test_dataset["context"]`` strings are joined with blank lines and
    tokenized once. The token sequence is then scored in windows of at most
    ``max_length`` tokens that advance by ``stride``; in every window only the
    tokens not scored by a previous window contribute to the loss.

    Args:
        model: Causal LM called as ``model(input_ids, labels=...)`` and
            returning an object with a ``loss`` attribute (mean NLL over the
            unmasked labels). Must expose ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Mapping-like object whose ``"context"`` entry is a
            sequence of strings.
        max_length: Window size in tokens.
        stride: Number of tokens the window advances per step.

    Returns:
        A 0-dim tensor holding ``exp`` of the token-weighted mean negative
        log-likelihood.

    Raises:
        ValueError: If the text is too short to score a single token.
    """
    encodings = tokenizer("\n\n".join(test_dataset["context"]), return_tensors="pt")
    seq_len = encodings.input_ids.size(1)
    device = model.device

    nll_sum = 0.0
    n_scored = 0
    prev_end_loc = 0

    # Evaluate deterministically (no dropout) and restore the caller's mode
    # afterwards.
    was_training = model.training
    model.eval()
    try:
        for begin_loc in tqdm(range(0, seq_len, stride)):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # tokens not yet scored
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            # Mask the overlapping prefix: it only serves as context.
            target_ids[:, :-trg_len] = -100

            # HF causal-LM heads shift the labels by one position internally,
            # so the first position of a window is never a prediction target.
            n_targets = int((target_ids[:, 1:] != -100).sum())
            if n_targets > 0:
                with torch.no_grad():
                    outputs = model(input_ids, labels=target_ids)
                # `loss` is a mean over this window's targets; weight it by
                # their count so windows of different sizes are combined
                # correctly.
                nll_sum += outputs.loss.float().item() * n_targets
                n_scored += n_targets

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
            torch.cuda.empty_cache()
    finally:
        model.train(was_training)

    if n_scored == 0:
        raise ValueError("Not enough tokens in test_dataset['context'] to compute perplexity")

    return torch.exp(torch.tensor(nll_sum / n_scored))

In [None]:
# `model` was wrapped by get_peft_model() and fine-tuned earlier in this
# notebook, so it no longer represents the original weights. Score the
# original model by switching the LoRA adapter off on the reloaded model for
# the duration of the evaluation.
with new_model.disable_adapter():
    base_perplexity = count_perplexity(new_model, tokenizer, test_dataset)
print(f'Perplexity у исходной модели: {base_perplexity}')

 92%|█████████▏| 48/52 [03:32<00:17,  4.43s/it]


Perplexity у исходной модели: 5.207478046417236


In [None]:
# Perplexity of the base model with the fine-tuned adapter attached.
finetuned_perplexity = count_perplexity(new_model, tokenizer, test_dataset)
print(f'Perplexity у дообученной модели: {finetuned_perplexity}')

 92%|█████████▏| 48/52 [03:33<00:17,  4.45s/it]


Perplexity у дообученной модели: 3.535085439682007


### Возможный код для перевода датасета (не используется)

In [None]:
# import transformers

# src = "en"
# dst = "ru"

# task_name = f"translation_{src}_to_{dst}"
# model_name = f"Helsinki-NLP/opus-mt-{src}-{dst}"

# translator  = transformers.pipeline(task_name, model=model_name, tokenizer=model_name, device='cuda')

# def translate(example):
#     example['response'] = translator(example['response'])[0]["translation_text"]
#     return example

# dataset = dataset.map(translate)

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

'Пользователь: Да, это будет здорово. <SEP> Агент: Пожалуйста, подтвердите вашу бронирование на одного пассажира на рейсе Economy Delta Airlines из Лос-Анджелеса в Нью-Йорк. Этот рейс отправится в следующий понедельник в 13:40 и вернется 14 марта в 7:30. <SEP>'