In [1]:
import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM
import os
import sys
from typing import List

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    # prepare_model_for_int8_training,
)

import fire
import torch
from datasets import load_dataset, Dataset
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from pylab import rcParams
import json

%matplotlib inline
sns.set(rc={'figure.figsize':(8, 6)})
sns.set(rc={'figure.dpi':100})
sns.set(style='white', palette='muted', font_scale=1.2)


In [2]:
CUTOFF_LEN = 10

In [3]:
def generate_prompt(data_point):    
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""

In [4]:
from peft import PeftModel

BASE_MODEL = "decapoda-research/llama-7b-hf"
# finetuned = "mmosiolek/polpaca-lora-7b"

model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    # load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="cuda",
    offload_folder="offload"
)

# model = PeftModel.from_pretrained(model, finetuned).to("cuda")
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
def tokenize(prompt, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [6]:
from src.data_processing import load_raw_fairytales_dataset

raw_fairytales = load_raw_fairytales_dataset(100)

Reading files: 2124 files [00:00, 3868.85 files/s]


In [7]:
n_less_fairytales = []
for tale in raw_fairytales:
    tale = tale.replace('\n\n', ' ')
    tale = tale.replace('\n', ' ')
    n_less_fairytales.append(tale)
n_less_fairytales[0].split('. ')

['Dawno, dawno temu, w małej wiosce zwanej Doliną Radosnych Mieszkańców, żył pewien dzielny chłopiec o imieniu Bolek',
 'Był on mały, ale pełen odwagi i fantazji',
 'Każdego dnia Bolek wyruszał na poszukiwanie przygód, które czekały na niego za żywopłotem, na granicy wioski',
 'Pewnego pięknego poranka, Bolek postanowił zwiedzić pobliski las',
 'Spakował swoje ukochane kanapki z masłem orzechowym, napełnił butelkę wodą i ruszył w drogę',
 'Kiedy dotarł do lasu, zobaczył tajemniczą ścieżkę, którą jeszcze żaden mieszkaniec Doliny nie śmiał podążać',
 'Oczywiście, Bolek postanowił, że to jest dokładnie to, czego potrzebuje! Wraz ze swoim małym plecakiem na plecach, Bolek wkroczył na nieznane terytorium',
 'Las był pełen szemrzących drzew i śpiewających ptaków',
 'Bolek stał zachwycony, kiedy zebrał kilka pięknych kolorowych kwiatów, które później zaniesie swojej mamie',
 'Wszystko wydawało się urokliwe, aż do momentu, kiedy usłyszał głośne trzaskanie gałęzi',
 'Bolek podążył za źródłem dź

In [8]:
dataset_list = []
for tale in n_less_fairytales:
    tale = tale.split('. ')
    for sentence in tale:
        dataset_list.append({"instruction": "Napisz bajkę",
                             "input": '',
                             "output": sentence
                             })

dataset = Dataset.from_list(dataset_list)

In [9]:
train_val = dataset.train_test_split(
    test_size=200, shuffle=True, seed=42
)
train_data = (
    train_val["train"].shuffle().map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].shuffle().map(generate_and_tokenize_prompt)
)

Map:   0%|          | 0/3123 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

BATCH_SIZE = 16
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300
OUTPUT_DIR = "experiments"

In [11]:
# model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [12]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard" 
)

In [13]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [14]:
train_data.column_names

['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels']

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator,
)
model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

# model = torch.compile(model)

trainer.train()
model.save_pretrained(OUTPUT_DIR)

Step,Training Loss,Validation Loss


In [None]:
set(model.hf_device_map.values())

In [None]:
devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
if len(devices) > 1:
    print(f"is_model_parallel: True")
else:
    print(f"is_model_parallel:self.args.device!= {torch.device(devices[0])}")
