<a href="https://colab.research.google.com/github/WenboKou/10000hours/blob/main/llm_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Directory inside the Colab runtime where Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Download the fine-tuning samples and store them in the working directory as samples.jsonl.
!wget -O samples.jsonl https://raw.githubusercontent.com/WenboKou/LoRA/main/samples.jsonl

--2024-08-16 05:08:45--  https://raw.githubusercontent.com/WenboKou/LoRA/main/samples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 249809 (244K) [text/plain]
Saving to: ‘samples.jsonl’


2024-08-16 05:08:45 (10.9 MB/s) - ‘samples.jsonl’ saved [249809/249809]



In [3]:
# Install into the *kernel's* environment (%pip) rather than whatever `pip`
# the shell resolves to (!pip), and pin every package so a re-run installs the
# same versions.
# NOTE(review): peft 0.12.0 is assumed to be the release that was current when
# this notebook was last executed (2024-08-16) — confirm before relying on it.
%pip install -q transformers==4.38.2 peft==0.12.0

Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/130.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.2)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizer

In [4]:
import json
from dataclasses import dataclass, field
from typing import Dict, Optional, List

import torch
import transformers
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, Trainer
from transformers.trainer_pt_utils import LabelSmoother

In [5]:
@dataclass
class DataArguments:
    """Locations of the line-delimited JSON files used for fine-tuning.

    Attributes:
        train_data_path: Path of the training file; ``None`` when unset.
        eval_data_path: Path of the evaluation file; ``None`` when unset.
    """

    # Both fields default to None, so they are Optional[str] rather than str.
    train_data_path: Optional[str] = field(default=None)
    eval_data_path: Optional[str] = field(default=None)


@dataclass
class ModelArguments:
    """Identifies the pretrained checkpoint the notebook works with."""

    # Name or path handed to the ``from_pretrained`` loaders.
    model_name_or_path: str = "Qwen/Qwen2-0.5B"


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` plus the extra knobs this notebook needs.

    Attributes:
        cache_dir: Passed as ``cache_dir`` to the ``from_pretrained`` calls.
        optim: Optimizer name; defaults to ``"adamw_torch"``.
        model_max_length: Maximum sequence length used for tokenization.
        use_lora: Whether the model should be wrapped with LoRA adapters.
    """

    cache_dir: Optional[str] = None
    optim: str = "adamw_torch"
    model_max_length: int = field(
        default=64, metadata={"help": "Maximum sequence length."}
    )
    use_lora: bool = field(default=False)


@dataclass
class LoraArguments:
    """Settings that describe the LoRA adapter.

    Attributes:
        lora_r: Value forwarded as ``r`` to ``LoraConfig``.
        lora_alpha: Value forwarded as ``lora_alpha`` to ``LoraConfig``.
        lora_dropout: Value forwarded as ``lora_dropout`` to ``LoraConfig``.
        lora_target_modules: Module names forwarded as ``target_modules``.
        lora_weight_path: Not read by the code in this notebook section.
        lora_bias: Value forwarded as ``bias`` to ``LoraConfig``.
        q_lora: Together with ``use_lora``, switches on the 4-bit
            ``BitsAndBytesConfig`` when the model is loaded.
    """

    lora_r: int = field(default=64)
    lora_alpha: int = field(default=16)
    lora_dropout: float = field(default=0.05)
    # A factory is required so each instance gets its own list.
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "up_proj", "gate_proj", "down_proj",
        ]
    )
    lora_weight_path: str = field(default="")
    lora_bias: str = field(default="none")
    q_lora: bool = field(default=False)

In [6]:
def preprocess(
        messages,
        tokenizer: transformers.PreTrainedTokenizer,
        max_len: int,
) -> Dict:
    """Tokenize chat conversations into fixed-length tensors for supervised fine-tuning.

    Args:
        messages: Iterable of conversations; each element is handed unchanged
            to ``tokenizer.apply_chat_template``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and
            ``pad_token_id``.
        max_len: Length every tokenized conversation is truncated or padded to.

    Returns:
        A dict with three tensors:
        ``input_ids`` — the token ids, one row per conversation;
        ``target_ids`` — a copy of ``input_ids`` in which every position equal
        to ``tokenizer.pad_token_id`` is replaced by
        ``LabelSmoother.ignore_index``;
        ``attention_mask`` — ``True`` wherever ``input_ids`` differs from
        ``tokenizer.pad_token_id``.
    """

    token_rows = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            truncation=True,
            # Conversations are tokenized one at a time, so `padding=True`
            # (pad to the longest sequence in the call) would never pad
            # anything and rows shorter than `max_len` would make the
            # torch.tensor() call below fail on a ragged list. Padding to
            # `max_length` guarantees rectangular rows.
            padding="max_length",
            max_length=max_len,
        )
        for conversation in messages
    ]

    # int64 is the index dtype expected by PyTorch's class-index losses.
    input_ids = torch.tensor(token_rows, dtype=torch.long)

    # Labels mirror the inputs, except that padding positions are masked out.
    target_ids = input_ids.clone()
    target_ids[target_ids == tokenizer.pad_token_id] = LabelSmoother.ignore_index
    attention_mask = input_ids.ne(tokenizer.pad_token_id)

    return dict(
        input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
    )


class SupervisedDataset(Dataset):
    """In-memory dataset of pre-tokenized conversations for supervised fine-tuning.

    The ``"messages"`` entry of every record is tokenized once, in the
    constructor, through ``preprocess``; indexing afterwards only slices the
    stored tensors.
    """

    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
        super().__init__()

        conversations = []
        for record in raw_data:
            conversations.append(record["messages"])

        encoded = preprocess(conversations, tokenizer, max_len)
        self.input_ids = encoded["input_ids"]
        self.target_ids = encoded["target_ids"]
        self.attention_mask = encoded["attention_mask"]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return example ``i``; ``target_ids`` is exposed under the key ``labels``."""
        return {
            "input_ids": self.input_ids[i],
            "labels": self.target_ids[i],
            "attention_mask": self.attention_mask[i],
        }


def _load_jsonl(path: str) -> List[Dict]:
    """Read a line-delimited JSON file into a list, ignoring blank lines."""
    with open(path, 'r', encoding='utf-8') as file:
        # A trailing or stray empty line would otherwise crash json.loads.
        return [json.loads(line) for line in file if line.strip()]


def make_supervised_data_module(
        tokenizer: transformers.PreTrainedTokenizer,
        data_args: "DataArguments",
        max_len: int
) -> Dict:
    """Build the train (and optional eval) datasets for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer forwarded to ``SupervisedDataset``.
        data_args: Object exposing ``train_data_path`` and ``eval_data_path``;
            each file holds one JSON record per line.
        max_len: Sequence length forwarded to ``SupervisedDataset``.

    Returns:
        ``{"train_dataset": ..., "eval_dataset": ...}``. ``eval_dataset`` is
        ``None`` when ``eval_data_path`` is unset or the file yields no
        records. No data collator is included.
    """

    train_data = _load_jsonl(data_args.train_data_path)
    eval_data = _load_jsonl(data_args.eval_data_path) if data_args.eval_data_path else []

    train_dataset = SupervisedDataset(train_data, tokenizer, max_len)
    eval_dataset = SupervisedDataset(eval_data, tokenizer, max_len) if eval_data else None
    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

In [7]:
# Everything a reader might want to tune for this run lives in this cell.
training_args = TrainingArguments(
    # Adapter switch, output location and checkpointing
    use_lora=True,
    output_dir="output_qwen",
    save_strategy="steps",
    save_steps=10,
    save_total_limit=10,
    # Logging
    logging_steps=1,
    report_to="none",
    # Precision
    bf16=True,
    # Optimizer and learning-rate schedule
    learning_rate=3e-4,
    weight_decay=0.01,
    adam_beta2=0.95,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    # Run length and batching
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
)
data_args = DataArguments(train_data_path="samples.jsonl")
model_args = ModelArguments()
lora_args = LoraArguments()

# Pick the compute dtype from the precision flags; fp16 takes precedence over bf16.
if training_args.fp16:
    compute_dtype = torch.float16
elif training_args.bf16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float32

config = transformers.AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
)
config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

In [8]:
# The 4-bit quantization config is only built when LoRA and q_lora are both on;
# otherwise the model is loaded without a quantization config.
if training_args.use_lora and lora_args.q_lora:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )
else:
    bnb_config = None

model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=training_args.cache_dir,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)

# The tokenizer shares the checkpoint and cache with the model; padding is
# applied on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=False,
)

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
print("start training")

# Seed the RNGs *before* the LoRA layers are created so their random
# initialisation is the same on every Restart-and-Run-All.
transformers.set_seed(training_args.seed)

# Only the adapter wrapping depends on `use_lora`; training itself always
# runs, so switching LoRA off no longer turns this cell into a silent no-op.
if training_args.use_lora:
    lora_config = LoraConfig(
        r=lora_args.lora_r,
        lora_alpha=lora_args.lora_alpha,
        target_modules=lora_args.lora_target_modules,
        lora_dropout=lora_args.lora_dropout,
        bias=lora_args.lora_bias,
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)

    model.print_trainable_parameters()

data_module = make_supervised_data_module(tokenizer, data_args, training_args.model_max_length)

trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

trainer.train()

# save_state() only writes the trainer state; save_model() persists the trained
# weights to the output directory, so the result of the run is kept even when
# the last step does not coincide with a `save_steps` checkpoint.
trainer.save_state()
trainer.save_model(training_args.output_dir)

start training
trainable params: 35,192,832 || all params: 529,225,600 || trainable%: 6.6499


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,2.6569
2,2.5945
3,1.8546
4,1.4381
5,1.0774
6,0.8776
7,0.7034
8,0.6915
9,0.5621
10,0.5568


