In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_PROJECT"] = "llm-sft"

model_identifier = "Qwen2.5-7B-Instruct"
model_name = f"Qwen/{model_identifier}"
data_dir = "data/llm-sft"
models_dir = os.path.join(data_dir, "models", model_identifier)
!modelscope download --model $model_name --local_dir $models_dir

In [2]:
model = AutoModelForCausalLM.from_pretrained(
    models_dir, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
%%bash
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupCosineLR",
        "params": {
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 1,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [4]:
import torch
from datasets import concatenate_datasets, load_dataset

DEFAULT_SYSTEM_PROMPT = (
    "你是一个机器人助手，你需要尽可能准确、简单的回复用户的所有问题。"
)

batch_size = 1
max_length = 512
fin_dataset = load_dataset("Maciel/FinCUGE-Instruction")
# medi_dataset = load_dataset("SylvanL/Traditional-Chinese-Medicine-Dataset-SFT")

SOURCE_TEMPLATE = "<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n"
TARGET_TEMPLATE = "{assistant}<|endoftext|>"


def infer_seqlen(source_len: int, target_len: int, cutoff_len: int):
    r"""
    Computes the real sequence length after truncation by the cutoff_len.
    """
    if target_len * 2 < cutoff_len:  # truncate source
        max_target_len = cutoff_len
    elif source_len * 2 < cutoff_len:  # truncate target
        max_target_len = cutoff_len - source_len
    else:  # truncate both
        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))

    new_target_len = min(max_target_len, target_len)
    max_source_len = max(cutoff_len - new_target_len, 0)
    new_source_len = min(max_source_len, source_len)
    return new_source_len, new_target_len


def preprocess_dataset(dataset, all_columns, instruction_col, input_col, output_col):

    def preprocess(examples):
        sources = []
        targets = []

        if instruction_col is not None:
            iter_t = (
                examples[instruction_col],
                examples[input_col],
                examples[output_col],
            )
        else:
            iter_t = examples[input_col], examples[output_col]

        for text in zip(*iter_t):
            if instruction_col is not None:
                instruction, input, output = text
                input = instruction + input
            else:
                input, output = text
            source_message = SOURCE_TEMPLATE.format(
                system=DEFAULT_SYSTEM_PROMPT, user=input
            )
            target_message = TARGET_TEMPLATE.format(assistant=output)
            source = tokenizer(
                source_message,
                return_attention_mask=False,
                add_special_tokens=False,
            )["input_ids"]
            target = tokenizer(
                target_message,
                add_special_tokens=False,
                return_attention_mask=False,
            )["input_ids"]
            source_len, target_len = infer_seqlen(len(source), len(target), max_length)
            source, target = source[:source_len], target[:target_len]
            sources.append(source + target)
            targets.append([-100] * source_len + target)
        return {"input_ids": sources, "labels": targets}

    return dataset.map(
        preprocess,
        batched=True,
        batch_size=batch_size,
        num_proc=16,
        remove_columns=all_columns,
        keep_in_memory=False,
    )


fin_dataset = preprocess_dataset(
    fin_dataset, fin_dataset["train"].column_names, "instruction", "input", "output"
)
# medi_dataset = preprocess_dataset(
#     medi_dataset, medi_dataset["train"].column_names, "instruction", "input", "output"
# )
# medi_dataset = medi_dataset.shuffle(seed=42)
# medi_dataset = medi_dataset["train"].shard(num_shards=100, index=0)
# dataset = concatenate_datasets([fin_dataset["train"], medi_dataset]).shuffle(seed=42)
fin_dataset = fin_dataset["train"].shuffle(seed=42)

In [5]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 40,370,176 || all params: 7,655,986,688 || trainable%: 0.5273


In [6]:
from transformers import (
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

train_dataset = fin_dataset

args = TrainingArguments(
    deepspeed="ds_config_zero3.json",
    output_dir=os.path.join(data_dir, "outputs", model_name),
    logging_dir=os.path.join(data_dir, "logs", model_name),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_only_model=True,
    save_steps=0.2,
    logging_steps=0.01,
    report_to=["wandb"],
    run_name=f"{model_name}-fin",
    bf16=True,
    learning_rate=1e-4,
    gradient_accumulation_steps=8,
    max_grad_norm=1,
    warmup_steps=200,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    dataloader_num_workers=16,
    dataloader_pin_memory=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)
trainer.train()

[2024-10-22 13:43:14,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-10-22 13:43:15,003] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-10-22 13:43:15,004] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


Using /home/nevermore/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/nevermore/.cache/torch_extensions/py310_cu124/fused_adam/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module fused_adam...


ninja: no work to do.
Time to load fused_adam op: 0.055814504623413086 seconds


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msting_nevermore[0m ([33msting_nevermore-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
154,1.8538
308,0.7606
462,0.6861
616,0.6107
770,0.5853
924,0.6126
1078,0.5375
1232,0.6031
1386,0.5591
1540,0.5411


TrainOutput(global_step=15392, training_loss=0.5109334103530757, metrics={'train_runtime': 21238.0598, 'train_samples_per_second': 5.798, 'train_steps_per_second': 0.725, 'total_flos': 7.12519465014657e+17, 'train_loss': 0.5109334103530757, 'epoch': 0.9999918789640807})

In [11]:
from peft import PeftModel

model.save_pretrained(os.path.join(data_dir, "models", f"{model_identifier}-lora"))

In [8]:
# from transformers import AutoModelForCausalLM
# from peft import PeftModel

# model = PeftModel.from_pretrained(
#     model,
#     model_id="/home/nevermore/model_deploy/Qwen-7B/lora",
# )

In [10]:
model.eval()

messages = [
    {
        "role": "system",
        "content": DEFAULT_SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": """请评估以下内容的整体情感倾向，包含积极、消极和中性。他妈的出不完的货""",
    },
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    padding=True,
).to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=1024,
        top_p=0.8,
        top_k=20,
        temperature=0.9,
        do_sample=True,
    )
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

system
你是一个机器人助手，你需要尽可能准确、简单的回复用户的所有问题。
user
请评估以下内容的整体情感倾向，包含积极、消极和中性。他妈的出不完的货
assistant
所属情感是消极。


In [2]:
classifier_module = __import__("scripts.long_seq_classifier", fromlist=True)

In [3]:
getattr(classifier_module, "get_model", None)("meta-llama/llama-3.2-3B-Instruct", 3)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LongSeqClassifier(
  (base_model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
   