In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch

os.environ["WANDB_PROJECT"] = "gemma-introduce"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
data_dir = "data/gemma-introduce"

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b", device_map="cuda", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")

In [None]:
model

In [4]:
from datasets import load_dataset

ds = load_dataset("ttxy/sentiment")

classes = {0: "负面", 1: "正面"}
ds = ds.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)

In [5]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = 3

In [6]:
import torch

max_length = 64


def preprocess_function(examples, text_column="text", label_column="text_label"):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [-100] * (
            max_length - len(label_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
processed_ds = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    # modules_to_save=["lm_head"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

train_ds = processed_ds["train"]
eval_ds = processed_ds["validation"]

lr = 1e-5
num_epochs = 3
batch_size = 32

args = TrainingArguments(
    run_name="gemma-introduce",
    output_dir=os.path.join(data_dir, "output"),
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="steps",
    eval_steps=0.1,
    bf16=True,
    learning_rate=lr,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    report_to=["wandb"],
    logging_dir=os.path.join(data_dir, "logs"),
    logging_steps=0.02,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)
trainer.train()

In [18]:
model.save_pretrained(os.path.join(data_dir, "gemma2-2b-lora"))

In [None]:
from peft import PeftConfig, PeftModelForCausalLM

config = PeftConfig.from_pretrained(os.path.join(data_dir, "gemma2-2b-lora"))
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModelForCausalLM.from_pretrained(
    model, os.path.join(data_dir, "gemma2-2b-lora"), device_map="cuda"
)

In [7]:
inputs = tokenizer(
    f"text : 我草，真难吃 Label : ",
    return_tensors="pt",
    padding="max_length",
    max_length=64,
    truncation=True,
)

In [None]:
inputs

In [23]:
model.eval()
# model.to(torch.device("cuda"))
with torch.no_grad():
    intpus = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits

In [None]:
tokenizer.batch_decode(logits[..., :-1, :].contiguous().view(-1, tokenizer.vocab_size))

In [10]:
from datasets import load_dataset

ds = load_dataset("hfl/stem_zh_instruction")

In [56]:
from itertools import chain
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")


def tokenize_function(examples):
    output = tokenizer(examples["text"])
    return output


block_size = tokenizer.model_max_length


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    print(concatenated_examples)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


ds = load_dataset("text", data_files="data/ruozhiba_qa2449_gpt4o.json")
ds = ds.map(
    tokenize_function, batched=True, num_proc=1, remove_columns="text", batch_size=2
)
ds = ds.map(
    group_texts,
    batched=True,
    batch_size=2,
    num_proc=1,
)

Map:   0%|          | 0/12247 [00:00<?, ? examples/s]

{'input_ids': [128000, 58, 128000, 220, 314], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [128000, 262, 330, 56074, 794, 330, 123226, 119283, 21043, 123226, 122374, 106189, 19483, 34048, 25129, 102856, 11571, 498, 128000, 262, 330, 1379, 794, 7492], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [128000, 262, 330, 3081, 794, 330, 123226, 119283, 126006, 64026, 103668, 64467, 123226, 54656, 121, 22238, 124434, 109683, 50338, 21403, 244, 34048, 25129, 104122, 21043, 33764, 123226, 111140, 107079, 108966, 21043, 105494, 123226, 70349, 120044, 107079, 72917, 102769, 111017, 5486, 164, 73609, 44416, 34208, 117041, 34547, 115376, 105067, 87502, 107079, 40089, 119008, 1811, 103282, 40526, 99750, 105986, 101921, 119, 3922, 105469, 19361, 115039, 34171, 3922, 40053, 90070, 35304, 102395, 108042, 5486, 101519, 97, 107079, 34208, 102208, 22238, 31809, 105271, 16325, 1811, 1, 128000, 220, 2529], 'attention_mask': [1, 1, 1, 1, 1, 1, 1,

In [49]:
for i, inputs in enumerate(ds["train"]):
    concatenated_examples = {k: list(chain(*inputs[k])) for k in inputs.keys()}
    if i > 10:
        break

TypeError: 'int' object is not iterable

In [58]:
a = [1, 2]
a[0:3]

[1, 2]

In [62]:
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-3B', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|reserved_special_token_2|>", rst

In [4]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

In [13]:
tokenizer.vocab["<|endoftext|>"]

151643