In [1]:
pip install transformers peft datasets accelerate bitsandbytes autoawq



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

In [3]:
model_name = "Qwen/Qwen1.5-7B-Chat"  # пример модели
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",         # автоматическое распределение слоев
    load_in_4bit=True,         # QLoRA (опционально)
    trust_remote_code=True     # обязательно для Qwen моделей
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 5200
})

In [4]:
# 2. Настройка LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 7,725,518,848 || trainable%: 0.0543


In [9]:
# 3. Загрузка и преобразование Alpaca в формат Qwen
dataset = load_dataset("tatsu-lab/alpaca")  # или свой JSON

def format_alpaca(example):
    # Формируем разговор в стиле Qwen
    user_msg = example['instruction']
    if example['input']:
        user_msg += f"\n{example['input']}"
    assistant_msg = example['output']
    return {
        "text": f"<|user|>\n{user_msg}\n<|assistant|>\n{assistant_msg}"
    }

formatted_dataset = dataset.map(format_alpaca)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [10]:
# 4. Токенизация
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding=True,
        max_length=512,
    )

tokenized_dataset = formatted_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [14]:
# 5. Настройки тренировки
training_args = TrainingArguments(
    output_dir="./qwen-lora-alpaca",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    report_to="none"
)

In [17]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # важно для causal LM
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)

In [19]:
# 6. Обучение
trainer.train()
model.save_pretrained("qwen-lora-finetuned")

Step,Training Loss


KeyboardInterrupt: 