In [1]:
import json
import logging

import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Configure the root logger so INFO-level records (and above) are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; the recorded outputs show it as "__main__".
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Identifier of the base checkpoint; passed to from_pretrained() for both the
# tokenizer and the model.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory used as the Trainer output_dir and as the target of the final
# save_pretrained() calls.
OUTPUT_DIR = "./recipe-bot-finetuned"
# JSON file read by load_and_prepare_data().
DATASET_FILE = "recipes_training_final.json"

In [None]:
def load_and_prepare_data(file_path):
    """Load training examples from a JSON file into a ``Dataset``.

    The file is expected to hold a JSON array of objects, each carrying its
    training text under the ``"text"`` key. The number of examples and a
    preview of the first one (first 300 characters) are printed.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A ``Dataset`` with a single ``"text"`` column, one row per example,
        in file order. An empty array yields an empty ``"text"`` column.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If an example has no ``"text"`` key.
    """
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    print(f"Loaded {len(data)} training examples")

    # Only preview when there is something to preview: indexing data[0] on an
    # empty array used to raise IndexError before the dataset was even built.
    if data:
        print("Sample training example:")
        sample = data[0]["text"]
        print(sample[:300] + "...")

    return Dataset.from_dict({"text": [item["text"] for item in data]})

In [None]:
def create_qlora_config():
    """Build the ``LoraConfig`` used to attach adapters to the base model.

    The configuration uses rank 16, alpha 32, dropout 0.05, no bias
    parameters, and the ``CAUSAL_LM`` task type. It targets the seven
    projection modules named below. Only adapter settings are defined here;
    nothing in this function configures quantization.
    """
    wrapped_module_names = (
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    )

    adapter_settings = {
        "r": 16,
        "lora_alpha": 32,
        "target_modules": list(wrapped_module_names),
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**adapter_settings)

In [None]:
def tokenize_function(examples, tokenizer, max_length=768):
    """Tokenize ``examples["text"]`` to fixed-length inputs with causal-LM labels.

    Every sequence is truncated or padded to exactly ``max_length`` tokens.
    ``labels`` repeat ``input_ids`` on real tokens and hold ``-100`` on padded
    positions, so padding is not used as a prediction target.

    Args:
        examples: Mapping with a ``"text"`` entry: a list of strings when
            called through ``Dataset.map(batched=True)``, or a single string.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and, normally, ``"attention_mask"``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer's output mapping with an added ``"labels"`` entry that
        has the same nesting and length as ``"input_ids"``.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    input_ids = tokenized["input_ids"]

    if "attention_mask" not in tokenized:
        # Without a mask padding cannot be told apart from real tokens, so
        # fall back to the plain copy.
        tokenized["labels"] = input_ids.copy()
        return tokenized

    attention_mask = tokenized["attention_mask"]
    # Default ignore_index of PyTorch's cross-entropy loss.
    ignore_index = -100

    def _mask_padding(ids, mask):
        # Use the attention mask rather than comparing against the pad id:
        # this notebook reuses eos_token as pad_token, so an id comparison
        # would also hide genuine end-of-sequence tokens.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one row of ids per example.
        labels = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example (or an empty batch): ids form one flat list.
        labels = _mask_padding(input_ids, attention_mask)

    tokenized["labels"] = labels

    return tokenized

In [None]:
# Load the tokenizer published with the base checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run at load time — confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
# Pad on the right-hand side, reusing the end-of-sequence token as padding.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded")

INFO:__main__:
1. Loading tokenizer from TinyLlama/TinyLlama-1.1B-Chat-v1.0...


In [None]:
# Load the base checkpoint as float32 weights placed on the CPU.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run at load time — confirm this checkpoint actually needs it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
# Gradient checkpointing is switched on in a later cell; the recorded training
# output reports use_cache=True as incompatible with it, so disable the cache.
model.config.use_cache = False
print("Base model loaded")

INFO:__main__:
2. Loading model from TinyLlama/TinyLlama-1.1B-Chat-v1.0...


In [None]:
# Enable gradient checkpointing on the base model before adapters are attached.
model.gradient_checkpointing_enable()
# NOTE(review): the model is loaded above as float32 on CPU with no
# quantization settings, so this "k-bit" preparation step may not be doing
# what its name suggests here — confirm it is still needed.
model = prepare_model_for_kbit_training(model)

INFO:__main__:
3. Preparing model for QLoRA training...


In [None]:
# Seed torch's global RNG before the adapters are created. LoRA adapter
# weights are partly randomly initialised and nothing earlier in the notebook
# sets a seed, so without this each kernel restart would begin training from
# different adapter weights.
LORA_INIT_SEED = 42
torch.manual_seed(LORA_INIT_SEED)

# Build the adapter configuration and wrap the base model with it.
lora_config = create_qlora_config()
model = get_peft_model(model, lora_config)

INFO:__main__:
4. Applying LoRA configuration...


In [None]:
# Count every parameter element, then only those that will receive gradients.
total_params = sum(map(lambda p: p.numel(), model.parameters()))
trainable_params = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)

# Emit the four-line summary in a single write.
print(
    "\n".join(
        [
            "Model Statistics:",
            f"Trainable params: {trainable_params:,} ({trainable_params / 1e6:.2f}M)",
            f"Total params: {total_params:,} ({total_params / 1e9:.2f}B)",
            f"Trainable: {100 * trainable_params / total_params:.2f}%",
        ]
    )
)

INFO:__main__:
Trainable parameters: 2,252,800
INFO:__main__:Total parameters: 1,102,301,184
INFO:__main__:Trainable %: 0.20%


In [None]:
# Read the JSON training file into a single-column ("text") Dataset; the
# loader also prints the example count and a preview of the first example.
dataset = load_and_prepare_data(DATASET_FILE)

INFO:__main__:
5. Loading dataset from recipes_training.json...
INFO:__main__:Loaded 20 training examples


In [None]:
def _tokenize_batch(batch):
    """Tokenize one batch of rows with the notebook's tokenizer and the default max_length."""
    return tokenize_function(batch, tokenizer)


# Replace the raw "text" column with the tokenized fields, batch by batch.
tokenized_dataset = dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)
print(f"Tokenized {len(tokenized_dataset)} examples")

In [None]:
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir=OUTPUT_DIR,
    save_steps=100,
    save_total_limit=2,
    save_safetensors=True,
    push_to_hub=False,
    # --- Amount of training ---
    # 2 sequences per device x 4 accumulation steps = 8 sequences per
    # optimizer step on each device.
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # --- Optimizer and learning-rate schedule ---
    # NOTE(review): the recorded outputs report 20 training examples, which is
    # only a few optimizer steps per epoch, so 20 warmup steps cover a large
    # share of the run — confirm this is intended.
    optim="adamw_torch",
    learning_rate=3e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=20,
    fp16=False,
    # --- Logging ---
    logging_steps=5,
    logging_first_step=True,
    report_to="none",
    # --- Data handling ---
    remove_unused_columns=False,
)

INFO:__main__:
7. Setting up training arguments...


In [None]:
# Batch collator in causal-LM mode (mlm=False), built on the same tokenizer
# that produced the dataset.
# NOTE(review): pad_token was set to eos_token earlier; if this collator
# masks pad-token positions out of the labels, genuine end-of-sequence tokens
# would be masked as well — verify against the collator's documented behaviour.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Assemble the Trainer from the adapter-wrapped model, the training
# arguments, the tokenized dataset, and the collator defined above.
# No evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

INFO:__main__:
8. Initializing trainer...


In [16]:
# Run fine-tuning. The call is left as the cell's last expression so the
# returned TrainOutput (global step, loss, runtime metrics) is displayed.
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
 67%|██████▋   | 10/15 [48:24<23:51, 286.38s/it] 

{'loss': 1.398, 'grad_norm': 0.7914060354232788, 'learning_rate': 0.0002, 'epoch': 2.0}


100%|██████████| 15/15 [1:43:24<00:00, 413.61s/it]

{'train_runtime': 6204.0783, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.002, 'train_loss': 1.3177302996317546, 'epoch': 3.0}





TrainOutput(global_step=15, training_loss=1.3177302996317546, metrics={'train_runtime': 6204.0783, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.002, 'train_loss': 1.3177302996317546, 'epoch': 3.0})

In [18]:
# Persist the trained model and the tokenizer side by side in OUTPUT_DIR.
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably writes
# the adapter weights rather than a full merged checkpoint — confirm.
model.save_pretrained(OUTPUT_DIR)
# Last expression: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(OUTPUT_DIR)

('./recipe-bot-finetuned-v2\\tokenizer_config.json',
 './recipe-bot-finetuned-v2\\special_tokens_map.json',
 './recipe-bot-finetuned-v2\\tokenizer.model',
 './recipe-bot-finetuned-v2\\added_tokens.json',
 './recipe-bot-finetuned-v2\\tokenizer.json')