In [42]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl
%pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'


In [43]:
import gc
import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
import mlflow
from mlflow.models import infer_signature

In [44]:
# Hugging Face Hub identifiers used throughout the notebook: the base
# checkpoint to fine-tune and the dataset repository to fine-tune it on.
base_model = "meta-llama/Llama-3.2-1B-Instruct"
dataset_repo_id = "midnightdove-dev/olimpiada"

# Only the "train" split is loaded.
dataset = load_dataset(dataset_repo_id, split="train")

In [45]:
# Load the base model in 4-bit (NF4) precision onto GPU 0.

# Re-running this cell in a live kernel would otherwise leave the previously
# loaded model (and a trainer still referencing it) resident on the GPU, and
# the new load then fails with CUDA out-of-memory. Drop those references and
# release cached blocks first. This is best-effort: any other live reference
# to the old model keeps its memory allocated.
for stale_name in ("trainer", "model"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# The Hub token comes from Colab's secret store when running on Colab;
# elsewhere fall back to the HF_TOKEN environment variable (may be None, in
# which case the library's own cached credentials, if any, are used).
try:
    from google.colab import userdata
except ImportError:
    access_token = os.environ.get("HF_TOKEN")
else:
    access_token = userdata.get("HF_TOKEN")

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    token=access_token,
    quantization_config=quant_config,
    device_map={"": 0},  # place the whole model on device 0
)
model.config.use_cache = False
model.config.pretraining_tp = 1

OutOfMemoryError: CUDA out of memory. Tried to allocate 502.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 51.06 MiB is free. Process 16888 has 14.70 GiB memory in use. Of the allocated memory 14.48 GiB is allocated by PyTorch, and 85.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Training tokenizer. `trust_remote_code` is intentionally not enabled: the
# same tokenizer is loaded later in this notebook without it, so there is no
# need to allow execution of code shipped with the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(base_model, token=access_token)

# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Runs reported by the trainer are grouped under this MLflow experiment.
mlflow.set_experiment("MLflow PEFT Tutorial")

training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): with the plain "constant" scheduler the warmup setting
    # presumably has no effect -- confirm, or use "constant_with_warmup".
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    run_name="llama-3.2-1B-Instruct-olympiads",
    report_to="mlflow",
)

# Collect unreachable Python objects first, then hand the CUDA blocks they
# were holding back to the driver; in the opposite order the memory freed by
# the collection would stay in PyTorch's cache.
gc.collect()
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    # NOTE(review): only the "response" column is used as training text; the
    # "prompt" column read elsewhere in this notebook is not part of it --
    # confirm this is intended.
    dataset_text_field="response",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

trainer.train()

In [None]:
# Save the fine-tuned adapter and the training tokenizer side by side in a
# local directory named after the new model.
new_model = "Llama-3.2-1B-Instruct-olympiads"
trainer.model.save_pretrained(new_model)
# Use the tokenizer object that was handed to the trainer directly rather than
# going through the trainer's `tokenizer` accessor, which newer transformers
# releases deprecate.
tokenizer.save_pretrained(new_model)

# NOTE(review): the target Hub repository is chosen by the trainer from its
# TrainingArguments, not from `new_model` -- verify it is the intended repo.
trainer.push_to_hub()

In [None]:
# Log the fine-tuned model to the MLflow run created during training.
# (mlflow is installed and imported at the top of the notebook; installing it
# here would be too late, because earlier cells already use it.)
sample = dataset[1]

# MLflow infers schema from the provided sample input/output/params
signature = infer_signature(
    model_input=sample["prompt"],
    model_output=sample["response"],
    # Parameters are saved with default values if specified
    params={"max_new_tokens": 256, "repetition_penalty": 1.15, "return_full_text": False},
)

# Get the ID of the MLflow Run that was automatically created above
last_run_id = mlflow.last_active_run().info.run_id

# Save a tokenizer without padding because it is only needed for training.
# The token is passed explicitly, consistent with how the training tokenizer
# and the model are fetched from the Hub.
tokenizer_no_pad = AutoTokenizer.from_pretrained(
    base_model,
    token=access_token,
    add_bos_token=True,
)

# If you interrupt the training, uncomment the following line to stop the MLflow run
# mlflow.end_run()

# Prompt pattern with {subject}, {grade} and {stage} placeholders. It is
# defined here but not passed to log_model below.
prompt_template = """Generează un subiect de olimpiadă de {subject}, clasa a {grade}-a, nivel de faza {stage}!"""

# Re-open the training run so the model artifact and the LoRA parameters are
# attached to it.
with mlflow.start_run(run_id=last_run_id):
    mlflow.log_params(peft_params.to_dict())
    mlflow.transformers.log_model(
        transformers_model={"model": trainer.model, "tokenizer": tokenizer_no_pad},
        signature=signature,
        artifact_path="model",  # This is a relative path to save model files within MLflow run
    )


In [None]:
# Load the model logged under the training run back as a generic pyfunc model
# and try it on one hand-written prompt.
model_uri = f"runs:/{last_run_id}/model"
mlflow_model = mlflow.pyfunc.load_model(model_uri)

sample_prompt = "Generează un subiect de olimpiadă de Biologie, clasa a 9-a, nivel de faza locala"
predictions = mlflow_model.predict(sample_prompt)

# Show the first prediction as the cell output.
ans = predictions[0]
ans
