In [1]:
#bitsandbytes guide: https://huggingface.co/docs/bitsandbytes/main/en/installation
!pip install bitsandbytes
!pip install trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.0


In [2]:
# Libraries used throughout the notebook, one import line per package
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # BitsAndBytesConfig: quantization
from trl import SFTConfig, SFTTrainer, setup_chat_format

# Google Drive (Colab only): mounted so the fine-tuning output can be written under /content/drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


Adapted from the Hugging Face course notebook (chapter 11, section 4): https://colab.research.google.com/github/huggingface/notebooks/blob/main/course/en/chapter11/section4.ipynb#scrollTo=FLGpCGYrf-kM

In [3]:
# Pick an accelerator label: CUDA first, then Apple MPS, otherwise CPU.
# NOTE(review): `device` is not referenced again in the cells visible here;
# model placement below is driven by device_map="auto".
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Base checkpoint to fine-tune
model_name = "HuggingFaceTB/SmolLM2-135M"

# BitsAndBytes settings: load the weights with 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the quantized model and its tokenizer from the same checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically map the model to available devices
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Apply the chat format; the returned model/tokenizer pair replaces the originals
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Save location (on the mounted Drive) for checkpoints and the final adapter
finetune_name = "/content/drive/MyDrive/!personalMLProject/rag_llm_finetune/SmolLM2-135M-Rust-FT"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

In [4]:
# Load the Rust Q&A dataset from a local JSON Lines file.
# All rows land in a single "train" split, which is split further below.
# NOTE(review): the path is under /content, so presumably the file has to be
# uploaded to the Colab runtime before this cell is run — confirm.
dataset = load_dataset("json", data_files="/content/rust_qa_dataset_5k.jsonl")

# Format the dataset for fine-tuning
def format_dataset(example):
    """Build the single-string training sample for one Q&A record.

    Args:
        example: Mapping with 'question' and 'answer' entries. Adjust the keys
            here if the data uses a different structure.

    Returns:
        A dict with one key, "text", holding the question and the answer under
        "### Question:" and "### Answer:" headings.
    """
    question = example["question"]
    answer = example["answer"]
    return {"text": f"### Question:\n{question}\n\n### Answer:\n{answer}"}

# Add the formatted "text" column to every row
dataset = dataset.map(format_dataset)

# Split into training (75%) and evaluation (25%) sets.
# The fixed seed makes the shuffle, and therefore which rows end up in each
# split, identical on every run so results are comparable across runs.
dataset = dataset["train"].train_test_split(test_size=0.25, seed=42)

display(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 3750
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 1250
    })
})

In [5]:
from peft import LoraConfig

# LoRA hyperparameters, kept as named values so they are easy to tune
rank_dimension = 6  # r: rank of the LoRA update matrices (smaller = more compression)
lora_alpha = 8  # scaling factor for the LoRA layers (higher = stronger adaptation)
lora_dropout = 0.05  # dropout probability inside the LoRA layers (helps prevent overfitting)

peft_config = LoraConfig(
    # What to adapt
    task_type="CAUSAL_LM",  # Task type for the model architecture
    target_modules="all-linear",  # Which modules to apply LoRA to
    # How to adapt it
    r=rank_dimension,  # Typically between 4 and 32
    lora_alpha=lora_alpha,  # Often chosen as 2x the rank; here it is 8 for a rank of 6
    lora_dropout=lora_dropout,
    bias="none",  # PEFT bias option: no bias parameters are trained
)

In [6]:
# Training configuration
# Hyperparameters based on QLoRA paper recommendations
args = SFTConfig(
    # Output settings
    output_dir=finetune_name,  # Directory to save model checkpoints
    # Training duration
    num_train_epochs=5,  # Guidance for LoRA is ~1-3 epochs; 5 may overfit, so watch the per-epoch eval loss
    # Batch size settings
    per_device_train_batch_size=4,  # Batch size per GPU. >4 may cause gpu mem issues for T4
    gradient_accumulation_steps=4,  # Accumulate gradients for larger effective batch
    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory savings
    # Optimizer settings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold
    # Learning rate schedule
    warmup_ratio=0.03,  # Portion of steps for warmup
    # The plain "constant" schedule ignores warmup_ratio; "constant_with_warmup"
    # ramps up over the warmup steps and then holds the learning rate constant.
    lr_scheduler_type="constant_with_warmup",
    # Evaluation, logging and saving
    eval_strategy="epoch",  # Score the eval_dataset every epoch; without this it is never evaluated
    logging_steps=10,  # Log metrics every N steps
    save_strategy="epoch",  # Save checkpoint every epoch
    # Precision settings
    # Only request bfloat16 when a CUDA device is present and reports bf16 support;
    # checking is_available() first avoids probing CUDA on CPU/MPS runtimes.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    # Integration settings
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging
    # Sequence handling
    max_length=1512,  # max sequence length for model and packing of the dataset
    packing=True,  # Enable input packing for efficiency
    dataset_kwargs={
        "add_special_tokens": False,  # Special tokens handled by template
        "append_concat_token": False,  # No additional separator needed
    },
)

In [7]:
# Create the SFTTrainer with the LoRA configuration.
# API notes:
#   - max_seq_length is deprecated; use max_length on SFTConfig instead
#     (link kept from the original note: https://huggingface.co/docs/peft/main/en/install)
#   - packing and dataset_kwargs have also moved to SFTConfig
#   - the tokenizer parameter is now called processing_class
trainer = SFTTrainer(
    # What to train
    model=model,
    processing_class=tokenizer,
    peft_config=peft_config,  # LoRA configuration
    # What to train on
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # How to train
    args=args,
)



Adding EOS to train dataset:   0%|          | 0/3750 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3750 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/3750 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1250 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1250 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [None]:
# Start training. Checkpoints are written to the output directory set in SFTConfig;
# nothing is uploaded to the Hub because push_to_hub=False there.
trainer.train()

# Save the model (since PEFT is used, this saves only the adapter model and not the full model)
trainer.save_model()

  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)
