In [None]:
!pip install -U transformers datasets accelerate peft bitsandbytes trl

In [None]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
!pip show trl

Name: trl
Version: 0.17.0
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: accelerate, datasets, rich, transformers
Required-by: 


In [4]:
!pip install trl==0.8.6
!pip install tensorboard

Installing collected packages: shtab, tyro, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.17.0
    Uninstalling trl-0.17.0:
      Successfully uninstalled trl-0.17.0
Successfully installed shtab-1.7.2 trl-0.8.6 tyro-0.9.19


In [7]:
!pip cache purge

Files removed: 132


In [None]:
# pip install -U torch

In [None]:
!nvidia-smi

Sun Apr 27 19:02:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:1E.0 Off |                    0 |
| N/A   40C    P0             26W /   70W |    5649MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# !rm -rf ./gemma-finetuned-adapters

# **IMPROVED**

In [9]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
# from huggingface_hub import notebook_login # Uncomment if using notebook login

# --- Configuration ---
MODEL_ID = "google/gemma-3-1b-it"
DATASET_PATH = "llm_training_data_claude-22.jsonl" # Replace with your dataset path
OUTPUT_DIR = "./gemma-finetuned-adapters" # Will be created fresh

# --- Authentication ---
# IMPORTANT: Replace the placeholder token below with your actual Hugging Face token.
# Using a manual token string like this is ONE way to authenticate.
# The recommended way is usually `huggingface-cli login` or `notebook_login()`.
# Choose ONE method only. If you set the token here, you don't need to run login commands.
token = "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" # Replace with your actual token from HF settings

# Make sure you are logged in via `huggingface-cli login` OR uncomment and run:
# notebook_login() # Only if you are in a notebook and token is None

# --- QLoRA Configuration ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Set based on GPU check below
    bnb_4bit_use_double_quant=False,
)

# --- LoRA Configuration ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ],
    task_type="CAUSAL_LM",
)

# --- Training Arguments ---
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=500,             # Starting with 3 epochs (adjust as needed)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4, # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=100,                 # Save checkpoint every 100 steps (adjust as needed)
    logging_steps=25,               # Log every 25 steps (adjust as needed)
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,                     # Will be set based on GPU check
    bf16=False,                     # Will be set based on GPU check
    max_grad_norm=0.3,
    max_steps=-1,                   # Train for num_train_epochs, or set a positive value for max steps
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# --- Check GPU Availability & Set Precisions ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available.")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
if torch.cuda.get_device_capability(0)[0] >= 8:
    print("GPU supports bfloat16. Setting bf16=True, fp16=False.")
    training_arguments.bf16 = True
    training_arguments.fp16 = False
    bnb_config.bnb_4bit_compute_dtype = torch.bfloat16
else:
    print("GPU does NOT support bfloat16. Setting bf16=False, fp16=True.")
    training_arguments.bf16 = False
    training_arguments.fp16 = True
    bnb_config.bnb_4bit_compute_dtype = torch.float16


# --- Load Tokenizer and Model ---
print(f"Loading tokenizer for model: {MODEL_ID}")
# This dictionary correctly adds the 'token' argument if the token variable is set
tokenizer_kwargs = {"trust_remote_code": True}
if token: tokenizer_kwargs["token"] = token
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_kwargs)
if tokenizer.pad_token is None:
    print("Setting pad_token = eos_token")
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Loading base model: {MODEL_ID} with QLoRA config")
# This dictionary correctly adds the 'token' argument if the token variable is set
model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
    "attn_implementation": "eager", # Use eager attention for Gemma3 (Recommended by HF docs)
}
if token: model_kwargs["token"] = token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

# --- Load Dataset ---
print(f"Loading dataset from: {DATASET_PATH}")
try:
    # Assuming the dataset is a JSON Lines file
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    print(f"Dataset loaded successfully. Number of examples: {len(dataset)}")
except FileNotFoundError:
     print(f"Error: Dataset file not found at {DATASET_PATH}"); exit()
except Exception as e:
    print(f"Error loading dataset: {e}"); exit()

# --- Preprocess Dataset ---
# This function formats each data sample into a single text string
# suitable for Causal LM training, using the Gemma-specific instruction format.
def format_instruction(sample):
    # Extract instruction and output, handle potential None or empty strings
    instruction = sample.get('instruction', '') or ''
    output = sample.get('output', '') or ''

    # Clean up potential leftover tokens/whitespace from data source
    instruction = instruction.strip().removeprefix("<s>").removesuffix("</s>").strip()
    output = output.strip().removeprefix("<s>").removesuffix("</s>").strip()

    # Format according to Gemma's instruction format: <s>[INST] Instruction [/INST] Output </s>
    # Ensure there's content before formatting
    if not instruction:
        print(f"Warning: Skipping sample with no instruction: {sample}")
        return {"text": ""} # Return empty text for samples to be filtered out
    if not output:
         print(f"Warning: Sample with instruction but no output: {sample}")
         # Optionally format without output if you want to train on prompts only
         # For this setup (SFT), we usually expect prompt-response pairs
         # Returning empty text will effectively skip this sample for training
         return {"text": ""}


    return f"<s>[INST] {instruction} [/INST] {output} </s>"

print("Formatting dataset...")
try:
    # Apply the formatting function to each sample
    dataset = dataset.map(lambda sample: {"text": format_instruction(sample)})
    # Filter out samples that resulted in empty text (e.g., missing instruction/output)
    dataset = dataset.filter(lambda sample: sample["text"] != "")

    print("Dataset formatted and filtered.")
    if not dataset: print("Warning: Processed dataset is empty. Check your data file and formatting function."); exit()
    print(f"Example formatted text:\n{dataset[0]['text']}")
except Exception as e:
    print(f"Error formatting dataset: {e}"); exit()

# --- Initialize Trainer ---
print("Initializing SFTTrainer...")
# SFTTrainer handles wrapping the base model with LoRA adapters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text", # The column containing the formatted text
    max_seq_length=512,        # Maximum sequence length for training
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,             # Set to True for more efficient training if max_seq_length varies widely
)

# --- Train the Model ---
print("Starting training...")
try:
    # If you had a previous checkpoint and want to resume, uncomment the line below
    # trainer.train(resume_from_checkpoint=True)
    trainer.train() # Start training from scratch
    print("Training finished.")
except Exception as e:
    print(f"Error during training: {e}"); exit()

# --- Save the Trained Adapters ---
# SFTTrainer automatically saves the model/adapters to output_dir
print(f"Saving LoRA adapters to: {OUTPUT_DIR}")
trainer.save_model()
print("Adapters saved successfully.")

# --- Optional: Clean up memory ---
# Free up GPU memory by deleting model and trainer objects
print("Cleaning up memory...")
if 'model' in locals(): del model
if 'trainer' in locals(): del trainer
# Additional manual cleanup for objects potentially holding onto memory
if 'tokenizer' in locals() and hasattr(tokenizer, 'model'): del tokenizer.model
if 'pipe' in locals() and hasattr(pipe, 'model'): del pipe.model
if 'inf_model' in locals(): del inf_model
torch.cuda.empty_cache() # Clear cached memory
print("Memory cleaned.")

# --- Optional: Test Inference ---
# Load the base model again and merge the adapters for inference
print("\n--- Testing Inference with Adapters ---")
# Set logging verbosity to critical to avoid excessive output from transformers
logging.set_verbosity(logging.CRITICAL)
try:
    # Load base model for inference - use float16 compute dtype typically for inference
    print("Loading base model for inference...")
    # Use a fresh BitsAndBytesConfig for inference if needed, or reuse the training one
    # Ensure compute_dtype is compatible with your GPU for inference, often float16 is safe
    inf_bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16, # Use float16 for inference compute
        bnb_4bit_use_double_quant=False )
    inf_model_kwargs = { "quantization_config": inf_bnb_config, "device_map": "auto",
                         "trust_remote_code": True, "attn_implementation": "eager" }
    if token: inf_model_kwargs["token"] = token # Pass token for inference model load too
    inf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **inf_model_kwargs)

    # Load the trained LoRA adapters
    print(f"Loading adapters from: {OUTPUT_DIR}")
    # PeftModel wraps the base model and applies the adapters
    inf_model = PeftModel.from_pretrained(inf_model, OUTPUT_DIR)
    print("Adapters loaded onto base model.")

    # Load the tokenizer again for inference
    inf_tokenizer_kwargs = {"trust_remote_code": True}
    if token: inf_tokenizer_kwargs["token"] = token # Pass token for inference tokenizer load
    inf_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **inf_tokenizer_kwargs)
    if inf_tokenizer.pad_token is None: inf_tokenizer.pad_token = inf_tokenizer.eos_token
    inf_tokenizer.padding_side = "right" # For generation, padding side should be 'right'

    # Prepare a test prompt from the dataset's first example
    # We need to extract just the instruction part for the prompt
    test_formatted_text = dataset[0]['text']
    inst_end_marker = "[/INST]"
    prompt_end_index = test_formatted_text.find(inst_end_marker)

    if prompt_end_index != -1:
        # Extract only the part up to and including the instruction end marker
        # This simulates giving the model an instruction to complete
        prompt_for_model = test_formatted_text[:prompt_end_index + len(inst_end_marker)].strip()
    else:
        # Fallback in case the formatting wasn't perfect, try to reconstruct the prompt
        # This might not be ideal if the formatting function changes
        print("Warning: Could not find [/INST] marker in formatted text. Attempting to reconstruct prompt.")
        prompt_for_model = f"<s>[INST] {dataset[0].get('instruction', '').strip()} [/INST]"

    # Ensure the prompt starts with the correct BOS token if not already present
    if not prompt_for_model.startswith(inf_tokenizer.bos_token):
         prompt_for_model = inf_tokenizer.bos_token + prompt_for_model

    print(f"\nTest Prompt:\n---\n{prompt_for_model}\n---")

    # Create a text generation pipeline using the adapted model and tokenizer
    pipe = pipeline(task="text-generation", model=inf_model, tokenizer=inf_tokenizer, max_new_tokens=150)

    # Generate text based on the prompt
    # The pipe will handle tokenizing the prompt, running inference, and decoding
    result = pipe(prompt_for_model)

    print("\nGenerated Output:")
    # The result is a list of dictionaries. We take the text from the first result.
    generated_text = result[0]['generated_text']

    # Post-process the generated text to show only the generated response
    # Find the end of the prompt in the generated text
    prompt_end_in_generated = generated_text.find(prompt_for_model)
    if prompt_end_in_generated != -1:
      # Start of the generated response is after the prompt
      answer_start_index = prompt_end_in_generated + len(prompt_for_model)
      answer = generated_text[answer_start_index:].strip()

      # Remove the final EOS token if it exists
      if answer.endswith(inf_tokenizer.eos_token):
          answer = answer[:-len(inf_tokenizer.eos_token)].strip()
      print(answer)
    else:
      # If the prompt wasn't found in the generated text (shouldn't happen with pipeline),
      # print the whole generation or a warning
      print("Warning: Prompt not found in generated text. Printing full output:")
      print(generated_text)


    # Print the expected output from the dataset for comparison
    print("\nExpected Output (from dataset):")
    print(dataset[0].get('output', 'N/A')) # Use .get for safety


except Exception as e:
    print(f"\nError during inference testing: {e}")

print("\n--- Script Finished ---")

CUDA available: True
Device name: Tesla T4
GPU does NOT support bfloat16. Setting bf16=False, fp16=True.
Loading tokenizer for model: google/gemma-3-1b-it


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Loading base model: google/gemma-3-1b-it with QLoRA config


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Loading dataset from: llm_training_data_claude-22.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully. Number of examples: 596
Formatting dataset...


Map:   0%|          | 0/596 [00:00<?, ? examples/s]

Filter:   0%|          | 0/596 [00:00<?, ? examples/s]

Dataset formatted and filtered.
Example formatted text:
<s>[INST] Extract the following information from the given text of driving license: Address, First_Name, Height, Last_Name, Sex, State

Text: Driver
ucend - No
j U 216 *
Licence
JONES MARGARET CHERYL
50 SMITH ST
SMITHVILLE QLD 9999
itninre
aply
26-03-80 F 170
01-07-2006 30-06-2011
Queensland
Goverament
eens and Transport [/INST] {"Last_Name": "JONES", "First_Name": "MARGARET CHERYL", "Address": "50 SMITH ST\nSMITHVILLE QLD 9999", "Sex": "F", "Height": "170", "State": "Queensland"} </s>
Initializing SFTTrainer...


Map:   0%|          | 0/596 [00:00<?, ? examples/s]

  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
25,2.2728
50,1.6643


KeyboardInterrupt: 

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
# from huggingface_hub import notebook_login # Uncomment if using notebook login

# --- Configuration ---
MODEL_ID = "google/gemma-3-4b-it"
DATASET_PATH = "llm_training_data_claude-22.jsonl" # Replace with your dataset path
OUTPUT_DIR = "./gemma-finetuned-adapters" # Will be created fresh

# --- Authentication ---
# IMPORTANT: Replace the placeholder token below with your actual Hugging Face token.
# Using a manual token string like this is ONE way to authenticate.
# The recommended way is usually `huggingface-cli login` or `notebook_login()`.
# Choose ONE method only. If you set the token here, you don't need to run login commands.
token = "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" # Replace with your actual token from HF settings

# Make sure you are logged in via `huggingface-cli login` OR uncomment and run:
# notebook_login() # Only if you are in a notebook and token is None

# --- QLoRA Configuration ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Set based on GPU check below
    bnb_4bit_use_double_quant=False,
)

# --- LoRA Configuration ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ],
    task_type="CAUSAL_LM",
)

# --- Training Arguments ---
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=500,             # Starting with 3 epochs (adjust as needed)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4, # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=100,                 # Save checkpoint every 100 steps (adjust as needed)
    logging_steps=25,               # Log every 25 steps (adjust as needed)
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,                     # Will be set based on GPU check
    bf16=False,                     # Will be set based on GPU check
    max_grad_norm=0.3,
    max_steps=-1,                   # Train for num_train_epochs, or set a positive value for max steps
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# --- Check GPU Availability & Set Precisions ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available.")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
if torch.cuda.get_device_capability(0)[0] >= 8:
    print("GPU supports bfloat16. Setting bf16=True, fp16=False.")
    training_arguments.bf16 = True
    training_arguments.fp16 = False
    bnb_config.bnb_4bit_compute_dtype = torch.bfloat16
else:
    print("GPU does NOT support bfloat16. Setting bf16=False, fp16=True.")
    training_arguments.bf16 = False
    training_arguments.fp16 = True
    bnb_config.bnb_4bit_compute_dtype = torch.float16


# --- Load Tokenizer and Model ---
print(f"Loading tokenizer for model: {MODEL_ID}")
# This dictionary correctly adds the 'token' argument if the token variable is set
tokenizer_kwargs = {"trust_remote_code": True}
if token: tokenizer_kwargs["token"] = token
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_kwargs)
if tokenizer.pad_token is None:
    print("Setting pad_token = eos_token")
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Loading base model: {MODEL_ID} with QLoRA config")
# This dictionary correctly adds the 'token' argument if the token variable is set
model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
    "attn_implementation": "eager", # Use eager attention for Gemma3 (Recommended by HF docs)
}
if token: model_kwargs["token"] = token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

# --- Load Dataset ---
print(f"Loading dataset from: {DATASET_PATH}")
try:
    # Assuming the dataset is a JSON Lines file
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    print(f"Dataset loaded successfully. Number of examples: {len(dataset)}")
except FileNotFoundError:
     print(f"Error: Dataset file not found at {DATASET_PATH}"); exit()
except Exception as e:
    print(f"Error loading dataset: {e}"); exit()

# --- Preprocess Dataset ---
# This function formats each data sample into a single text string
# suitable for Causal LM training, using the Gemma-specific instruction format.
def format_instruction(sample):
    # Extract instruction and output, handle potential None or empty strings
    instruction = sample.get('instruction', '') or ''
    output = sample.get('output', '') or ''

    # Clean up potential leftover tokens/whitespace from data source
    instruction = instruction.strip().removeprefix("<s>").removesuffix("</s>").strip()
    output = output.strip().removeprefix("<s>").removesuffix("</s>").strip()

    # Format according to Gemma's instruction format: <s>[INST] Instruction [/INST] Output </s>
    # Ensure there's content before formatting
    if not instruction:
        print(f"Warning: Skipping sample with no instruction: {sample}")
        return {"text": ""} # Return empty text for samples to be filtered out
    if not output:
         print(f"Warning: Sample with instruction but no output: {sample}")
         # Optionally format without output if you want to train on prompts only
         # For this setup (SFT), we usually expect prompt-response pairs
         # Returning empty text will effectively skip this sample for training
         return {"text": ""}


    return f"<s>[INST] {instruction} [/INST] {output} </s>"

print("Formatting dataset...")
try:
    # Apply the formatting function to each sample
    dataset = dataset.map(lambda sample: {"text": format_instruction(sample)})
    # Filter out samples that resulted in empty text (e.g., missing instruction/output)
    dataset = dataset.filter(lambda sample: sample["text"] != "")

    print("Dataset formatted and filtered.")
    if not dataset: print("Warning: Processed dataset is empty. Check your data file and formatting function."); exit()
    print(f"Example formatted text:\n{dataset[0]['text']}")
except Exception as e:
    print(f"Error formatting dataset: {e}"); exit()

# --- Initialize Trainer ---
print("Initializing SFTTrainer...")
# SFTTrainer handles wrapping the base model with LoRA adapters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text", # The column containing the formatted text
    max_seq_length=512,        # Maximum sequence length for training
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,             # Set to True for more efficient training if max_seq_length varies widely
)

# --- Train the Model ---
print("Starting training...")
try:
    # If you had a previous checkpoint and want to resume, uncomment the line below
    # trainer.train(resume_from_checkpoint=True)
    trainer.train() # Start training from scratch
    print("Training finished.")
except Exception as e:
    print(f"Error during training: {e}"); exit()

# --- Save the Trained Adapters ---
# SFTTrainer automatically saves the model/adapters to output_dir
print(f"Saving LoRA adapters to: {OUTPUT_DIR}")
trainer.save_model()
print("Adapters saved successfully.")

# --- Optional: Clean up memory ---
# Free up GPU memory by deleting model and trainer objects
print("Cleaning up memory...")
if 'model' in locals(): del model
if 'trainer' in locals(): del trainer
# Additional manual cleanup for objects potentially holding onto memory
if 'tokenizer' in locals() and hasattr(tokenizer, 'model'): del tokenizer.model
if 'pipe' in locals() and hasattr(pipe, 'model'): del pipe.model
if 'inf_model' in locals(): del inf_model
torch.cuda.empty_cache() # Clear cached memory
print("Memory cleaned.")

# --- Optional: Test Inference ---
# Load the base model again and merge the adapters for inference
print("\n--- Testing Inference with Adapters ---")
# Set logging verbosity to critical to avoid excessive output from transformers
logging.set_verbosity(logging.CRITICAL)
try:
    # Load base model for inference - use float16 compute dtype typically for inference
    print("Loading base model for inference...")
    # Use a fresh BitsAndBytesConfig for inference if needed, or reuse the training one
    # Ensure compute_dtype is compatible with your GPU for inference, often float16 is safe
    inf_bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16, # Use float16 for inference compute
        bnb_4bit_use_double_quant=False )
    inf_model_kwargs = { "quantization_config": inf_bnb_config, "device_map": "auto",
                         "trust_remote_code": True, "attn_implementation": "eager" }
    if token: inf_model_kwargs["token"] = token # Pass token for inference model load too
    inf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **inf_model_kwargs)

    # Load the trained LoRA adapters
    print(f"Loading adapters from: {OUTPUT_DIR}")
    # PeftModel wraps the base model and applies the adapters
    inf_model = PeftModel.from_pretrained(inf_model, OUTPUT_DIR)
    print("Adapters loaded onto base model.")

    # Load the tokenizer again for inference
    inf_tokenizer_kwargs = {"trust_remote_code": True}
    if token: inf_tokenizer_kwargs["token"] = token # Pass token for inference tokenizer load
    inf_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **inf_tokenizer_kwargs)
    if inf_tokenizer.pad_token is None: inf_tokenizer.pad_token = inf_tokenizer.eos_token
    inf_tokenizer.padding_side = "right" # For generation, padding side should be 'right'

    # Prepare a test prompt from the dataset's first example
    # We need to extract just the instruction part for the prompt
    test_formatted_text = dataset[0]['text']
    inst_end_marker = "[/INST]"
    prompt_end_index = test_formatted_text.find(inst_end_marker)

    if prompt_end_index != -1:
        # Extract only the part up to and including the instruction end marker
        # This simulates giving the model an instruction to complete
        prompt_for_model = test_formatted_text[:prompt_end_index + len(inst_end_marker)].strip()
    else:
        # Fallback in case the formatting wasn't perfect, try to reconstruct the prompt
        # This might not be ideal if the formatting function changes
        print("Warning: Could not find [/INST] marker in formatted text. Attempting to reconstruct prompt.")
        prompt_for_model = f"<s>[INST] {dataset[0].get('instruction', '').strip()} [/INST]"

    # Ensure the prompt starts with the correct BOS token if not already present
    if not prompt_for_model.startswith(inf_tokenizer.bos_token):
         prompt_for_model = inf_tokenizer.bos_token + prompt_for_model

    print(f"\nTest Prompt:\n---\n{prompt_for_model}\n---")

    # Create a text generation pipeline using the adapted model and tokenizer
    pipe = pipeline(task="text-generation", model=inf_model, tokenizer=inf_tokenizer, max_new_tokens=150)

    # Generate text based on the prompt
    # The pipe will handle tokenizing the prompt, running inference, and decoding
    result = pipe(prompt_for_model)

    print("\nGenerated Output:")
    # The result is a list of dictionaries. We take the text from the first result.
    generated_text = result[0]['generated_text']

    # Post-process the generated text to show only the generated response
    # Find the end of the prompt in the generated text
    prompt_end_in_generated = generated_text.find(prompt_for_model)
    if prompt_end_in_generated != -1:
      # Start of the generated response is after the prompt
      answer_start_index = prompt_end_in_generated + len(prompt_for_model)
      answer = generated_text[answer_start_index:].strip()

      # Remove the final EOS token if it exists
      if answer.endswith(inf_tokenizer.eos_token):
          answer = answer[:-len(inf_tokenizer.eos_token)].strip()
      print(answer)
    else:
      # If the prompt wasn't found in the generated text (shouldn't happen with pipeline),
      # print the whole generation or a warning
      print("Warning: Prompt not found in generated text. Printing full output:")
      print(generated_text)


    # Print the expected output from the dataset for comparison
    print("\nExpected Output (from dataset):")
    print(dataset[0].get('output', 'N/A')) # Use .get for safety


except Exception as e:
    print(f"\nError during inference testing: {e}")

print("\n--- Script Finished ---")

CUDA available: True
Device name: Tesla T4
GPU does NOT support bfloat16. Setting bf16=False, fp16=True.
Loading tokenizer for model: google/gemma-3-4b-it
Loading base model: google/gemma-3-4b-it with QLoRA config


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading dataset from: llm_training_data_claude-22.jsonl
Dataset loaded successfully. Number of examples: 596
Formatting dataset...
Dataset formatted and filtered.
Example formatted text:
<s>[INST] Extract the following information from the given text of driving license: Address, First_Name, Height, Last_Name, Sex, State

Text: Driver
ucend - No
j U 216 *
Licence
JONES MARGARET CHERYL
50 SMITH ST
SMITHVILLE QLD 9999
itninre
aply
26-03-80 F 170
01-07-2006 30-06-2011
Queensland
Goverament
eens and Transport [/INST] {"Last_Name": "JONES", "First_Name": "MARGARET CHERYL", "Address": "50 SMITH ST\nSMITHVILLE QLD 9999", "Sex": "F", "Height": "170", "State": "Queensland"} </s>
Initializing SFTTrainer...


  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
25,9.3155
50,5.6732
75,5.4292
100,4.9246
125,4.9554
150,4.6304
175,4.0147
200,3.7639
225,3.9222
250,4.1808


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json.
Access to model google/gemma-3-4b-it is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-3-4b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json.
Access to model google/gemma-3-4b-it is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the fi

KeyboardInterrupt: 

In [5]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# # Go to YOLOv8 root folder
# %cd /content/drive/MyDrive/

# # Create folder
# !mkdir Argentina

# # Go to YOLOv8 root folder
# %cd /content/drive/MyDrive/Argentina

Mounted at /content/drive


In [6]:
import shutil
import os

source_folder = "/content/gemma-finetuned-adapters"
destination_folder = "/content/drive/MyDrive/DS-OLD/gemma-3-regular"

# Create destination folder and any parent directories if they don't exist
os.makedirs(destination_folder, exist_ok=True)

# Get all files and directories in the source folder
for item in os.listdir(source_folder):
    source_item = os.path.join(source_folder, item)
    destination_item = os.path.join(destination_folder, item)

    # If it's a directory, copy the directory
    if os.path.isdir(source_item):
        # Check if destination directory exists
        if os.path.exists(destination_item):
            shutil.rmtree(destination_item)  # Remove existing directory
        shutil.copytree(source_item, destination_item)
    # If it's a file, copy the file
    else:
        shutil.copy2(source_item, destination_item)

print(f"All items copied from {source_folder} to {destination_folder}")

All items copied from /content/gemma-finetuned-adapters to /content/drive/MyDrive/DS-OLD/gemma-3-regular


In [1]:
# Go to YOLOv8 root folder
import os

target_dir = "/content/drive/MyDrive/DS-OLD/"

# Create the directory if it doesn't exist
if not os.path.exists(target_dir):
    os.makedirs(target_dir)
    print(f"Created directory: {target_dir}")
else:
    print(f"Directory already exists: {target_dir}")

# Change to the directory
%cd "{target_dir}"

Directory already exists: /content/drive/MyDrive/DS-OLD/
/content/drive/MyDrive/DS-OLD


# Continue Training

In [2]:
import shutil
import os

def remove_folder(folder_path):
    """
    Remove a folder and all its contents

    Args:
        folder_path: Path to the folder you want to remove
    """
    if os.path.exists(folder_path):
        if os.path.isdir(folder_path):
            try:
                # Remove directory and all its contents
                shutil.rmtree(folder_path)
                print(f"Successfully removed folder: {folder_path}")
            except Exception as e:
                print(f"Error removing folder: {e}")
        else:
            print(f"The path exists but is not a directory: {folder_path}")
    else:
        print(f"Folder does not exist: {folder_path}")

# Example usage
remove_folder('/content/gemma-finetuned-adapters/checkpoint-100')


Successfully removed folder: /content/gemma-finetuned-adapters/checkpoint-100


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
# from huggingface_hub import notebook_login # Uncomment if using notebook login

# --- Configuration ---
MODEL_ID = "google/gemma-3-4b-it"
DATASET_PATH = "llm_training_data_claude-22.jsonl" # Replace with your dataset path
OUTPUT_DIR = "gemma-finetuned-adapters" # Will be created fresh

# --- Authentication ---
# IMPORTANT: Replace the placeholder token below with your actual Hugging Face token.
# Using a manual token string like this is ONE way to authenticate.
# The recommended way is usually `huggingface-cli login` or `notebook_login()`.
# Choose ONE method only. If you set the token here, you don't need to run login commands.
token = "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" # Replace with your actual token from HF settings

# Make sure you are logged in via `huggingface-cli login` OR uncomment and run:
# notebook_login() # Only if you are in a notebook and token is None

# --- QLoRA Configuration ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Set based on GPU check below
    bnb_4bit_use_double_quant=False,
)

# --- LoRA Configuration ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ],
    task_type="CAUSAL_LM",
)

# --- Training Arguments ---
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=500,             # Starting with 3 epochs (adjust as needed)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4, # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=100,                 # Save checkpoint every 100 steps (adjust as needed)
    logging_steps=25,               # Log every 25 steps (adjust as needed)
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,                     # Will be set based on GPU check
    bf16=False,                     # Will be set based on GPU check
    max_grad_norm=0.3,
    max_steps=-1,                   # Train for num_train_epochs, or set a positive value for max steps
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# --- Check GPU Availability & Set Precisions ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available.")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
if torch.cuda.get_device_capability(0)[0] >= 8:
    print("GPU supports bfloat16. Setting bf16=True, fp16=False.")
    training_arguments.bf16 = True
    training_arguments.fp16 = False
    bnb_config.bnb_4bit_compute_dtype = torch.bfloat16
else:
    print("GPU does NOT support bfloat16. Setting bf16=False, fp16=True.")
    training_arguments.bf16 = False
    training_arguments.fp16 = True
    bnb_config.bnb_4bit_compute_dtype = torch.float16


# --- Load Tokenizer and Model ---
print(f"Loading tokenizer for model: {MODEL_ID}")
# This dictionary correctly adds the 'token' argument if the token variable is set
tokenizer_kwargs = {"trust_remote_code": True}
if token: tokenizer_kwargs["token"] = token
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_kwargs)
if tokenizer.pad_token is None:
    print("Setting pad_token = eos_token")
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Loading base model: {MODEL_ID} with QLoRA config")
# This dictionary correctly adds the 'token' argument if the token variable is set
model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
    "attn_implementation": "eager", # Use eager attention for Gemma3 (Recommended by HF docs)
}
if token: model_kwargs["token"] = token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

# --- Load Dataset ---
print(f"Loading dataset from: {DATASET_PATH}")
try:
    # Assuming the dataset is a JSON Lines file
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    print(f"Dataset loaded successfully. Number of examples: {len(dataset)}")
except FileNotFoundError:
     print(f"Error: Dataset file not found at {DATASET_PATH}"); exit()
except Exception as e:
    print(f"Error loading dataset: {e}"); exit()

# --- Preprocess Dataset ---
# This function formats each data sample into a single text string
# suitable for Causal LM training, using the Gemma-specific instruction format.
def format_instruction(sample):
    # Extract instruction and output, handle potential None or empty strings
    instruction = sample.get('instruction', '') or ''
    output = sample.get('output', '') or ''

    # Clean up potential leftover tokens/whitespace from data source
    instruction = instruction.strip().removeprefix("<s>").removesuffix("</s>").strip()
    output = output.strip().removeprefix("<s>").removesuffix("</s>").strip()

    # Format according to Gemma's instruction format: <s>[INST] Instruction [/INST] Output </s>
    # Ensure there's content before formatting
    if not instruction:
        print(f"Warning: Skipping sample with no instruction: {sample}")
        return {"text": ""} # Return empty text for samples to be filtered out
    if not output:
         print(f"Warning: Sample with instruction but no output: {sample}")
         # Optionally format without output if you want to train on prompts only
         # For this setup (SFT), we usually expect prompt-response pairs
         # Returning empty text will effectively skip this sample for training
         return {"text": ""}


    return f"<s>[INST] {instruction} [/INST] {output} </s>"

print("Formatting dataset...")
try:
    # Apply the formatting function to each sample
    dataset = dataset.map(lambda sample: {"text": format_instruction(sample)})
    # Filter out samples that resulted in empty text (e.g., missing instruction/output)
    dataset = dataset.filter(lambda sample: sample["text"] != "")

    print("Dataset formatted and filtered.")
    if not dataset: print("Warning: Processed dataset is empty. Check your data file and formatting function."); exit()
    print(f"Example formatted text:\n{dataset[0]['text']}")
except Exception as e:
    print(f"Error formatting dataset: {e}"); exit()

# --- Initialize Trainer ---
print("Initializing SFTTrainer...")
# SFTTrainer handles wrapping the base model with LoRA adapters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text", # The column containing the formatted text
    max_seq_length=512,        # Maximum sequence length for training
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,             # Set to True for more efficient training if max_seq_length varies widely
)

# --- Train the Model ---
print("Starting training...")
try:
    # If you had a previous checkpoint and want to resume, uncomment the line below
    # trainer.train(resume_from_checkpoint=True)
    trainer.train(resume_from_checkpoint=True) # Resuming training
    print("Training finished.")
except Exception as e:
    print(f"Error during training: {e}"); exit()

# --- Save the Trained Adapters ---
# SFTTrainer automatically saves the model/adapters to output_dir
print(f"Saving LoRA adapters to: {OUTPUT_DIR}")
trainer.save_model()
print("Adapters saved successfully.")

# --- Optional: Clean up memory ---
# Free up GPU memory by deleting model and trainer objects
print("Cleaning up memory...")
if 'model' in locals(): del model
if 'trainer' in locals(): del trainer
# Additional manual cleanup for objects potentially holding onto memory
if 'tokenizer' in locals() and hasattr(tokenizer, 'model'): del tokenizer.model
if 'pipe' in locals() and hasattr(pipe, 'model'): del pipe.model
if 'inf_model' in locals(): del inf_model
torch.cuda.empty_cache() # Clear cached memory
print("Memory cleaned.")

# --- Optional: Test Inference ---
# Load the base model again and merge the adapters for inference
print("\n--- Testing Inference with Adapters ---")
# Set logging verbosity to critical to avoid excessive output from transformers
logging.set_verbosity(logging.CRITICAL)
try:
    # Load base model for inference - use float16 compute dtype typically for inference
    print("Loading base model for inference...")
    # Use a fresh BitsAndBytesConfig for inference if needed, or reuse the training one
    # Ensure compute_dtype is compatible with your GPU for inference, often float16 is safe
    inf_bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16, # Use float16 for inference compute
        bnb_4bit_use_double_quant=False )
    inf_model_kwargs = { "quantization_config": inf_bnb_config, "device_map": "auto",
                         "trust_remote_code": True, "attn_implementation": "eager" }
    if token: inf_model_kwargs["token"] = token # Pass token for inference model load too
    inf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **inf_model_kwargs)

    # Load the trained LoRA adapters
    print(f"Loading adapters from: {OUTPUT_DIR}")
    # PeftModel wraps the base model and applies the adapters
    inf_model = PeftModel.from_pretrained(inf_model, OUTPUT_DIR)
    print("Adapters loaded onto base model.")

    # Load the tokenizer again for inference
    inf_tokenizer_kwargs = {"trust_remote_code": True}
    if token: inf_tokenizer_kwargs["token"] = token # Pass token for inference tokenizer load
    inf_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **inf_tokenizer_kwargs)
    if inf_tokenizer.pad_token is None: inf_tokenizer.pad_token = inf_tokenizer.eos_token
    inf_tokenizer.padding_side = "right" # For generation, padding side should be 'right'

    # Prepare a test prompt from the dataset's first example
    # We need to extract just the instruction part for the prompt
    test_formatted_text = dataset[0]['text']
    inst_end_marker = "[/INST]"
    prompt_end_index = test_formatted_text.find(inst_end_marker)

    if prompt_end_index != -1:
        # Extract only the part up to and including the instruction end marker
        # This simulates giving the model an instruction to complete
        prompt_for_model = test_formatted_text[:prompt_end_index + len(inst_end_marker)].strip()
    else:
        # Fallback in case the formatting wasn't perfect, try to reconstruct the prompt
        # This might not be ideal if the formatting function changes
        print("Warning: Could not find [/INST] marker in formatted text. Attempting to reconstruct prompt.")
        prompt_for_model = f"<s>[INST] {dataset[0].get('instruction', '').strip()} [/INST]"

    # Ensure the prompt starts with the correct BOS token if not already present
    if not prompt_for_model.startswith(inf_tokenizer.bos_token):
         prompt_for_model = inf_tokenizer.bos_token + prompt_for_model

    print(f"\nTest Prompt:\n---\n{prompt_for_model}\n---")

    # Create a text generation pipeline using the adapted model and tokenizer
    pipe = pipeline(task="text-generation", model=inf_model, tokenizer=inf_tokenizer, max_new_tokens=150)

    # Generate text based on the prompt
    # The pipe will handle tokenizing the prompt, running inference, and decoding
    result = pipe(prompt_for_model)

    print("\nGenerated Output:")
    # The result is a list of dictionaries. We take the text from the first result.
    generated_text = result[0]['generated_text']

    # Post-process the generated text to show only the generated response
    # Find the end of the prompt in the generated text
    prompt_end_in_generated = generated_text.find(prompt_for_model)
    if prompt_end_in_generated != -1:
      # Start of the generated response is after the prompt
      answer_start_index = prompt_end_in_generated + len(prompt_for_model)
      answer = generated_text[answer_start_index:].strip()

      # Remove the final EOS token if it exists
      if answer.endswith(inf_tokenizer.eos_token):
          answer = answer[:-len(inf_tokenizer.eos_token)].strip()
      print(answer)
    else:
      # If the prompt wasn't found in the generated text (shouldn't happen with pipeline),
      # print the whole generation or a warning
      print("Warning: Prompt not found in generated text. Printing full output:")
      print(generated_text)


    # Print the expected output from the dataset for comparison
    print("\nExpected Output (from dataset):")
    print(dataset[0].get('output', 'N/A')) # Use .get for safety


except Exception as e:
    print(f"\nError during inference testing: {e}")

print("\n--- Script Finished ---")

CUDA available: True
Device name: Tesla T4
GPU does NOT support bfloat16. Setting bf16=False, fp16=True.
Loading tokenizer for model: google/gemma-3-4b-it
Loading base model: google/gemma-3-4b-it with QLoRA config


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading dataset from: llm_training_data_claude-22.jsonl
Dataset loaded successfully. Number of examples: 596
Formatting dataset...
Dataset formatted and filtered.
Example formatted text:
<s>[INST] Extract the following information from the given text of driving license: Address, First_Name, Height, Last_Name, Sex, State

Text: Driver
ucend - No
j U 216 *
Licence
JONES MARGARET CHERYL
50 SMITH ST
SMITHVILLE QLD 9999
itninre
aply
26-03-80 F 170
01-07-2006 30-06-2011
Queensland
Goverament
eens and Transport [/INST] {"Last_Name": "JONES", "First_Name": "MARGARET CHERYL", "Address": "50 SMITH ST\nSMITHVILLE QLD 9999", "Sex": "F", "Height": "170", "State": "Queensland"} </s>
Initializing SFTTrainer...


  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
3125,0.255
3150,0.2682
3175,0.3055
3200,0.2824
3225,0.3041
3250,0.2999
3275,0.319
3300,0.2643
3325,0.267


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json.
Access to model google/gemma-3-4b-it is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-3-4b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json.
Access to model google/gemma-3-4b-it is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the fi

# Zip

In [None]:
import zipfile
import os

# Function to zip a folder without downloading
def zip_folder(folder_path, output_zip_name):
    """
    Zip an entire folder in Colab

    Args:
        folder_path: Path to the folder you want to zip
        output_zip_name: Name for the output zip file
    """
    # Create a ZipFile object
    with zipfile.ZipFile(output_zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create the full file path
                file_path = os.path.join(root, file)
                # Calculate path within the zip file
                arcname = os.path.relpath(file_path, os.path.dirname(folder_path))
                # Add file to zip
                zipf.write(file_path, arcname)

    print(f"Folder '{folder_path}' has been zipped to '{output_zip_name}'")

# Example usage
zip_folder('/content/gemma-finetuned-adapters', 'models.zip')

# Inference

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import PeftModel
import os

# --- Configuration (Should match your training script) ---

# Base Model identifier
MODEL_ID = "google/gemma-3-1b-it"

# Path where your trained LoRA adapters were saved
ADAPTER_PATH = "./gemma-finetuned-adapters" # Use the same OUTPUT_DIR from training

# Optional: Authentication token if required by the base model
# Define 'token' same way as in training if needed (CLI login, notebook_login, or string)
token = None # Or your "hf_..." token string if you used that method

# --- Determine Compute dtype based on GPU (Same logic as training) ---
if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    compute_dtype = torch.bfloat16
    print("GPU supports bfloat16. Using bfloat16.")
else:
    compute_dtype = torch.float16
    print("GPU does NOT support bfloat16. Using float16.")

# --- QLoRA Configuration for Inference Loading ---
# Use the same quantization settings as training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype, # Use determined compute dtype
    bnb_4bit_use_double_quant=False,
)

# --- Check GPU Availability ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. Inference requires a GPU for this setup.")
print(f"Loading model onto device: cuda:{torch.cuda.current_device()}")


# --- Load the Base Model with Quantization ---
print(f"Loading base model: {MODEL_ID}")
model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto", # Load onto available GPU(s) automatically
    "trust_remote_code": True,
}
if token:
    model_kwargs["token"] = token

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    **model_kwargs
)
print("Base model loaded.")

# --- Load the Tokenizer ---
print(f"Loading tokenizer: {MODEL_ID}")
tokenizer_kwargs = {"trust_remote_code": True}
if token:
    tokenizer_kwargs["token"] = token
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_kwargs)

# Set pad token if needed (important for batching/padding during generation if used)
if tokenizer.pad_token is None:
    print("Setting pad_token = eos_token")
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Consistent with training
print("Tokenizer loaded.")

# --- Load the LoRA Adapters onto the Base Model ---
print(f"Loading LoRA adapters from: {ADAPTER_PATH}")
# This automatically merges the adapters onto the base model layers
inference_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
print("LoRA adapters loaded and merged onto base model.")

# --- Prepare for Inference ---
logging.set_verbosity(logging.CRITICAL) # Suppress verbose generation messages
pipe = pipeline(
    task="text-generation",
    model=inference_model, # Use the model with adapters loaded
    tokenizer=tokenizer,
    # device=0 # pipeline usually handles device placement with device_map="auto"
)
print("Inference pipeline ready.")

# --- Example Inference ---
# **IMPORTANT**: Format your prompt EXACTLY like you did for training!
test_instruction = '''Extract information from the given text of driving license: California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009
'''

# Replace with your actual test instruction text
prompt = f"<s>[INST] {test_instruction.strip()} [/INST]"

print(f"\n--- Running Inference ---")
print(f"Prompt:\n{prompt}")

# Set generation parameters
# max_new_tokens determines how many tokens to generate *after* the prompt
generation_args = {
    "max_new_tokens": 100, # Adjust as needed
    "do_sample": True,    # Whether to sample; False means greedy decoding
    "temperature": 0.7, # Controls randomness (lower = more deterministic) - use if do_sample=True
    "top_k": 50,        # Considers top_k tokens for sampling - use if do_sample=True
    "top_p": 0.95,      # Uses nucleus sampling - use if do_sample=True
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id, # Often set same as EOS for generation
}

try:
    outputs = pipe(prompt, **generation_args)

    print("\nGenerated Output:")
    # Extract only the generated part (after the prompt)
    generated_text = outputs[0]['generated_text']
    inst_end_marker = "[/INST]"
    answer_start_index = generated_text.find(inst_end_marker)
    if answer_start_index != -1:
        answer = generated_text[answer_start_index + len(inst_end_marker):].strip()
        # Remove potential EOS token at the end if needed
        if answer.endswith(tokenizer.eos_token):
            answer = answer[:-len(tokenizer.eos_token)].strip()
        print(answer)
    else:
       print(generated_text) # Print full output if prompt marker wasn't found

except Exception as e:
    print(f"Error during generation: {e}")

print("\n--- Inference Finished ---")

GPU does NOT support bfloat16. Using float16.
Loading model onto device: cuda:0
Loading base model: google/gemma-3-1b-it
Base model loaded.
Loading tokenizer: google/gemma-3-1b-it
Tokenizer loaded.
Loading LoRA adapters from: ./gemma-finetuned-adapters
LoRA adapters loaded and merged onto base model.
Inference pipeline ready.

--- Running Inference ---
Prompt:
<s>[INST] Extract information from the given text of driving license: California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009 [/INST]

Generated Output:
{"License_Class": "C", "License_Number": "11234568", "End": "NONE", "Country": "USA", "State": "California", "Class_Name": "VETERAN", "Driver_Name":