In [1]:
!pip install transformers datasets trl peft bitsandbytes accelerate torch sentencepiece huggingface_hub -U
!pip install --upgrade trl transformers accelerate
!pip install --upgrade deepspeed==0.14.4

# ATTENTION: Be sure to restart the notebook after installing all packages!

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

In [1]:
# Authenticate this kernel with the Hugging Face Hub before any model download.
# interpreter_login() asks for the access token interactively, so no token is
# hard-coded in the notebook.
# NOTE(review): presumably required because the base model repo is gated — confirm.
from huggingface_hub import interpreter_login
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



In [3]:
# 1. Import necessary libraries
import os
import torch
from datasets import load_dataset, Dataset # Added Dataset for dummy data handling
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    HfArgumentParser # Optional: for more advanced argument parsing
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
import warnings # To suppress specific warnings if needed

print("loaded libs")

# Suppress specific warnings if they become noisy (optional)
# warnings.filterwarnings("ignore", category=FutureWarning, module="bitsandbytes")

# --- Configuration ---

# Model and Tokenizer configuration
base_model_id = "meta-llama/Llama-3.2-3B-Instruct" # Hub id of the base model that is fine-tuned below
new_model_name = "llama-3.2-3b-instruct-finetuned" # Name for the fine-tuned model; only referenced by the commented-out hub_model_id below

# Dataset configuration
# *** IMPORTANT: CHANGE THIS TO YOUR ACTUAL CSV FILE PATH ***
dataset_path = "topic_question_approach_trunc.csv"
# The CSV must provide 'prompt' and 'completion' columns; the formatting step
# further down derives the 'text' column that the trainer consumes.
data_files = {"train": dataset_path}

# LoRA configuration (Parameter-Efficient Fine-Tuning)
lora_r = 16              # LoRA rank. Higher values train more parameters but require more memory.
lora_alpha = 32         # LoRA scaling factor. Here set to 2 * lora_r.
lora_dropout = 0.05     # Dropout probability applied inside the LoRA layers.
lora_target_modules = [ # Names of the modules that receive LoRA adapters (architecture dependent).
                        # These are the attention and MLP projection layers.
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Quantization configuration (QLoRA - Optional, requires bitsandbytes)
# Loading the base model in 4-bit reduces memory usage significantly.
use_4bit = True         # Enable 4-bit quantization. Set to False to load the model unquantized.
bnb_4bit_compute_dtype = "bfloat16" # Name of the torch dtype used for compute on the 4-bit weights (resolved with getattr(torch, ...) later). Use "float16" on GPUs without bfloat16 support.
bnb_4bit_quant_type = "nf4"           # Quantization type passed to BitsAndBytesConfig ("nf4" = NormalFloat 4-bit).
use_nested_quant = False              # Nested (double) quantization for 4-bit base models; passed as bnb_4bit_use_double_quant.

# Training arguments (all forwarded to SFTConfig further down)
output_dir = "./results_llama3_finetuned" # Directory for checkpoints, logs, the final adapter and metrics.
num_train_epochs = 2                # Number of complete passes through the training data.
per_device_train_batch_size = 2     # Batch size per GPU. Reduce if you encounter Out-Of-Memory (OOM) errors.
gradient_accumulation_steps = 4     # Accumulate gradients over X steps before updating model weights.
                                    # Effective batch size = batch_size * grad_accum * num_gpus.
optim = "paged_adamw_32bit"         # Optimizer name. Paged optimizers are commonly paired with QLoRA to limit memory spikes.
save_strategy = "steps"             # Save checkpoints based on steps or epochs.
save_steps = 100                    # Save a checkpoint every X update steps.
logging_strategy = "steps"          # Log metrics based on steps or epochs.
logging_steps = 10                  # Log training info (loss, etc.) every X update steps.
learning_rate = 2e-4                # Peak learning rate; shaped over time by warmup_ratio and lr_scheduler_type below.
weight_decay = 0.001               # Weight decay for regularization.
fp16 = False                        # float16 mixed precision. Keep False while bf16 is True (enable only one of the two).
bf16 = True                         # bfloat16 mixed precision (requires Ampere+ GPU).
max_grad_norm = 0.3                # Gradient clipping max norm.
max_steps = -1                      # Maximum number of training steps. Overrides num_train_epochs if set > 0.
warmup_ratio = 0.03                # Fraction of training steps used for learning-rate warmup.
group_by_length = True             # Batch sequences of similar length together to reduce padding.
lr_scheduler_type = "cosine"       # Learning rate scheduler type (e.g., "linear", "cosine").
report_to = "tensorboard"          # Where to report metrics (e.g., "wandb", "tensorboard", "none"). Requires the respective library.
# --- Hugging Face Hub Integration (Optional) ---
# push_to_hub = False # Set to True to push model adapter weights to the Hub after training
# hub_model_id = f"your_hf_username/{new_model_name}" # *** CHANGE 'your_hf_username' ***
# hub_token = None # Uses cached token by default if logged in (`huggingface-cli login`). Can set explicitly: "YOUR_HF_WRITE_TOKEN"

# --- Data Loading and Preprocessing ---

# Load the training split from the CSV file and validate its schema.
#
# Outcomes:
#   * success          -> `dataset` / `train_dataset` hold the CSV contents
#   * file missing     -> a tiny dummy dataset is substituted so the rest of the
#                         notebook structure can still be exercised
#   * any other error  -> the error is reported and RE-RAISED. The previous
#                         `exit()` calls are avoided on purpose: inside a notebook
#                         they shut the kernel down and hide the traceback,
#                         whereas re-raising stops execution with full context.
print(f"Loading dataset from: {dataset_path}")
try:
    # Specify `sep=` for non-comma separators (e.g. sep='\t' for TSV) and
    # `names=[...]` if the CSV has no header row.
    dataset = load_dataset("csv", data_files=data_files)
    # For very large files consider: load_dataset("csv", data_files=data_files, streaming=True)
    print(f"Dataset loaded successfully: {dataset}")
    # 'train' is the split key declared in `data_files`.
    train_dataset = dataset['train']
    print(f"Dataset features: {train_dataset.features}")
    # Basic validation: the formatting step below needs both columns.
    if 'prompt' not in train_dataset.features or 'completion' not in train_dataset.features:
        raise ValueError("CSV file must contain 'prompt' and 'completion' columns.")

except FileNotFoundError:
    print(f"Error: Dataset file not found at {dataset_path}.")
    print("Please ensure the path is correct and the file exists.")
    # Fall back to a two-row dummy dataset so the pipeline can be smoke-tested.
    print("Creating a dummy dataset for demonstration purposes.")
    dummy_data = {
        'prompt': ["What is the capital of Pennsylvania?", "Explain LoRA fine-tuning."],
        'completion': ["The capital of Pennsylvania is Harrisburg.", "LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into a pre-trained model's layers, allowing efficient adaptation without modifying the original weights."]
    }
    train_dataset = Dataset.from_dict(dummy_data)
    dataset = {'train': train_dataset} # Match the DatasetDict-like structure
    print(f"Using dummy dataset: {dataset}")
    print(f"Dummy dataset features: {train_dataset.features}")
except ValueError as e:
    print(f"Error loading dataset: {e}")
    raise  # stop here with a traceback instead of killing the kernel
except Exception as e:
    print(f"An unexpected error occurred during dataset loading: {e}")
    raise  # stop here with a traceback instead of killing the kernel


# Define the formatting function based on Llama 3.2 Instruct's chat template
# *** CRITICAL STEP: Verify and adapt this function to the EXACT chat template ***
# *** required by meta-llama/Llama-3.2-3B-Instruct. Check the model card! ***
# The template usually involves specific tokens like <|begin_of_text|>, <|start_header_id|>, etc.
# Using the wrong format will significantly degrade performance.
# def format_dataset(example):
#     """
#     Formats a prompt-completion pair into the Llama 3.2 Instruct chat template.
#     Replace this with the correct template structure found on the model card.
#     """
#     prompt = example['prompt']
#     completion = example['completion']

#     # Example Llama 3.2 Instruct format (VERIFY THIS STRUCTURE!)
#     formatted_text = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

# {prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# {completion}<|eot_id|>"""
#     return {"text": formatted_text}

# # Apply the formatting function to the dataset
# print("Formatting dataset according to chat template...")
# # Use batched=True for potentially faster processing
# formatted_dataset = train_dataset.map(format_dataset, batched=True)
# print("Dataset formatting complete.")
# # Optional: Remove original columns if they are no longer needed to save memory
# # formatted_dataset = formatted_dataset.remove_columns(['prompt', 'completion'])
# print(f"First example after formatting:\n{formatted_dataset[0]['text']}")

def format_dataset_batch(batch):
    """
    Render a batch of prompt/completion pairs as single-turn chat transcripts.

    Args:
        batch: mapping of column name -> list of values, as passed by
            `Dataset.map(..., batched=True)`. Must contain the 'prompt' and
            'completion' columns.

    Returns:
        A dict with one key, "text", holding one formatted string per
        (prompt, completion) pair. Pairs are matched positionally with `zip`,
        so the result is as long as the shorter of the two columns.

    The layout is an example Llama 3.2 Instruct format (VERIFY THIS STRUCTURE
    against the model card before relying on it).
    """
    user_turns = batch['prompt']
    assistant_turns = batch['completion']

    # One transcript per pair: user turn, then assistant turn, each closed by <|eot_id|>.
    return {
        "text": [
            (
                "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
                f"{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
                f"{assistant_turn}<|eot_id|>"
            )
            for user_turn, assistant_turn in zip(user_turns, assistant_turns)
        ]
    }

# Derive the 'text' column by running the batch formatter over the training split.
print("Formatting dataset according to chat template (using batch processing)...")
# batched=True hands the formatter a dict of lists rather than a single row.
# The original 'prompt'/'completion' columns are kept; pass
# remove_columns=train_dataset.column_names here (or call .remove_columns()
# afterwards) to drop them and save memory.
formatted_dataset = train_dataset.map(function=format_dataset_batch, batched=True)
print("Dataset formatting complete.")
# Show one rendered sample so the template can be checked by eye.
print(f"First example after formatting:\n{formatted_dataset[0]['text']}")



# --- Model and Tokenizer Loading ---

print(f"Loading base model: {base_model_id}")

# Resolve the configured dtype name (e.g. "bfloat16") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = None
if use_4bit:
    print("Using 4-bit quantization (QLoRA)")
    # bfloat16 needs an Ampere-class GPU (compute capability major >= 8).
    # The check must look for a *bfloat16* request; testing for float16 here
    # would mean the fallback never triggers when it is actually needed.
    # Skipped when CUDA is unavailable, because get_device_capability() needs a GPU.
    if compute_dtype == torch.bfloat16 and torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major < 8:
            print("=" * 80)
            print("Warning: Your GPU does not natively support bfloat16. Consider using float16.")
            print("Setting compute_dtype to float16 for compatibility.")
            print("=" * 80)
            # Update the shared variable too: it is reused as torch_dtype when
            # the base model is loaded, so it must agree with the bnb config.
            compute_dtype = torch.float16
            bf16 = False # Disable bf16 training if not supported
            fp16 = True  # Enable fp16 training instead

    # Built after the capability check so it carries the final compute dtype.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

# Load the tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, use_fast=True)

# Configure tokenizer padding.
# Padding with the EOS token makes padding indistinguishable from the real
# end-of-turn token: collators that mask pad_token_id in the labels (e.g.
# transformers' DataCollatorForLanguageModeling) then also mask every genuine
# EOS, so the model gets no training signal for *stopping*.
# Prefer the tokenizer's dedicated padding token when the vocabulary has one;
# fall back to EOS (the previous behaviour) only when it does not.
# NOTE(review): "<|finetune_right_pad_id|>" is the reserved pad token expected in
# the Llama 3.1/3.2 vocabulary — confirm against the downloaded tokenizer.
dedicated_pad_token = "<|finetune_right_pad_id|>"
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    if dedicated_pad_token in tokenizer.get_vocab():
        tokenizer.pad_token = dedicated_pad_token
    else:
        tokenizer.pad_token = tokenizer.eos_token
# Pad on the right side for training.
tokenizer.padding_side = "right"
print("Tokenizer loaded and configured.")

# Load the base model.
# Pick the dtype for the non-quantized parts of the model: the bitsandbytes
# compute dtype when quantizing, otherwise whichever mixed-precision mode is on.
if use_4bit:
    model_dtype = compute_dtype
elif bf16:
    model_dtype = torch.bfloat16
else:
    model_dtype = torch.float16

# device_map="auto" lets the loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype,
    quantization_config=bnb_config, # None when use_4bit is False
    device_map="auto",
    trust_remote_code=True,
)

# Training-time model settings.
model.config.pretraining_tp = 1 # Tensor parallelism degree (1 = none)
model.config.use_cache = False # No KV cache while training
print("Base model loaded.")


# --- PEFT Configuration (LoRA) ---

print("Configuring PEFT (LoRA)...")

# Describe the adapter first; this object is independent of the model instance.
peft_config = LoraConfig(
    task_type="CAUSAL_LM", # Causal language modelling
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none", # Bias terms are not trained
)

# A quantized base model needs PEFT's k-bit preparation step before adapters
# are attached.
if use_4bit:
    model = prepare_model_for_kbit_training(model)
    print("Model prepared for k-bit training.")

# Attach the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(model, peft_config)
print("LoRA layers applied.")
model.print_trainable_parameters()


# --- Training Setup ---

print("Setting up Training Arguments...")
# Collect every training hyperparameter from the configuration section into a
# single SFTConfig (TRL's config class for SFTTrainer). Note that SFTConfig, not
# the imported TrainingArguments, is what is instantiated here; the SFT-specific
# options (dataset_text_field, packing) are set on this object as well.
training_arguments = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16, # float16 mixed precision; may have been flipped on by the GPU capability check
    bf16=bf16, # bfloat16 mixed precision; may have been flipped off by the GPU capability check
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=report_to,
    dataset_text_field="text",     # Column produced by format_dataset_batch that holds the formatted text
    packing=False,                 # Set to True to pack multiple short sequences into one sample. Can significantly speed up training but requires careful dataset preparation.
    # push_to_hub=push_to_hub, # Uncomment if pushing to Hub
    # hub_model_id=hub_model_id, # Uncomment if pushing to Hub
    # hub_token=hub_token,       # Uncomment if pushing to Hub
    # ddp_find_unused_parameters=False, # Set to False if encountering issues with Distributed Data Parallel (DDP)
    # dataloader_num_workers=4, # Optional: Increase number of workers for data loading if bottlenecked
)

print("Initializing SFTTrainer...")
# Build TRL's Supervised Fine-tuning Trainer from the LoRA-wrapped model, the
# formatted dataset, the tokenizer and the SFTConfig created above.
# NOTE(review): `model` was already wrapped with get_peft_model() earlier in this
# cell, so also passing peft_config here looks redundant — confirm how the
# installed TRL version treats an already-wrapped PeftModel.
trainer = SFTTrainer(
    model=model,                   # The PEFT-enhanced model
    train_dataset=formatted_dataset, # The preprocessed training dataset
    # eval_dataset=eval_dataset,   # Provide a preprocessed validation dataset for evaluation during training
    peft_config=peft_config,       # The PEFT configuration
    # dataset_text_field="text",     # The name of the column in the dataset containing the formatted text
    # max_seq_length=1024,           # Maximum sequence length for tokenization. Adjust based on VRAM and data. Sequences longer than this will be truncated.
    processing_class=tokenizer,           # The tokenizer
    args=training_arguments,       # The training arguments
    # packing=False,                 # Set to True to pack multiple short sequences into one sample. Can significantly speed up training but requires careful dataset preparation.
    # data_collator=data_collator, # Optional: Custom data collator if needed
)

# --- Start Training ---

print("Starting training process...")
# Run the full training loop and keep the returned result for the metrics below.
# Checkpoint resuming: train() is called without arguments here, so training
# starts from scratch even if output_dir already contains checkpoints.
# To resume, pass resume_from_checkpoint=True (latest checkpoint in output_dir)
# or resume_from_checkpoint="path/to/checkpoint" explicitly.
train_result = trainer.train()
print("Training finished.")

# --- Save Model and Metrics ---

print("Saving final LoRA adapter weights...")
# Write the trained model to output_dir. Because the trainer holds a PEFT-wrapped
# model, this is expected to store only the small adapter weights, not the base
# model — TODO confirm by inspecting the files written to output_dir.
trainer.save_model(output_dir)
print(f"LoRA adapter weights saved to {output_dir}")

# Log and persist the metrics returned by trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state() # Persists the trainer state (step counters, log history); optimizer/scheduler state lives in the checkpoints, not here
print("Training metrics and state saved.")

# --- Optional: Merge LoRA weights and save full model ---
# This step combines the trained LoRA weights with the base model weights
# to create a standard full model checkpoint. Requires more memory and time.

# print("Merging LoRA weights into the base model (optional)...")
# from peft import AutoPeftModelForCausalLM

# # Clear memory first (important before loading the full model)
# del model
# del trainer
# torch.cuda.empty_cache()
# import gc
# gc.collect()

# try:
#     # Load the PEFT model from the saved adapter directory
#     # This automatically loads the base model specified in adapter_config.json
#     merged_model = AutoPeftModelForCausalLM.from_pretrained(
#         output_dir,
#         low_cpu_mem_usage=True, # Try to minimize CPU RAM usage during loading
#         return_dict=True,
#         torch_dtype=torch.bfloat16 if bf16 else torch.float16, # Load in the target precision
#         device_map="auto", # Load onto available device(s)
#         trust_remote_code=True
#     )

#     # Merge the LoRA layers with the base model
#     merged_model = merged_model.merge_and_unload()
#     print("LoRA weights merged.")

#     # Define path for the final merged model
#     merged_model_path = os.path.join(output_dir, "final_merged_model")

#     # Save the merged model and tokenizer
#     # Use safe_serialization=True for the recommended safetensors format
#     merged_model.save_pretrained(merged_model_path, safe_serialization=True)
#     tokenizer.save_pretrained(merged_model_path)
#     print(f"Full fine-tuned model saved to {merged_model_path}")

#     # Optional: Push merged model to Hub (if configured)
#     # if push_to_hub and hub_model_id:
#     #     print(f"Pushing merged model to Hugging Face Hub: {hub_model_id}")
#     #     # Ensure you are logged in (`huggingface-cli login`)
#     #     merged_model.push_to_hub(hub_model_id, token=hub_token, safe_serialization=True)
#     #     tokenizer.push_to_hub(hub_model_id, token=hub_token)
#     #     print("Merged model pushed to Hub.")

# except Exception as e:
#     print(f"Error during model merging and saving: {e}")
#     print("Skipping model merging. LoRA adapters are saved in the main output directory.")


print("Training pipeline finished successfully.")



loaded libs
Loading dataset from: topic_question_approach_trunc.csv
Dataset loaded successfully: DatasetDict({
    train: Dataset({
        features: ['topic', 'prompt', 'completion'],
        num_rows: 9148
    })
})
Dataset features: {'topic': Value(dtype='string', id=None), 'prompt': Value(dtype='string', id=None), 'completion': Value(dtype='string', id=None)}
Formatting dataset according to chat template (using batch processing)...
Dataset formatting complete.
First example after formatting:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Investigating the effects of temperature and pressure on the phase transitions of correlated electron materials.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**Research Approach: Investigating the Effects of Temperature and Pressure on the Phase Transitions of Correlated Electron Materials**

---

### 1. Hypothesis
The phase transitions in correlated electron materials are strongly influenced by variations in temperature and 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded.
Configuring PEFT (LoRA)...
Model prepared for k-bit training.
LoRA layers applied.
trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511
Setting up Training Arguments...
Initializing SFTTrainer...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training process...


  return fn(*args, **kwargs)


Step,Training Loss
10,1.4382
20,1.3641
30,1.1879
40,1.0463
50,0.9599
60,1.0769
70,0.9839
80,0.911
90,0.8978
100,0.8495


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training finished.
Saving final LoRA adapter weights...
LoRA adapter weights saved to ./results_llama3_finetuned
***** train metrics *****
  total_flos               = 178522231GF
  train_loss               =      0.7249
  train_runtime            =  3:40:40.05
  train_samples_per_second =       1.382
  train_steps_per_second   =       0.173
Training metrics and state saved.
Training pipeline finished successfully.
