## Link For The File (Kaggle):

https://www.kaggle.com/code/rameshbabuash/unsloth2-uncompiled-kaggle

# Install Dependencies
Installs core libraries needed for training, tokenization, and data handling.


In [1]:
%%capture
!pip install --no-deps bitsandbytes==0.45.3 accelerate==1.4.0 peft==0.14.0 triton==3.2.0 trl==0.15.2
!pip install sentencepiece==0.2.0 protobuf==5.29.3 datasets==3.3.2 huggingface-hub==0.29.1 hf_transfer==0.1.9
!pip install rich==13.9.4 psutil==7.0.0 safetensors==0.5.3
!pip install --upgrade transformers

# GPU Info
Prints the number of GPUs and their names.


In [2]:
import torch

# Count the CUDA devices visible to this kernel.
num_gpus = torch.cuda.device_count()
print("Number of GPUs available:", num_gpus)

# Report every device's name, one line per device index.
gpu_names = (torch.cuda.get_device_name(idx) for idx in range(num_gpus))
for idx, gpu_name in enumerate(gpu_names):
    print(f"GPU {idx}: {gpu_name}")


Number of GPUs available: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


# Distributed Training Script: unsloth.py

This script sets up distributed training using Accelerate and FSDP2. It:
- Sets up environment variables.
- Initializes and cleans up the distributed process group.
- Loads a quantized model (Meta-Llama-3.1-8B) and applies LoRA.
- Loads a dataset and trains using SFTTrainer.


In [3]:
%%writefile /kaggle/working/unsloth.py
import os
import warnings

# Environment variables are exported BEFORE the third-party imports below:
# some libraries read their settings once, at import time (huggingface_hub
# evaluates HF_HUB_ENABLE_HF_TRANSFER when it is first imported), so setting
# them after `from transformers import ...` would have no effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Allocator rounding only; the unsupported expandable_segments option is
# deliberately left out to avoid warnings.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "roundup_power2_divisions:[32:256,64:128,256:64,>:32]"
# Optionally suppress XLA warnings.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from peft import get_peft_model, LoraConfig, TaskType
from accelerate import Accelerator

# Suppress warnings
warnings.filterwarnings("ignore")

def setup_distributed():
    """Join the torch.distributed process group for this rank, if there is one.

    The NCCL group is only created when the script was started by a
    distributed launcher, detected through the RANK and WORLD_SIZE environment
    variables. Without them (for example a plain ``python unsloth.py`` run)
    the function returns without initializing anything instead of failing
    during rendezvous. If a group exists afterwards, all ranks synchronize on
    a barrier.
    """
    dist = torch.distributed
    if not dist.is_available():
        # Builds without distributed support have nothing to set up.
        return

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    launched_distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ

    if launched_distributed and not dist.is_initialized():
        # Bind this process to its own GPU before NCCL creates communicators,
        # so ranks do not all default to device 0.
        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)
        dist.init_process_group(backend="nccl")

    if dist.is_initialized():
        # Use a list for device_ids as required.
        dist.barrier(device_ids=[local_rank])

def cleanup_distributed():
    """Destroy the torch.distributed process group if one is currently initialized."""
    if not torch.distributed.is_initialized():
        # Nothing was set up, so there is nothing to tear down.
        return
    torch.distributed.destroy_process_group()

def main():
    """Fine-tune a 4-bit quantized Llama 3.1 8B with LoRA adapters via SFTTrainer.

    Steps: join the process group, load the quantized base model onto this
    process's GPU, attach LoRA adapters, load the first 10% of the
    unified_chip2 dataset and train for 100 steps. The process group is torn
    down in a ``finally`` block so it is released even when loading or
    training raises.
    """
    try:
        # Setup distributed training environment.
        setup_distributed()

        # Set the seed for reproducibility.
        # NOTE(review): this value (42) differs from SFTConfig(seed=3407) below - confirm intentional.
        set_seed(42)

        model_name = "unsloth/Meta-Llama-3.1-8B"

        # Set default data type and define quantization configuration.
        torch.set_default_dtype(torch.float16)
        dtype = torch.float16
        bnb_config = BitsAndBytesConfig(
            load_in_4bit              = True,
            bnb_4bit_use_double_quant = True,
            bnb_4bit_quant_type       = "nf4",
            bnb_4bit_compute_dtype    = dtype,
            # Quantized weights are stored as float16, the same dtype set as default above.
            bnb_4bit_quant_storage    = torch.float16
        )

        # Initialize Accelerator for device mapping.
        accelerator = Accelerator()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            attn_implementation = "sdpa",
            quantization_config = bnb_config,
            # CUDA device ordinals are per machine, so place the whole model by
            # the local process index (equal to process_index on one machine).
            device_map = {"": accelerator.local_process_index},
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token

        # Configure LoRA for parameter-efficient fine-tuning.
        lora_config = LoraConfig(
            r = 64,
            lora_alpha = 128,
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                              "gate_proj", "up_proj", "down_proj"],
            lora_dropout = 0,
            bias = "none",
            task_type = TaskType.CAUSAL_LM,
        )

        # Apply LoRA to the model and freeze non-LoRA parameters: only the
        # lora_A / lora_B matrices keep requires_grad=True.
        model = get_peft_model(model, lora_config)
        with torch.no_grad():
            for name, param in model.named_parameters():
                param.requires_grad_(".lora_A." in name or ".lora_B." in name)

        # Disable the generation KV cache while gradient checkpointing is enabled.
        model.config.use_cache = False

        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()

        # Get dataset (first 10% of the train split).
        url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
        dataset = load_dataset("json", data_files={"train": url}, split="train[:10%]")

        # Pick the trainer precision flag from the dtype of the input embeddings.
        embedding_dtype = model.get_input_embeddings().weight.dtype

        # Create the SFTTrainer for training the model.
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            processing_class=tokenizer,
            args=SFTConfig(
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                warmup_steps=1,
                max_steps=100,
                logging_steps=1,
                output_dir="outputs",
                seed=3407,
                max_seq_length=2048,
                fp16=embedding_dtype == torch.float16,
                bf16=embedding_dtype == torch.bfloat16,
                report_to="none",  # For W&B
                dataset_num_proc=4,
                gradient_checkpointing=True,
                gradient_checkpointing_kwargs={"use_reentrant": True},
                # NOTE(review): lists input_ids and attention_mask as label
                # names in addition to labels - confirm this is intentional.
                label_names = ["input_ids", "labels", "attention_mask"]
            ),
        )

        accelerator.print(f"Model Summary:\n{trainer.model}")

        # Optionally print trainable parameters if the method is available.
        if hasattr(trainer.model, "print_trainable_parameters"):
            trainer.model.print_trainable_parameters()

        # Begin training.
        trainer.train()
    finally:
        # Clean up distributed resources, also when an error interrupted the run.
        cleanup_distributed()

if __name__ == "__main__":
    main()


Writing /kaggle/working/unsloth.py


# Accelerate Configuration (config.yaml)

This configuration file sets up distributed training using FSDPv2 with the following key settings:
- **compute_environment:** Running on a local machine.
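- **distributed_type:** Uses Fully Sharded Data Parallel (FSDP) for efficient distributed training. Note: the YAML below sets `distributed_type: FSDP` without an `fsdp_version` key, so confirm that the pinned Accelerate release (1.4.0) actually runs FSDP2 rather than the original FSDP implementation.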
- **fsdp_config:** Detailed options for FSDP2, such as wrapping policy, prefetching, parameter offloading, and sharding strategy.
- **machine_rank, num_processes:** Define the rank of this machine and the total number of processes to launch (one per GPU; with a single machine this equals the per-machine count).
- **TPU and CPU Settings:** TPU options are disabled, and the training will run on GPUs.

Each setting is annotated with inline comments in the YAML file.


In [4]:
%%writefile /kaggle/working/config.yaml
# Accelerate launch configuration: one machine, two processes, FSDP sharding.
compute_environment: LOCAL_MACHINE  # Running on a local machine
debug: false                        # Debug mode disabled
# NOTE(review): no fsdp_version key is set in this file - confirm which FSDP implementation this selects.
distributed_type: FSDP              # Use Fully Sharded Data Parallel (FSDP) for distributed training
downcast_bf16: 'no'                 # Do not downcast BF16 precision
fsdp_config:                        # FSDP-specific configuration
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP  # Automatically wrap transformer modules for FSDP
  fsdp_backward_prefetch: BACKWARD_PRE             # Prefetch during the backward pass (BACKWARD_PRE mode)
  fsdp_cpu_ram_efficient_loading: true            # Enable efficient CPU RAM loading during FSDP initialization
  fsdp_forward_prefetch: false                     # Disable forward prefetching
  fsdp_offload_params: true                        # Offload parameters to CPU when not in use
  fsdp_sharding_strategy: FULL_SHARD               # Use the FULL_SHARD sharding strategy
  fsdp_state_dict_type: SHARDED_STATE_DICT         # Save state dict in a sharded format
  fsdp_sync_module_states: true                    # Synchronize module states across processes
  fsdp_use_orig_params: false                      # Do not use original parameters
machine_rank: 0                      # Rank of this machine in multi-machine setups
main_training_function: main         # Name of the main training function to execute
mixed_precision: 'no'                # No mixed precision set here; unsloth.py passes its own fp16/bf16 flags to SFTConfig
num_machines: 1                      # Total number of machines involved in training
num_processes: 2                     # Total number of processes to launch (one per GPU)
rdzv_backend: static                 # Use a static backend for rendezvous (process group setup)
same_network: true                   # All machines are on the same network
tpu_env: []                          # TPU environment settings (empty indicates TPU not used)
tpu_use_cluster: false               # Do not use a TPU cluster
tpu_use_sudo: false                  # TPU sudo access is not required
use_cpu: false                       # Training will run on GPUs, not on CPU


Writing /kaggle/working/config.yaml


# Launching Distributed Training with Accelerate

This cell uses the `accelerate` launcher to start the training script (`unsloth.py`) using the configuration specified in `config.yaml`.  
- The configuration file sets up distributed training with FSDP2 and GPU settings.
- The script is executed across the processes defined in the config.


In [5]:
!accelerate launch --config_file "/kaggle/working/config.yaml"  /kaggle/working/unsloth.py

2025-03-05 15:03:47.034655: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-05 15:03:47.034646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-05 15:03:47.233961: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-05 15:03:47.233967: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-05 15:03:47.292500: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register fac