<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Advanced-LLM-Fine-Tuning/blob/main/FineTuning_Mistral7B_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# `%%capture` must stay on the first line of the cell: it swallows everything the
# cell prints, so the long pip logs below are not shown in the notebook.

# Step 1: install the released `unsloth` package from PyPI. This install is
# allowed to resolve and pull in the package's dependencies.
!pip install unsloth

# Step 2: remove that release and replace it with the current GitHub HEAD.
# `--no-deps` leaves the dependencies installed in step 1 untouched, and
# `--no-cache-dir` forces a fresh download instead of reusing a cached wheel.
# NOTE(review): neither install is pinned to a version or commit, so two runs of
# this cell on different days can produce different environments.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel  # Importing the FastLanguageModel class from the unsloth library
import torch  # Importing PyTorch for handling tensors and computations

# ---------------------------------------------------------------------------
# Loading options (edit these three values to suit your hardware)
# ---------------------------------------------------------------------------

# Longest sequence, in tokens, that the model is configured to process.
# Per the library's notes, RoPE (Rotary Position Embedding) scaling is handled
# internally, so this value can be adjusted as needed.
max_seq_length = 2048

# Compute precision. None leaves the choice to the library; otherwise pass
# torch.float16 (Tesla T4 / V100) or torch.bfloat16 (Ampere and newer GPUs).
dtype = None

# Load the weights with 4-bit quantization to reduce memory usage.
# Set to False when memory is not a concern and higher precision is wanted.
load_in_4bit = True

# ---------------------------------------------------------------------------
# Fetch the pre-trained checkpoint. A single call returns both objects, which
# are unpacked into `model` and `tokenizer`.
#   model_name     - checkpoint to load; replace with any model of your choice
#   max_seq_length - token limit chosen above
#   dtype          - precision chosen above
#   load_in_4bit   - quantization switch chosen above
# ---------------------------------------------------------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/mistral-7b-v0.3",
)


In [None]:
# Attach LoRA (Low-Rank Adaptation) adapters to the loaded model so that it can
# be fine-tuned parameter-efficiently (PEFT).
# NOTE(review): the result is bound back to `model`, so re-running this cell
# passes an already-wrapped model in again; re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    # --- LoRA hyperparameters ---------------------------------------------
    # Rank of the low-rank update matrices. Typical choices: 8, 16, 32, 64, 128.
    # Higher ranks are more expressive but need more memory.
    r=16,
    # Scaling factor applied to the LoRA update.
    lora_alpha=16,
    # Dropout on the LoRA path; per Unsloth's notes, 0 is the optimized setting.
    lora_dropout=0,
    # Bias handling; per Unsloth's notes, "none" is the optimized setting.
    # Alternatives: "all", "lora_only".
    bias="none",
    # --- Where the adapters go --------------------------------------------
    # Adapters are attached only to the modules named here.
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # --- Memory / reproducibility -----------------------------------------
    # Gradient checkpointing trades compute for memory; Unsloth documents its
    # "unsloth" mode as using roughly 30% less VRAM.
    use_gradient_checkpointing="unsloth",
    # Fixed seed so adapter initialization is repeatable across runs.
    random_state=3407,
    # --- Optional variants (both disabled) --------------------------------
    # Rank-Stabilized LoRA (rsLoRA); aimed at stability with high ranks.
    use_rslora=False,
    # LoftQ (LoRA-Fine-Tuning-aware Quantization) settings; None leaves it off.
    loftq_config=None,
)


In [None]:
# Alpaca-style prompt template. It has three positional `{}` slots, filled in
# order by `str.format`: the instruction, the input/context, and the response.
# The literal text (including blank lines) is part of every training example,
# so keep it identical between training and inference.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token taken from the loaded tokenizer. It is appended after
# the response of every formatted training example, so the fine-tuned model
# learns to emit it when the response is complete instead of generating on.
EOS_TOKEN = tokenizer.eos_token

# Function to format examples using the Alpaca prompt template
def formatting_prompts_func(examples, default_instruction="Summarize the following dialogue."):
    """Render one batch of examples into Alpaca-style training strings.

    Intended for ``Dataset.map(..., batched=True)``, where ``examples`` maps
    each column name to a list of values. Two column layouts are accepted:

    * Alpaca layout: ``instruction`` / ``input`` / ``output`` columns, used
      as-is.
    * SAMSum layout: ``dialogue`` / ``summary`` columns. The dialogue fills the
      prompt's input slot, the summary fills the response slot, and every row
      gets ``default_instruction`` as its instruction.

    The previous version only read the Alpaca columns, so mapping it over the
    SAMSum dataset loaded in this notebook failed with ``KeyError``.

    Args:
        examples: Mapping of column name to a list of per-row values.
        default_instruction: Instruction text used for every row when the batch
            has no ``instruction`` column.

    Returns:
        ``{"text": [...]}`` with one formatted string per row, each ending in
        ``EOS_TOKEN``.

    Raises:
        KeyError: If the batch matches neither supported layout.
    """
    if "instruction" in examples:
        # Alpaca layout: every row carries its own instruction.
        instructions = examples["instruction"]
        contexts = examples["input"]
        responses = examples["output"]
    elif "dialogue" in examples and "summary" in examples:
        # SAMSum layout: same fixed instruction for every row.
        contexts = examples["dialogue"]
        responses = examples["summary"]
        instructions = [default_instruction] * len(contexts)
    else:
        raise KeyError(
            "expected columns 'instruction'/'input'/'output' or "
            "'dialogue'/'summary', got: " + repr(list(examples.keys()))
        )

    # EOS_TOKEN goes after the response so the model learns where to stop.
    texts = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in zip(instructions, contexts, responses)
    ]

    # A dict of column lists is the shape a batched map function returns.
    return {"text": texts}

# Fetch the training split of the "samsum" dataset.
from datasets import load_dataset

dataset = load_dataset(path="samsum", split="train")

# Run the formatter over the whole split. With batched=True the function is
# called with many rows at a time (a dict of column lists) rather than once per
# row, which is more efficient; the "text" list it returns is stored on the
# resulting dataset.
dataset = dataset.map(function=formatting_prompts_func, batched=True)
