<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Finetune-LLAMA-2-On-Your-DataSet-AutoTrain-From-Hugging-Face/blob/main/Fine_Tuning_LLM_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Install All the Required Packages

# Part 1: Import Libraries

In [None]:
# Import necessary libraries for model training and fine-tuning.
import torch  # For PyTorch functionality.
import pandas as pd  # For handling data in DataFrame format.
from datasets import Dataset  # For working with Hugging Face datasets.
from transformers import TrainingArguments, TextStreamer  # For training arguments and text streaming.
from unsloth.chat_templates import get_chat_template  # For applying chat templates.
from unsloth import FastLanguageModel, is_bfloat16_supported  # For loading and configuring the model.


# Part 2: Define Model and Tokenizer

In [None]:
# Upper bound on tokens per sequence; the same value is passed to the trainer later.
max_seq_length = 2048  # Adjust based on your model's capabilities.

# Arguments for loading the pre-quantized 4-bit base checkpoint.
base_model_kwargs = {
    "model_name": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "max_seq_length": max_seq_length,
    "load_in_4bit": True,  # Keep the weights in 4-bit precision to save memory.
    "dtype": None,  # None defers the dtype choice to Unsloth.
}

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "up_proj",
    "down_proj",
    "o_proj",
    "gate_proj",
]

# Wrap the base model for parameter-efficient fine-tuning with LoRA.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,  # Adapter rank.
    lora_alpha=16,  # Adapter scaling factor.
    lora_dropout=0,  # No dropout on the adapters.
    use_rslora=True,  # Rank-Stabilized LoRA.
    use_gradient_checkpointing="unsloth",  # Unsloth's gradient checkpointing mode.
)


# Part 3: Load Local Data

In [None]:
# Load the training data from a local CSV file.
import ast
import json

local_file_path = "path/to/your/local_file.csv"  # Update with your file path.
df = pd.read_csv(local_file_path)  # Nested values come back as plain strings.


def _parse_conversation(cell):
    """Turn one serialized "conversations" cell back into Python objects.

    CSV has no nested types, so a list of message dicts written to CSV is read
    back as a string. The chat template applied later needs the real list.

    Args:
        cell: One value from the "conversations" column.

    Returns:
        The decoded object when ``cell`` is a string; otherwise ``cell``
        unchanged (already-decoded values and missing cells pass through).

    Raises:
        ValueError, SyntaxError: If a string cell is neither valid JSON nor a
            valid Python literal.
    """
    if not isinstance(cell, str):
        return cell
    try:
        return json.loads(cell)
    except json.JSONDecodeError:
        # Cells written via str()/repr() use single quotes, which JSON rejects.
        return ast.literal_eval(cell)


# Only touch the column when it is present, so other CSV layouts load as before.
if "conversations" in df.columns:
    df["conversations"] = df["conversations"].map(_parse_conversation)

# Convert DataFrame to Hugging Face dataset.
dataset = Dataset.from_pandas(df)


# Part 4: Apply Chat Template

In [None]:
# Tell the template which record keys and speaker labels the data uses:
# messages carry "from"/"value", with speakers named "human" and "gpt".
chat_field_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}

# Replace the tokenizer with one configured for the ChatML template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping=chat_field_mapping,
)

def apply_template(examples):
    """Render every conversation in a batch as a single formatted string.

    Args:
        examples: A batch of dataset rows; its "conversations" entry is
            iterated as a sequence of conversations.

    Returns:
        A dict with one key, "text", holding the rendered strings in the
        same order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,  # Produce text, not token ids.
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}


# Run the formatter over the dataset in batches, producing the "text" field.
dataset = dataset.map(apply_template, batched=True)


# Part 5: Set Up Training


In [None]:
# Define the training parameters for the model.
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: BF16 when supported, FP16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimizer, schedule and bookkeeping settings for the run.
training_args = TrainingArguments(
    output_dir="output",  # Directory to save model checkpoints.
    seed=0,  # Random seed for reproducibility.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 sequences per optimizer step per device.
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,  # Log every step.
)

# Supervised fine-tuning on the formatted "text" field of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # Number of processes for data processing.
    packing=True,  # Sequence packing enabled.
    args=training_args,
)

# Start the training process.
trainer.train()


# Part 6: Test Inference, Save, and Push Model

In [None]:
# Prepare the model for inference.
from transformers import TextStreamer

# Switch the model to Unsloth's inference mode. The call is made for its
# effect on `model`; the result is deliberately not assigned back, so `model`
# stays valid even if the installed Unsloth version returns nothing here.
FastLanguageModel.for_inference(model)

# One-turn test conversation, using the same "from"/"value" keys as the training data.
messages = [
    {"from": "human", "value": "Is 9.11 larger than 9.9?"},  # Test prompt.
]

# Tokenize the prompt with the chat template and move the tensor to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Ask the template to open the reply turn.
    return_tensors="pt",
).to("cuda")

# Generate up to 128 new tokens, streaming the text as it is produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)

# Hub destinations. NOTE: pushing needs write access to this namespace, so
# replace it with your own account or organization before running.
merged_repo_id = "mlabonne/FineLlama-3.1-8B"
gguf_repo_id = "mlabonne/FineLlama-3.1-8B-GGUF"
merged_save_method = "merged_16bit"  # Save in 16-bit precision.

# Write the merged model locally, then upload the same format to the Hub.
model.save_pretrained_merged("model", tokenizer, save_method=merged_save_method)
model.push_to_hub_merged(merged_repo_id, tokenizer, save_method=merged_save_method)

# Upload one GGUF export per quantization method.
quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
for quant_method in quant_methods:
    model.push_to_hub_gguf(gguf_repo_id, tokenizer, quant_method)

# Report completion.
print("Congratulations, we fine-tuned a model from scratch and uploaded quants you can now use in your favorite inference engine.")
