<a href="https://colab.research.google.com/github/SURESHBEEKHANI/AI-Quick-Summaries/blob/main/mistral_7b_finetuning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate peft bitsandbytes trl py7zr auto-gptq optimum transformers

In [None]:
# Print the NVIDIA GPU status of this runtime; the command fails if no
# NVIDIA driver/GPU is available, which is a quick sanity check before training.
!nvidia-smi

### **Step 1: Setup and Import Libraries**

In [None]:
# Import necessary libraries for Hugging Face transformers, datasets, and PEFT
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPTQConfig,
    TrainingArguments,
)
from trl import SFTTrainer

### **Step 2: Load and Preprocess the Dataset**

In [None]:
# Load the training split of the SAMSum dataset and move it into pandas so the
# prompt column can be built with ordinary DataFrame operations.
df = load_dataset("samsum", split="train")
data_df = df.to_pandas()

# Number of rows to fine-tune on (adjust to the available resources) and the
# seed that makes the random subset identical on every Restart & Run All.
SAMPLE_SIZE = 7361
SAMPLE_SEED = 42

# Cap the request at the number of available rows: DataFrame.sample raises when
# asked for more rows than exist (without replacement).
sampled_df = data_df.sample(
    n=min(SAMPLE_SIZE, len(data_df)),
    random_state=SAMPLE_SEED,
)


def _build_prompt(row):
    """Format one dialogue/summary pair as a single supervised fine-tuning text."""
    return f"###Human: Summarize this dialogue:\n{row['dialogue']}\n###Assistant: {row['summary']}"


# `assign` returns a new frame instead of mutating the sampled one in place.
sampled_df = sampled_df.assign(
    text=sampled_df[["dialogue", "summary"]].apply(_build_prompt, axis=1)
)

# Convert back to a Hugging Face Dataset. `preserve_index=False` keeps the
# shuffled pandas index from being stored as an extra `__index_level_0__` column.
data = Dataset.from_pandas(sampled_df, preserve_index=False)


### **Step 3: Load the Pre-trained Model and Tokenizer**

In [None]:
# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    # Right padding for *training*: TRL warns that left padding can cause
    # overflow issues when fine-tuning in half precision. Left padding is only
    # needed for batched generation.
    padding_side="right",
    add_eos_token=True,         # Append EOS so the model learns where a summary ends
    add_bos_token=True,         # Prepend BOS at the start of every sequence
)
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token

# Load the GPTQ-quantized model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    # The Transformers GPTQ guide recommends deactivating the ExLlama kernels
    # when a quantized model is fine-tuned with PEFT.
    quantization_config=GPTQConfig(bits=4, use_exllama=False),
    offload_folder="./offload",  # Directory for offloaded weights
    offload_state_dict=True,     # Offload state dictionaries to reduce memory pressure while loading
)

# The generation KV cache is useless during training and conflicts with the
# gradient checkpointing that prepare_model_for_kbit_training turns on by default.
model.config.use_cache = False

# Prepare the quantized model for PEFT training.
model = prepare_model_for_kbit_training(model)


### **Step 4: Configure LoRA (Low-Rank Adaptation)**

In [None]:
# LoRA adapter settings: only small low-rank update matrices are trained while
# the quantized base weights stay untouched.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",                # adapter is built for causal language modeling
    target_modules=["q_proj", "v_proj"],  # attach adapters to the attention query/value projections
    r=16,                                 # rank of the low-rank update matrices
    lora_alpha=16,                        # LoRA scaling factor
    lora_dropout=0.05,                    # dropout applied inside the LoRA layers
    bias="none",                          # bias setting passed to LoRA ("none")
)

# Wrap the base model with the adapter described above.
model = get_peft_model(model=model, peft_config=peft_config)


In [None]:
import gc

import torch

# Collect unreachable Python objects first so that any GPU tensors they still
# reference are actually freed; only then can empty_cache() hand the now-unused
# cached blocks back to the device.
gc.collect()
torch.cuda.empty_cache()

### **Step 5: Define Training Arguments**

In [None]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # --- outputs & logging ---
    output_dir="mistral-finetuned-samsum",  # checkpoints and outputs are written here
    save_strategy="epoch",                  # checkpoint-saving strategy
    logging_steps=100,                      # log metrics every 100 steps
    # --- schedule ---
    num_train_epochs=1,                     # nominal epoch count
    max_steps=250,                          # hard cap on optimizer steps; a positive value overrides num_train_epochs
    # --- batching ---
    per_device_train_batch_size=4,          # kept small to fit in GPU memory; lower it further on OOM
    gradient_accumulation_steps=1,          # batches accumulated before each optimizer update
    # --- optimization ---
    optim="paged_adamw_32bit",              # optimizer implementation
    learning_rate=2e-4,                     # initial learning rate
    lr_scheduler_type="cosine",             # learning-rate schedule
    fp16=True,                              # float16 mixed-precision training
)

# Collator that assembles and pads each training batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Tokenize the dataset with a max length and attach training labels
def tokenize_function(examples):
    """Tokenize a batch of prompts and add the `labels` needed to compute a loss.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)`; `examples['text']`
            is expected to be a list of strings.

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        `labels` entry: a copy of `input_ids` in which every padding position
        (attention mask 0) is replaced by -100, the index ignored by the loss.
        Masking via the attention mask keeps the real EOS token as a target
        even though the pad token is the EOS token.
    """
    encoded = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)
    # Without labels the collator has nothing to build a target from and the
    # model returns logits only, so training cannot compute a loss.
    encoded["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Apply tokenization to the dataset
train_dataset = data.map(tokenize_function, batched=True)

### **Step 6: Initialize the Trainer**

In [None]:
# Build the supervised fine-tuning trainer.
# `model` has already been wrapped with the LoRA adapter by get_peft_model(),
# so `peft_config` is intentionally NOT passed again: supplying both a PeftModel
# and a peft_config is redundant, and newer TRL releases reject the combination.
# NOTE(review): recent TRL versions rename the `tokenizer` argument to
# `processing_class` -- confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_arguments,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

### **Step 7: Train the Model**

In [None]:
# Start the training process using the trainer configured above; as the last
# expression in the cell, the value returned by train() is shown as cell output.
trainer.train()