<a href="https://colab.research.google.com/github/Sidy3143/llm-projects/blob/main/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q fsspec==2025.3.0 gcsfs transformers accelerate peft bitsandbytes datasets trl

In [None]:
!pip install -q wandb

In [None]:
import wandb
wandb.login()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
model_name = "gpt2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0}
)

model.config.use_cache = False #no kv cache
model.config.pretraining_tp = 1

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = "right"
model.config.pad_token_id = model.config.eos_token_id

In [None]:
print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")

Model loaded: gpt2
Model parameters: 124,439,808


In [None]:
from datasets import load_dataset

ds = load_dataset("yahma/alpaca-cleaned")

In [None]:
ds['train'][0] #['instruction'] #format of the trainning examples

{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.'}

In [None]:
def process(example):
  instruction = example['instruction']
  input = example['input']
  output = example['output']

  if input.strip():
    prompt = (f"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}")
  else:
    prompt = (f"### Instruction:\n{instruction}\n\n### Response:\n{output}")

  return {'text': prompt}

formatted_data = ds.map(process, remove_columns=['input', 'output', 'instruction'])

In [None]:
formatted_data['train'][0]

{'text': '### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'}

In [None]:
def tokenize_function(example):
  tokenized = tokenizer(example['text'],
                        truncation=True,
                        max_length=512,)

  return tokenized

tokenized_dataset = formatted_data.map(tokenize_function, batched=True, remove_columns=['text'])

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 51760
    })
})

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType, PeftModel

In [None]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "c_fc"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM, # or "CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 126,799,104 || trainable%: 1.8607


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import SFTTrainer

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gpt2-chat",

    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size =4,

    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    fp16=False,
    bf16=True,  # Use mixed precision training

    dataloader_pin_memory=True,

    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",

    max_grad_norm=0.3,
    weight_decay=0.001,
    warmup_ratio=0.03,
    learning_rate=2e-4,

    group_by_length=True,

    logging_steps=100,
    save_steps=1000,
    eval_steps=1000,

    remove_unused_columns=False,

    report_to="wandb",
    load_best_model_at_end=False,
)

In [None]:
split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=43)
train_data = split_dataset['train']
eval_data = split_dataset['test']

In [None]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 46584
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5176
    })
})

In [None]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

In [None]:
print("\n=== Starting Training ===")
checkpoint_path = "/content/drive/MyDrive/gpt2-chat/checkpoint-2000" # for resuming
trainer.train(resume_from_checkpoint = checkpoint_path)


=== Starting Training ===




`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
2100,2.0786
2200,2.0524
2300,2.0589
2400,2.0489
2500,2.0539
2600,2.0416
2700,2.0428
2800,2.0651
2900,2.0376
3000,2.0817


TrainOutput(global_step=5823, training_loss=1.3310777126012872, metrics={'train_runtime': 2506.5698, 'train_samples_per_second': 37.17, 'train_steps_per_second': 2.323, 'total_flos': 9139313099390976.0, 'train_loss': 1.3310777126012872})

In [None]:
trainer.save_model(checkpoint_path)

In [None]:
from peft import PeftModel

In [None]:
#freeze the memory and reload the trained model
del model
del trainer

In [None]:
model_path = "/content/drive/MyDrive/gpt2-chat"
tokenizer_path = "/content/drive/MyDrive/gpt2-chat"

# Load the base model first (gpt2 in this case)
base_model = AutoModelForCausalLM.from_pretrained("gpt2")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

model = PeftModel.from_pretrained(base_model, model_path)

# Ensure the model is in evaluation mode
model.eval()

# Move the model to the CUDA device
import torch
device = torch.device("cuda")
model.to(device)



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
        

In [None]:
instruction = "Write a short story about a robot who learns to love."
prompt = (f"### Instruction:\n{instruction}\n\n### Response:")

In [None]:
import torch
device = torch.device("cuda")

In [None]:
input_ids = tokenizer(prompt, return_tensors="pt")
input_ids = input_ids.to(device)

In [None]:
output = model.generate(input_ids['input_ids'], max_length=200,)

# Decode the generated output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### Instruction:
Write a short story about a robot who learns to love.

### Response:
As a young boy, I was fascinated by the beauty and wonder of robots. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their successes. I loved the way they could learn from their mistakes and learn from their succes