<a href="https://colab.research.google.com/github/Varsha-Jeyaraj/SharedTask_IIT_2025/blob/main/Thamizhi_FT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture cell magic on the first
# line hides everything this cell prints.
import os
# All environment variable names are joined into one string and searched for
# the substring "COLAB_" to decide which install path to take.
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" variable found: plain install, pip resolves the dependencies.
    !pip install unsloth
else:
    # A "COLAB_" variable exists: install the listed packages with --no-deps
    # (xformers pinned to 0.0.29.post3), then the remaining packages with normal
    # dependency resolution, and finally unsloth itself with --no-deps.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Mount Google Drive at /content/drive; later cells read the CPT model from,
# and write checkpoints and the final model to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Set the UNSLOTH_RETURN_LOGITS environment variable to 1 before unsloth is
# imported in the next cell.
# NOTE(review): presumably this makes Unsloth return logits from the forward
# pass - confirm against the Unsloth documentation.
%env UNSLOTH_RETURN_LOGITS=1

In [None]:
from unsloth import FastLanguageModel
import torch
from google.colab import drive

# --- Loading configuration -------------------------------------------------
# max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
# dtype is forwarded unchanged to from_pretrained.
# NOTE(review): None presumably lets Unsloth choose the dtype for the GPU - confirm.
dtype = None
# Load the weights 4-bit quantized.
load_in_4bit = True

# Drive folder holding the continued-pretraining (CPT) model.
cpt_model_path = "/content/drive/MyDrive/My_Models/CPT_Model_Iter1"

# --- Make the Drive folder reachable ----------------------------------------
print("Mounting Google Drive...")
drive.mount('/content/drive')

# --- Load model and tokenizer -----------------------------------------------
print(
    f"\nLoading the CPT model from: {cpt_model_path}",
    "This model has your trained LoRA adapters separate from the base model.",
    sep = "\n",
)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = cpt_model_path,
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
)

print("\n--- CPT Model and Adapters Loaded Successfully! ---")

In [None]:
from datasets import load_dataset

# Load the "train" split of abhinand/tamil-alpaca; it is formatted in the next
# cells and then passed to the trainer as train_dataset.
sft_dataset = load_dataset("abhinand/tamil-alpaca", split="train")

In [None]:
# Print one record (index 9) to inspect the fields of the raw dataset.
print(sft_dataset[9])

In [None]:
# Original English Alpaca prompt with two slots (instruction, response).
# It is not referenced anywhere else in the visible notebook; it only documents
# what the Tamil template below was derived from.
_alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""
# Becomes:
# Tamil two-slot template used by formatting_prompts_func. The literal starts
# with a newline, so every formatted prompt begins with an empty line.
alpaca_prompt = """
ஒரு பணியை விவரிக்கும் ஒரு வழிமுறை கீழே உள்ளது. கோரிக்கையை சரியாக பூர்த்தி செய்யும் பதிலை எழுதுங்கள்.

### பணி:
{}

### பதில்:
{}"""

# End-of-sequence token string of the loaded tokenizer; the formatter appends
# it to every training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset records.

    Args:
        examples: Batch mapping with "instruction" and "output" entries, each a
            sequence of strings of the same length (the function is applied
            with ``batched = True``).

    Returns:
        A dict with the single key "text" holding one formatted prompt per
        record, each terminated with ``EOS_TOKEN``.

    Note:
        ``alpaca_prompt`` has only two slots (task and response), so the
        dataset's "input" column is not part of the generated text.
    """
    return {
        "text": [
            alpaca_prompt.format(instruction, output) + EOS_TOKEN
            for instruction, output in zip(examples["instruction"], examples["output"])
        ],
    }

# Apply the two-slot formatter in batches and rebind sft_dataset to the result.
# NOTE(review): the next cell maps another formatter that also returns a "text"
# field, which presumably replaces the column produced here - confirm whether
# this step is still needed.
sft_dataset = sft_dataset.map(formatting_prompts_func, batched = True,)

In [None]:

# Tamil prompt template with three slots, filled in this order by the formatter
# below and by the inference cell: instruction, input, output.
# The literal starts with a newline, so every formatted prompt begins with an
# empty line; training and inference both use this same template.
alpaca_prompt_template = """
ஒரு பணியை விவரிக்கும் ஒரு வழிமுறை கீழே உள்ளது. கோரிக்கையை சரியாக பூர்த்தி செய்யும் பதிலை எழுதுங்கள்.

### பணி:
{}

### உள்ளீடு:
{}

### பதில்:
{}"""


# End-of-sequence token string of the loaded tokenizer, appended to every
# training example by the formatter below.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func_for_tamil_alpaca(example):
    """Render one dataset record into its full training text.

    Args:
        example: A single record providing the "instruction", "input" and
            "output" fields.

    Returns:
        A dict with the single key "text": ``alpaca_prompt_template`` filled
        with the three fields, followed by ``EOS_TOKEN``.
    """
    filled_prompt = alpaca_prompt_template.format(
        example["instruction"],
        example["input"],
        example["output"],
    )
    return {"text": filled_prompt + EOS_TOKEN}


# Apply the three-slot formatter record by record (non-batched map) and rebind
# sft_dataset to the formatted result.
print("Formatting the SFT dataset...")
sft_dataset = sft_dataset.map(formatting_prompts_func_for_tamil_alpaca)
print("Dataset formatted successfully.")

# Show the first formatted record as a quick sanity check of the template.
print("\nHere is an example of a formatted prompt:")
print(sft_dataset[0]["text"])

In [None]:
from transformers import TrainingArguments
from unsloth import UnslothTrainer, UnslothTrainingArguments, is_bfloat16_supported

# Drive folder that receives a checkpoint every `save_steps` optimizer steps.
sft_output_dir = "/content/drive/MyDrive/My_Models/SFT_Checkpoints"

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        save_strategy = "steps",
        save_steps = 50,
        # 4 sequences x 4 accumulation steps = 16 sequences per optimizer step.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,

        # The run length is fixed by max_steps. A positive max_steps takes
        # precedence over num_train_epochs, so no epoch count is set here.
        max_steps = 120,
        warmup_steps = 10,

        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,
        # The model is loaded with dtype = None, so its precision is decided at
        # load time. Select the matching mixed-precision mode from the GPU's
        # capabilities instead of forcing fp16, which conflicts with a model
        # loaded in bfloat16 on bf16-capable GPUs.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),

        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        remove_unused_columns = False,
        output_dir = sft_output_dir,
        report_to = "none",
    ),
)

In [None]:
trainer_stats = trainer.train()

In [None]:
import torch

max_memory = round(torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024, 3)
start_gpu_memory = torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
from transformers import TextStreamer

# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Build a single prompt from the training template. The input and output slots
# are left empty, so the prompt ends right after the response header and the
# model continues from there.
inputs = tokenizer(
    [
        alpaca_prompt_template.format(
            "தமிழ் இசை பற்றி 50-75 வார்த்தைகளுக்குள் ஒரு கட்டுரை தருக.",  # instruction
            "",  # input
            "",  # output
        )
    ],
    return_tensors = "pt",
).to("cuda")

# Stream tokens to the cell output as they are generated (at most 128 new tokens).
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
# Destination on Google Drive for the merged model.
final_model_path_drive = "/content/drive/MyDrive/My_Models/SFT_Model_Final_16bit"

print(f"\nSaving merged 16-bit model to Google Drive at: {final_model_path_drive}")
# Write the model together with its tokenizer using the "merged_16bit" save method.
model.save_pretrained_merged(
    final_model_path_drive,
    tokenizer,
    save_method = "merged_16bit",
)
print("Save to Drive complete.")

In [None]:
from google.colab import userdata

# Read the Hugging Face write token from the Colab secret named "HF_TOKEN2".
# The secret's value must be passed to the Hub, not its name, and keeping it in
# Colab secrets avoids hardcoding a credential in the notebook.
hf_write_token = userdata.get("HF_TOKEN2")

# Target repository on the Hugging Face Hub.
repo_id = "Thamizhi-Tamil-BaseModel-Parameters-FT"

# NOTE(review): this repeats the merged save to the same Drive path that the
# previous cell already performs; it is kept so this cell still produces the
# Drive copy when run on its own. Consider dropping one of the two.
final_model_path_drive = "/content/drive/MyDrive/My_Models/SFT_Model_Final_16bit"
model.save_pretrained_merged(final_model_path_drive, tokenizer, save_method = "merged_16bit")

# Upload the merged 16-bit model and tokenizer to the Hub.
print(f"Uploading merged model to Hugging Face Hub: {repo_id}")
model.push_to_hub_merged(repo_id, tokenizer, save_method = "merged_16bit", token = hf_write_token)
print("Upload to Hub complete!")