In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above hides this
# cell's output, so pip failures are silent unless the magic is removed.
import os
# Colab is detected by looking for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install that lets pip resolve dependencies itself.
    !pip install unsloth
else:
    # On Colab: these packages are installed with --no-deps, i.e. without pip
    # resolving their dependencies.
    # NOTE(review): presumably done to avoid disturbing Colab's preinstalled
    # stack — confirm the xformers pin still matches the Colab torch build.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from google.colab import drive

# Google Drive is mounted here; the checkpoint and model paths used later in
# the notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Set UNSLOTH_RETURN_LOGITS=1 in the kernel environment. This cell runs before
# the cell that imports unsloth.
# NOTE(review): presumably this makes Unsloth return logits from the forward
# pass (needed for evaluation) — confirm against the Unsloth documentation.
%env UNSLOTH_RETURN_LOGITS=1

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. `max_seq_length` is reused later by the trainers and by the
# evaluation tokenizer, so it stays a notebook-level name.
max_seq_length = 2048
dtype = None          # forwarded unchanged to from_pretrained
load_in_4bit = True   # forwarded unchanged to from_pretrained

# Gather the loader arguments in one mapping, then unpack them into the call.
_base_model_kwargs = dict(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_base_model_kwargs)

In [None]:
# Fixed prompt set for a qualitative look at the model before any training:
# three Tamil prompts followed by two English sanity checks.
prompts_to_test = [
    # --- In-Domain (Tamil) Prompts ---
    "திருக்குறளை எழுதியவர் யார்?",
    "பொங்கல் பண்டிகை பற்றி ஒரு பந்தி எழுதுக.",
    "கோவிட்-19 பற்றி 75-100 வார்த்தைகளுக்குள் ஒரு கட்டுரை தருக.",

    # --- Out-of-Domain (Sanity Check) Prompts ---
    "What is the capital of Japan?",
    "Write a short sentence about a computer."
]

print("--- Running 'Before Training' Evaluation ---")

divider = "-" * 30
for question in prompts_to_test:
    # Tokenize the raw prompt (no chat template) and move the tensors to the GPU.
    encoded = tokenizer(question, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, max_new_tokens=100)

    # The whole first returned sequence is decoded and printed as the response.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"Prompt: {question}")
    print(f"Response: {decoded}\n")
    print(divider, "\n")

In [None]:
# Modules that receive LoRA adapters. The last two entries target the token
# embeddings and the output head in addition to the projection layers.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

# Wrap the loaded model with PEFT adapters. Note this rebinds `model`, so the
# cell is not idempotent: re-running it wraps the already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 128,
    lora_alpha = 32,
    use_rslora = True,
    lora_dropout = 0,
    bias = "none",
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

In [None]:
# End-of-sequence token string taken from the loaded tokenizer;
# formatting_prompts_func appends it to each document.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples, eos_token = None):
    """Append the end-of-sequence token to every document in a batch.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch mapping with a ``"text"`` key holding a list of
            documents (strings; entries may be empty or ``None``).
        eos_token: Token string to append. Defaults to the notebook-level
            ``EOS_TOKEN`` when omitted.

    Returns:
        ``{"text": [...]}`` with exactly one output entry per input entry.
        Non-empty documents get the EOS token appended. Empty or ``None``
        documents become ``""`` (no EOS), so they stay recognisable and can be
        dropped with ``Dataset.filter``.
    """
    if eos_token is None:
        eos_token = EOS_TOKEN
    # Row alignment matters: a batched map that returns fewer rows than it
    # received fails when the dataset still carries other columns. So falsy
    # entries are kept as empty strings instead of being skipped.
    return {
        "text": [text + eos_token if text else "" for text in examples["text"]],
    }

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; runs before the cell that downloads the dataset.
# NOTE(review): presumably required because the dataset loaded next needs an
# authenticated account — confirm.
notebook_login()

In [None]:
from datasets import load_dataset


# Sampling knobs, kept together so the slice size is easy to tune.
SLICE_FRACTION = 0.02   # share of the full corpus carved out for this notebook
EVAL_FRACTION = 0.5     # share of that slice held out for evaluation
SPLIT_SEED = 42

full_dataset = load_dataset("uonlp/CulturaX", "ta", split = "train")

print("Step 1: Carving a small slice from the full dataset...")
small_slice = full_dataset.train_test_split(
    train_size=SLICE_FRACTION, shuffle=True, seed=SPLIT_SEED
)["train"]

print("Step 2: Splitting the small slice into train and eval sets...")
final_split = small_slice.train_test_split(
    test_size=EVAL_FRACTION, shuffle=True, seed=SPLIT_SEED
)


def _has_text(example):
    """Return True for rows whose "text" field is non-empty."""
    return bool(example["text"])


print("Formatting datasets...")
# Empty documents are removed *before* the batched map. The formatting
# function must return one row per input row while the metadata columns are
# still attached; filtering first keeps it from ever seeing an empty document.
train_dataset = final_split["train"].filter(_has_text).map(formatting_prompts_func, batched = True)
eval_dataset = final_split["test"].filter(_has_text).map(formatting_prompts_func, batched = True)

print(f"Final training set size: {len(train_dataset)}")
print(f"Final evaluation set size: {len(eval_dataset)}")

In [None]:
from transformers import TrainingArguments
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Build a trainer with periodic evaluation and checkpointing.
#
# The evaluation/checkpoint schedule (eval_strategy, eval_steps, save_strategy,
# save_steps, load_best_model_at_end) consists of TrainingArguments fields, so
# it is configured inside `args` rather than passed to the trainer constructor.
# `eval_strategy` is the current name of the former `evaluation_strategy`.
#
# NOTE(review): `trainer` is rebound inside the iterative training loop further
# down before `.train()` is called, so this instance is not the one that
# actually trains — confirm whether the loop should reuse this configuration.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,

    args = UnslothTrainingArguments(
        remove_unused_columns = False,
        output_dir = "/content/drive/MyDrive/My_CPT_Checkpoints",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        max_steps = 120,
        warmup_steps = 10,
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        logging_steps = 10,
        report_to = "none",

        # Evaluate and checkpoint on the same step schedule;
        # load_best_model_at_end requires the two strategies to match.
        eval_strategy = "steps",
        eval_steps = 50,
        save_strategy = "steps",
        save_steps = 50,
        load_best_model_at_end = True,
    ),
)

print("Trainer initialized successfully.")

In [None]:
import os

# Iterative continued pretraining: each iteration draws a fresh random 1%
# subset of the corpus and trains `steps_per_iteration` more optimizer steps,
# resuming optimizer/scheduler state from the previous iteration's checkpoint.
num_training_iterations = 5
steps_per_iteration = 120
checkpoint_dir = "/content/drive/MyDrive/My_CPT_Checkpoints"

for i in range(num_training_iterations):
    print(f"--- Starting Training Iteration {i+1}/{num_training_iterations} ---")

    print(f"Creating a new random data subset with seed={i}...")
    # NOTE(review): subsets are drawn from `full_dataset`, the same corpus the
    # held-out `eval_dataset` was split from, so evaluation documents can
    # reappear here — confirm whether they should be excluded.
    dataset_subset = full_dataset.train_test_split(train_size=0.01, seed=i)["train"]
    dataset_subset = dataset_subset.remove_columns(["url", "timestamp", "source"])

    trainer = UnslothTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset_subset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 4,

        args = UnslothTrainingArguments(
            output_dir = checkpoint_dir,
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 8,
            # max_steps counts *global* steps, and a resumed run starts at the
            # checkpoint's global step. The target therefore has to grow each
            # iteration; with a constant 120, every resumed iteration would
            # already be at its limit and stop without meaningful training.
            max_steps = steps_per_iteration * (i + 1),
            # Guarantee that a checkpoint exists exactly at the iteration
            # boundary, which the next iteration resumes from.
            save_strategy = "steps",
            save_steps = steps_per_iteration,
            # Each iteration uses a different subset, so fast-forwarding the
            # dataloader to the old position would only discard fresh data.
            ignore_data_skip = True,
            warmup_steps = 10,
            learning_rate = 5e-5,
            embedding_learning_rate = 1e-5,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            # NOTE(review): the schedule is built against the larger max_steps
            # on every resume — confirm the resulting LR curve is acceptable.
            lr_scheduler_type = "linear",
            seed = 3407,
            logging_steps = 10,
            remove_unused_columns = False,
            report_to = "none",
        ),
    )

    # Resume from the checkpoint written at the end of the previous iteration.
    # An explicit path is used instead of `True` so that a stale, higher-step
    # checkpoint left in the directory by an earlier run is never picked up.
    if i > 0:
        resume_checkpoint = os.path.join(checkpoint_dir, f"checkpoint-{steps_per_iteration * i}")
    else:
        resume_checkpoint = False

    print(f"Starting trainer.train() with resume_from_checkpoint={resume_checkpoint}")
    trainer.train(resume_from_checkpoint = resume_checkpoint)

print("--- Completed all training iterations! ---")

In [None]:

# Destination folder on the mounted Drive for the model after continued pretraining.
cpt_model_path = "/content/drive/MyDrive/My_Models/CPT_Model_Iter1"

# Write the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(cpt_model_path)

print("Model saved successfully to Google Drive!")

In [None]:
import torch
import gc

# 1. Delete the old trainer object that is holding onto the optimizer states.
#    Guarded so that re-running this cell (when `trainer` is already gone)
#    does not abort with a NameError.
try:
    del trainer
except NameError:
    print("No trainer object found; nothing to delete.")
else:
    print("Trainer object deleted.")

# 2. Run the Python garbage collector to free up CPU memory
gc.collect()
print("Garbage collection complete.")

# 3. Clear the GPU's memory cache (objects still referenced, such as `model`,
#    keep their memory).
torch.cuda.empty_cache()
print("CUDA cache cleared. GPU memory should now be freed.")

In [None]:

import math


def tokenization_func(examples):
    """Tokenize a batch of documents for evaluation.

    Args:
        examples: Batch mapping whose ``"text"`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        ``max_seq_length`` tokens.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_length, padding="max_length")


# Drop the original columns (raw text and metadata strings). With
# remove_unused_columns=False below, every remaining column is handed to the
# data collator, which can only batch token fields.
tokenized_eval_dataset = eval_dataset.map(
    tokenization_func,
    batched=True,
    remove_columns=eval_dataset.column_names,
)


print("Initializing a new trainer for evaluation...")
trainer_for_eval = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_eval_dataset,
    args = UnslothTrainingArguments(
        output_dir="temp_eval_dir",
        per_device_eval_batch_size = 1,
        remove_unused_columns=False,
    ),
)

print("Evaluating...")
final_metrics = trainer_for_eval.evaluate(eval_dataset=tokenized_eval_dataset)
# evaluate() reports the mean loss under "eval_loss"; there is no
# "eval_perplexity" entry. Perplexity is the exponential of that loss.
final_perplexity = math.exp(final_metrics["eval_loss"])
print(f"\nFinal Perplexity after CPT: {final_perplexity:.4f}")



In [None]:
print("[Qualitative Analysis]")
# Same prompt set as the pre-training check, so outputs can be compared.
prompts_to_test = [
    "திருக்குறளை எழுதியவர் யார்?",
    "பொங்கல் பண்டிகை பற்றி ஒரு பந்தி எழுதுக.",
    "கோவிட்-19 பற்றி 75-100 வார்த்தைகளுக்குள் ஒரு கட்டுரை தருக.",
    "What is the capital of Japan?",
    "Write a short sentence about a computer."
]

for prompt in prompts_to_test:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=50, use_cache=True)

    # The returned sequence starts with the prompt tokens. Cut at the prompt's
    # token length rather than slicing the decoded string by len(prompt):
    # decoding does not have to reproduce the prompt character for character,
    # which would shift a character-based cut.
    prompt_token_count = inputs["input_ids"].shape[1]
    answer_only = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    print(f"Prompt: {prompt}")
    print(f"Answer: {answer_only}")
    print("-" * 25)

In [None]:
from huggingface_hub import HfApi

# Token is read from the HF_TOKEN2 environment variable; os.getenv yields None
# when the variable is unset.
hub_token = os.getenv("HF_TOKEN2")
api = HfApi(token=hub_token)

# Push the saved model folder from Drive to the model repository on the Hub.
upload_options = {
    "folder_path": "/content/drive/MyDrive/My_Models/CPT_Model_Iter1",
    "repo_id": "iCIIT/ThamizhiLLM",
    "repo_type": "model",
}
api.upload_folder(**upload_options)
