In [None]:
!pip install trl peft datasets huggingface_hub bitsandbytes accelerate wandb

Log in to Hugging Face

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secret store (never hard-code it)
# and use it to authenticate this session with the Hub.
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HUGGINGFACE_API_KEY")
login(token=token)

Importing libraries

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
import torch

Define the dataset name and language configuration

In [None]:
# OSCAR is a multilingual corpus; the configuration name below selects its
# unshuffled, deduplicated Tamil ("ta") portion.
dataset_name, language = "oscar", "unshuffled_deduplicated_ta"

Load the dataset

In [None]:
# Load only the first 1,000 training records of the selected Tamil configuration.
dataset = load_dataset(
    dataset_name,
    language,
    split="train[:1000]",
    trust_remote_code=True,
)

Print the dataset length and a few samples

In [None]:
# Report how many records were loaded.
dataset_length = len(dataset)
print(f"Length of the dataset: {dataset_length}")

# Show the text of the first two records, numbered from 1.
for sample_number in (1, 2):
    print(f"Sample {sample_number}: {dataset[sample_number - 1]['text']}")

This config loads the model with memory-efficient 4-bit (NF4) quantization so that it fits in limited GPU memory during fine-tuning.

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Store the weights in 4-bit precision
    # Compute in bfloat16 so the quantized layers use the same precision as the
    # training run (bf16=True in the training arguments) and the bfloat16 model
    # load used for inference, instead of mixing in float16.
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Enable double quantization for memory efficiency
    bnb_4bit_quant_type="nf4"  # NF4 quantization type
)

# Load the base checkpoint with the quantization config applied;
# device_map="auto" leaves device placement to the library.
base_model = "google/gemma-2-2b"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=bnb_config
)

Disables caching and enables gradient checkpointing to save memory.

In [None]:
# Turn off the generation cache on the model config for the training run.
model.config.use_cache = False
# Enable gradient checkpointing on the model to save memory.
model.gradient_checkpointing_enable()

Initializes the tokenizer, sets padding side, and calculates token lengths.

In [None]:
# Tokenizer matching the base checkpoint; EOS doubles as the padding token and
# padding is appended on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Number of token ids produced for each record's text (used for the histogram).
token_lengths = []
for record in dataset:
    encoded = tokenizer(record['text'], truncation=True)
    token_lengths.append(len(encoded['input_ids']))

Plot the token length distribution

In [None]:
import matplotlib.pyplot as plt

# Histogram of per-record token counts, drawn through the explicit Axes API.
fig, ax = plt.subplots()
ax.hist(token_lengths, bins=30, color='blue')
ax.set_title('Token Length Distribution')
ax.set_xlabel('Token Length')
ax.set_ylabel('Frequency')
plt.show()

The training parameters below configure the fine-tuning run. **`per_device_train_batch_size=16`** sets the batch size for each GPU, while **`gradient_accumulation_steps=16`** accumulates gradients over 16 batches to simulate a larger effective batch size (16 × 16 = 256 sequences per device) without exceeding memory limits. **`optim="adamw_torch"`** selects the AdamW optimizer, whose decoupled weight decay (**`weight_decay=0.01`**) helps prevent overfitting. Checkpoints are saved every 100 steps (**`save_steps=100`**), keeping at most 2 checkpoints (**`save_total_limit=2`**). Logging is performed every 10 steps (**`logging_steps=10`**), and evaluation occurs at the end of each epoch (**`eval_strategy="epoch"`**). A cosine learning rate schedule (**`lr_scheduler_type="cosine"`**) with an initial warmup phase stabilizes training, while **`fp16=False`** and **`bf16=True`** select bfloat16 mixed precision. **`gradient_checkpointing=True`** saves memory by recomputing intermediate activations during the backward pass instead of storing them. Overall, these settings aim to balance efficiency, model performance, and resource management during training.

In [None]:
import os

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./gemma-2b-oscar-tamil-finetuned",
    logging_dir="./logs",
    num_train_epochs=50,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=16,  # Effective batch of 16 x 16 sequences per device
    optim="adamw_torch",
    save_steps=100,  # Save a checkpoint every 100 optimizer steps
    save_total_limit=2,  # Keep only 2 checkpoints
    logging_steps=10,
    eval_strategy="epoch",  # Evaluate after each epoch
    lr_scheduler_type="cosine",  # Cosine schedule for smooth learning rate decay
    # Warmup is given only as a fraction of the run. The previous fixed
    # `warmup_steps=500` took precedence over this ratio and was larger than the
    # total number of optimizer steps for a 1,000-example subset at this batch
    # size, so the learning rate never reached its peak or started to decay.
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=False,  # For better accuracy
    bf16=True,
    group_by_length=True,
    report_to="none",  # The string "none" disables every logging integration; None does not
    gradient_checkpointing=True,
    learning_rate=2e-5,
    max_grad_norm=0.3,
    push_to_hub=False
)

The tokenize_function takes a batch of text examples and uses the tokenizer to convert them into token IDs, applying truncation to ensure that texts longer than 64 tokens are cut off, while shorter texts are padded to this maximum length. This standardization facilitates efficient training by ensuring consistent input sizes. The resulting tokenized dataset is created by mapping this function across the entire dataset. Additionally, the DataCollatorForLanguageModeling is instantiated to group the tokenized sequences into batches for training, with mlm=False indicating that a causal language modeling approach is used instead of masked language modeling. This setup prepares the data effectively for training a language model.

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch into fixed-length sequences.

    Each text is truncated and padded to exactly 64 tokens using the
    notebook-level ``tokenizer``; the tokenizer's output is returned as-is.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    return tokenizer(examples['text'], **fixed_length_options)

# Collator for causal language modeling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Tokenize the whole dataset, passing batches of examples to tokenize_function.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

LoRA (Low-Rank Adaptation) is used in models to reduce the number of trainable parameters by introducing low-rank matrices into specific layers (e.g., projection layers in transformers). This approach makes fine-tuning large models more memory-efficient and faster, especially for tasks like language modeling, without retraining the entire model. PEFT (Parameter-Efficient Fine-Tuning) refers to methods like LoRA that enable tuning only a small subset of parameters, improving efficiency. It's used to minimize resource consumption while still achieving performance close to full fine-tuning, making it ideal for adapting large-scale models to new tasks with limited computational power.

In [None]:
# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# Projection layers of the transformer that receive LoRA adapters.
lora_target_modules = ['o_proj', 'q_proj', 'up_proj', 'v_proj', 'k_proj', 'down_proj', 'gate_proj']

lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # Causal language modelling
    r=64,  # Rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor (half of the rank used here)
    lora_dropout=0.1,  # Dropout on the adapter path to limit overfitting
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters via PEFT (Parameter-Efficient Fine-Tuning).
model = get_peft_model(model, lora_config)

Set up the Trainer and fine-tune Gemma

In [None]:
from transformers import Trainer

# Hold out 10% of the tokenized examples so that the per-epoch evaluation is
# computed on data the model was not trained on; evaluating on the training
# set itself cannot reveal overfitting. The seed keeps the split reproducible.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
)
trainer.train()

Saving the trained model

In [None]:
# Save the trained model under the fine-tuning output directory.
trainer.save_model(output_dir="./gemma-2b-oscar-tamil-finetuned")

Plot training loss vs steps

In [None]:
# Keep only the log entries that carry a training loss, and pair each loss with
# the optimizer step at which it was logged. Plotting against the list index
# would mislabel the x-axis, because a loss is only logged every
# `logging_steps` steps.
loss_values = trainer.state.log_history
train_logs = [log for log in loss_values if 'loss' in log]
train_steps = [log['step'] for log in train_logs]
train_loss = [log['loss'] for log in train_logs]

plt.plot(train_steps, train_loss, label='Training Loss')
plt.title('Training Loss Over Time')
plt.xlabel('Training Step')
plt.ylabel('Loss')
plt.legend()
plt.show()

This code loads a **fine-tuned language model** with **LoRA adapters**, moves it to the appropriate device (GPU or CPU), and tokenizes a **Tamil prompt**. It then generates a story continuation using the model, applying specific parameters like beam search and sampling to improve the diversity and creativity of the generated text. Finally, the output is decoded back into readable text and printed.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Base model: reload the pre-trained "google/gemma-2-2b" checkpoint (without the
# fine-tuned adapters) in bfloat16 precision, together with its tokenizer.
base_model_name = "google/gemma-2-2b"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.bfloat16)

# Load the LoRA weights
peft_model_path = "/kaggle/working/gemma-2b-oscar-tamil-finetuned"  # Replace with the actual path to your fine-tuned model
model = PeftModel.from_pretrained(base_model, peft_model_path)

# Move the model to the device and set it to evaluation mode. The weights stay
# in the bfloat16 dtype they were loaded with; the earlier extra `.half()` call
# silently converted them to float16, contradicting the load above.
model = model.to(device).eval()

# Adjusted prompt for story continuation
prompt = '''
ஒரு காலத்தில், சின்ன கிராமத்தில் வசித்த ராமு பெரும் கனவுகள் கொண்ட சிறுவன். ஒரு நாள் அவன் ஒரு பெரிய பயணத்தை தொடங்கினான். காடு கடந்து ஒரு புதுமையான நகரத்தை கண்டான். அங்கே அவர் எதிர்கொண்டது...

இப்போது கதை தொடர்க.
'''

# Tokenize the prompt and move the tensors to the same device as the model.
# Previously the inputs stayed on the CPU, so generation on a GPU failed with a
# device mismatch that the `except` below merely printed. Token ids must remain
# integer tensors, so they are not cast to a floating-point dtype.
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)

# Generate the story with adjusted parameters
try:
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_beams=3,
            do_sample=True,
            temperature=0.6,
            top_k=30,
            top_p=0.95,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    # Decode and print the generated story
    generated_story = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Generated Story:")
    print(generated_story)

except RuntimeError as e:
    print(f"An error occurred during generation: {e}")