<a href="https://colab.research.google.com/github/RavinduPabasara/Alpaca-DistilGPT2-Sinhala-Finetuning/blob/main/SinhalaFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets torch accelerate bitsandbytes
!pip install -q wandb
!pip install -q gradient-accumulation

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompat

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset
import json
import pandas as pd
import numpy as np
from torch.utils.data import Dataset as TorchDataset
import gc
from torch.cuda.amp import autocast
import os

In [4]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [5]:
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()


In [6]:
class SinhalaDataset(TorchDataset):
    def __init__(self, texts, tokenizer, max_length=256):
        batch_size = 32
        self.encodings = {'input_ids': [], 'attention_mask': []}

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_encodings = tokenizer(
                batch_texts,
                truncation=True,
                max_length=max_length,
                padding='max_length',
                return_tensors='pt'
            )
            self.encodings['input_ids'].append(batch_encodings['input_ids'])
            self.encodings['attention_mask'].append(batch_encodings['attention_mask'])

        self.encodings['input_ids'] = torch.cat(self.encodings['input_ids'])
        self.encodings['attention_mask'] = torch.cat(self.encodings['attention_mask'])

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

In [7]:
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    formatted_texts = []
    for item in data:
        formatted_text = f"Instruction: {item['instruction']}\nInput: {item['input']}\nOutput: {item['output']}"
        formatted_texts.append(formatted_text)

    print(f"Loaded {len(formatted_texts)} examples from dataset")
    return formatted_texts

def train_model(dataset_path):
    # Initialize tokenizer and model
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token='<|endoftext|>')

    # Load model with optimizations
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    # Load dataset
    print("Loading dataset...")
    texts = load_dataset(dataset_path)

    # Create dataset
    print("Creating dataset...")
    dataset = SinhalaDataset(texts, tokenizer)

    # Split dataset
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    print(f"Training set size: {train_size}")
    print(f"Validation set size: {val_size}")

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./sinhala-gpt",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=200,
        save_steps=500,
        save_total_limit=2,
        fp16=True,
        dataloader_num_workers=2,
        remove_unused_columns=True,
        report_to="none",
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Train
    print("Starting training...")
    try:
        trainer.train()
    except RuntimeError as e:
        print(f"Error during training: {e}")
        clear_gpu_memory()
        raise e

    # Save model
    print("Saving model...")
    model.save_pretrained("./sinhala-gpt-final")
    tokenizer.save_pretrained("./sinhala-gpt-final")

    clear_gpu_memory()

def test_model(instruction, input_text):
    clear_gpu_memory()

    model = AutoModelForCausalLM.from_pretrained(
        "./sinhala-gpt-final",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained("./sinhala-gpt-final")

    prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = inputs.to('cuda')

    with autocast():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=150,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    del model, tokenizer, inputs, outputs
    clear_gpu_memory()

    return response

In [None]:
DATASET_PATH = '/content/drive/MyDrive/your_dataset.json'

In [8]:
DATASET_PATH = '/content/alpaca-sinhala.json'

In [9]:
clear_gpu_memory()
try:
    train_model(DATASET_PATH)
    print("Training completed successfully!")
except Exception as e:
    print(f"An error occurred: {e}")
    clear_gpu_memory()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading dataset...
Loaded 49741 examples from dataset
Creating dataset...
Training set size: 44766
Validation set size: 4975




Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


An error occurred: Attempting to unscale FP16 gradients.


In [10]:


# Modified training configuration - paste this in a new cell
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
import gc

def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()

def train_model(dataset_path):
    # Initialize tokenizer and model
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token='<|endoftext|>')

    # Modified model loading with 32-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Changed to float32
        low_cpu_mem_usage=True
    )

    # Load dataset
    print("Loading dataset...")
    texts = load_dataset(dataset_path)

    # Create dataset with reduced max_length
    print("Creating dataset...")
    dataset = SinhalaDataset(texts, tokenizer, max_length=128)  # Reduced max_length

    # Split dataset
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    print(f"Training set size: {train_size}")
    print(f"Validation set size: {val_size}")

    # Modified training arguments
    training_args = TrainingArguments(
        output_dir="./sinhala-gpt",
        num_train_epochs=3,
        per_device_train_batch_size=1,  # Reduced batch size
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=32,  # Increased gradient accumulation
        warmup_ratio=0.1,  # Changed to ratio instead of steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_steps=500,
        save_steps=1000,
        save_total_limit=2,
        fp16=False,  # Disabled fp16 training
        dataloader_num_workers=2,
        remove_unused_columns=True,
        report_to="none",
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        gradient_checkpointing=True,  # Enabled gradient checkpointing
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    # Train
    print("Starting training...")
    try:
        trainer.train()

        # Save model
        print("Saving model...")
        model.save_pretrained("./sinhala-gpt-final")
        tokenizer.save_pretrained("./sinhala-gpt-final")

    except Exception as e:
        print(f"Error during training: {e}")
        raise e
    finally:
        clear_gpu_memory()

In [11]:
# Run training
clear_gpu_memory()
try:
    train_model(DATASET_PATH)
    print("Training completed successfully!")
except Exception as e:
    print(f"An error occurred: {e}")
    clear_gpu_memory()


Loading dataset...
Loaded 49741 examples from dataset
Creating dataset...
Training set size: 44766
Validation set size: 4975
Starting training...




Step,Training Loss,Validation Loss
500,0.8479,0.747182
1000,0.6717,0.604113
1500,0.6005,0.547798
2000,0.5563,0.511994
2500,0.536,0.494826
3000,0.5187,0.480474
3500,0.5113,0.474898
4000,0.5097,0.469527


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving model...
Training completed successfully!


In [12]:
# Fifth cell - Test the model
test_cases = [
    {
        "instruction": "අක්ෂර වින්‍යාසය සහ ව්‍යාකරණ වැරදි සඳහා මෙම වාක්‍යය ඇගයීමට ලක් කරන්න",
        "input": "ඔහු තම ආහාර වේල සකසා ආපනශාලාවෙන් පිටව ගියේය"
    }
]

for test_case in test_cases:
    response = test_model(test_case["instruction"], test_case["input"])
    print("\nTest Case:")
    print("Instruction:", test_case["instruction"])
    print("Input:", test_case["input"])
    print("Generated Response:", response)


  with autocast():
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


ValueError: Input length of input_ids is 256, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [14]:
def generate_attention_mask(input_text):
    """Generate an attention mask based on input length."""
    input_ids = tokenizer.encode(input_text, return_tensors="pt").squeeze()
    return [1] * len(input_ids)


SyntaxError: incomplete input (<ipython-input-20-40a394ab8346>, line 3)

In [18]:
def test_model(instruction, input_text):
    clear_gpu_memory()

    model = AutoModelForCausalLM.from_pretrained(
        "./sinhala-gpt-final",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained("./sinhala-gpt-final")

    prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = inputs.to('cuda')

    try:
        with autocast("cuda"):
            outputs = model.generate(
                inputs["input_ids"],
                max_length=150,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during model generation: {e}")
        response = None
    finally:
        # Cleanup only if outputs exist
        if 'outputs' in locals():
            del outputs
        if 'model' in locals():
            del model
        if 'tokenizer' in locals():
            del tokenizer
        if 'inputs' in locals():
            del inputs

        clear_gpu_memory()

    return response


In [21]:
from torch.amp import autocast

def test_model(instruction, input_text):
    clear_gpu_memory()

    model = AutoModelForCausalLM.from_pretrained(
        "./sinhala-gpt-final",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained("./sinhala-gpt-final")

    prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = inputs.to('cuda')

    try:
        # Use autocast without string arguments
        with autocast():
            outputs = model.generate(
                inputs["input_ids"],
                max_length=150,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
            )

        # Decode the generated text
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during model generation: {e}")
        response = None
    finally:
        # Cleanup only if outputs exist
        if 'outputs' in locals():
            del outputs
        if 'model' in locals():
            del model
        if 'tokenizer' in locals():
            del tokenizer
        if 'inputs' in locals():
            del inputs

        clear_gpu_memory()

    return response


In [22]:
test_cases = [
    {
        "instruction": "අක්ෂර වින්‍යාසය සහ ව්‍යාකරණ වැරදි සඳහා මෙම වාක්‍යය ඇගයීමට ලක් කරන්න",
        "input": "ඔහු තම ආහාර වේල සකසා ආපනශාලාවෙන් පිටව ගියේය"
    }
]

for test_case in test_cases:
    response = test_model(test_case["instruction"], test_case["input"])
    print("\nTest Case:")
    print("Instruction:", test_case["instruction"])
    print("Input:", test_case["input"])
    print("Generated Response:", response)


Error during model generation: autocast.__init__() missing 1 required positional argument: 'device_type'

Test Case:
Instruction: අක්ෂර වින්‍යාසය සහ ව්‍යාකරණ වැරදි සඳහා මෙම වාක්‍යය ඇගයීමට ලක් කරන්න
Input: ඔහු තම ආහාර වේල සකසා ආපනශාලාවෙන් පිටව ගියේය
Generated Response: None


In [23]:
!cp -r ./sinhala-gpt-final /content/drive/MyDrive/