<a href="https://colab.research.google.com/github/RanxduG/Translation_Transformer/blob/main/Colab_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Install required packages testing

In [1]:
!pip install torch torchvision torchaudio
!pip install transformers datasets
!pip install matplotlib seaborn
!pip install sentencepiece



##Importing libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import os
from tqdm import tqdm
from datasets import load_dataset, Dataset
import pandas as pd
import requests
import json
from tqdm import tqdm
import random
from transformer import Transformer, get_device

In [3]:
# Set device
device = get_device()
print(f"Using device: {device}")

Using device: cuda


##Initializing Datasets

In [4]:
def download_and_prepare_data():
    """
    Try multiple dataset sources and prepare English-Sinhala translation data
    """
    english_sentences = []
    sinhala_sentences = []

    print("Attempting to load English-Sinhala translation datasets...")

    # Method 1: Try OPUS-100 dataset (most reliable)
    print("\n1. Trying OPUS-100 dataset...")
    try:
        # Load OPUS-100 English-Sinhala
        dataset = load_dataset("Helsinki-NLP/opus-100", "en-si", split="train")

        # Extract sentences
        for example in dataset:
            if len(example['translation']['en'].strip()) > 0 and len(example['translation']['si'].strip()) > 0:
                english_sentences.append(example['translation']['en'].strip())
                sinhala_sentences.append(example['translation']['si'].strip())

                # Limit to reasonable size for training
                if len(english_sentences) >= 50000:
                    break

        print(f"✓ Successfully loaded {len(english_sentences)} sentence pairs from OPUS-100")

    except Exception as e:
        print(f"✗ OPUS-100 failed: {e}")

        # Method 2: Try Alternative OPUS-100 approach
        print("\n2. Trying alternative OPUS-100 loading method...")
        try:
            dataset = load_dataset("opus100", "en-si")

            for split in ['train', 'validation', 'test']:
                if split in dataset:
                    for example in dataset[split]:
                        if len(example['translation']['en'].strip()) > 0 and len(example['translation']['si'].strip()) > 0:
                            english_sentences.append(example['translation']['en'].strip())
                            sinhala_sentences.append(example['translation']['si'].strip())

                            if len(english_sentences) >= 50000:
                                break
                    if len(english_sentences) >= 50000:
                        break

            print(f"✓ Successfully loaded {len(english_sentences)} sentence pairs from OPUS-100 (alt method)")

        except Exception as e2:
            print(f"✗ Alternative OPUS-100 failed: {e2}")

            # Method 3: Try NLPC-UOM dataset with different approach
            print("\n3. Trying NLPC-UOM sentence alignment dataset...")
            try:
                # Try to load the dataset with specific configuration
                dataset = load_dataset("NLPC-UOM/sentence_alignment_dataset-Sinhala-Tamil-English",
                                     trust_remote_code=True)

                # Process the data
                for split_name in dataset.keys():
                    split_data = dataset[split_name]
                    for example in split_data:
                        # Check different possible field names
                        english_text = None
                        sinhala_text = None

                        # Try different field combinations
                        if 'english' in example and 'sinhala' in example:
                            english_text = example['english']
                            sinhala_text = example['sinhala']
                        elif 'en' in example and 'si' in example:
                            english_text = example['en']
                            sinhala_text = example['si']
                        elif 'source' in example and 'target' in example:
                            english_text = example['source']
                            sinhala_text = example['target']

                        if english_text and sinhala_text:
                            if len(english_text.strip()) > 0 and len(sinhala_text.strip()) > 0:
                                english_sentences.append(english_text.strip())
                                sinhala_sentences.append(sinhala_text.strip())

                                if len(english_sentences) >= 20000:
                                    break

                    if len(english_sentences) >= 20000:
                        break

                print(f"✓ Successfully loaded {len(english_sentences)} sentence pairs from NLPC-UOM")

            except Exception as e3:
                print(f"✗ NLPC-UOM failed: {e3}")

                # Method 4: Try downloading from a direct source
                print("\n4. Trying to download from Tatoeba Challenge dataset...")
                try:
                    # This is a backup method using publicly available data
                    # You can replace this URL with any publicly available English-Sinhala dataset

                    # For now, let's create a more comprehensive sample dataset
                    english_sentences, sinhala_sentences = create_comprehensive_sample_dataset()
                    print(f"✓ Using comprehensive sample dataset with {len(english_sentences)} sentence pairs")

                except Exception as e4:
                    print(f"✗ Direct download failed: {e4}")

                    # Method 5: Fallback to basic sample
                    print("\n5. Using basic sample dataset...")
                    english_sentences, sinhala_sentences = create_basic_sample_dataset()
                    print(f"✓ Using basic sample dataset with {len(english_sentences)} sentence pairs")

    return english_sentences, sinhala_sentences

def create_comprehensive_sample_dataset():
    """
    Create a more comprehensive sample dataset for training
    """
    english_sentences = [
        # Basic greetings and common phrases
        "Hello", "Hi", "Good morning", "Good afternoon", "Good evening", "Good night",
        "How are you?", "I am fine", "Thank you", "You are welcome", "Please", "Excuse me",
        "Sorry", "Goodbye", "See you later", "Have a nice day", "Nice to meet you",

        # Personal information
        "What is your name?", "My name is John", "How old are you?", "I am 25 years old",
        "Where are you from?", "I am from Sri Lanka", "I am from America", "I live in Colombo",
        "What do you do?", "I am a student", "I am a teacher", "I work in an office",

        # Family and relationships
        "This is my family", "I have a brother", "I have a sister", "My father is a doctor",
        "My mother is a teacher", "I love my family", "We are friends", "He is my friend",
        "She is my colleague", "They are my parents",

        # Daily activities
        "I wake up at 6 AM", "I go to work", "I come home", "I eat breakfast",
        "I eat lunch", "I eat dinner", "I go to sleep", "I watch TV", "I read books",
        "I listen to music", "I play games", "I exercise", "I go shopping",

        # Food and drinks
        "I am hungry", "I am thirsty", "I want to eat", "I want to drink water",
        "The food is delicious", "I like rice", "I like curry", "I drink tea",
        "I drink coffee", "This is spicy", "This is sweet", "I am full",

        # Weather and time
        "What time is it?", "It is 3 o'clock", "Today is Monday", "Tomorrow is Tuesday",
        "It is sunny", "It is raining", "It is hot", "It is cold", "The weather is nice",
        "I like this weather", "What day is today?", "What is the date?",

        # Places and directions
        "Where is the hospital?", "Where is the school?", "Where is the market?",
        "Go straight", "Turn left", "Turn right", "It is near here", "It is far from here",
        "I want to go to the beach", "I want to go to the mountains", "I live in the city",

        # Shopping and money
        "How much does it cost?", "It is expensive", "It is cheap", "I want to buy this",
        "Do you have change?", "I need money", "Where is the bank?", "I want to pay",
        "Can I have a discount?", "This is too expensive", "I will take it",

        # Transportation
        "I go by bus", "I go by car", "I go by train", "I walk", "I ride a bicycle",
        "Where is the bus stop?", "When does the bus come?", "I missed the bus",
        "Call a taxi", "I need to go to the airport", "How long does it take?",

        # Education and work
        "I go to school", "I study hard", "I have an exam", "I passed the exam",
        "I failed the exam", "I graduated", "I work hard", "I am busy", "I am free",
        "I have a meeting", "I finished my work", "I am tired",

        # Health and feelings
        "I am sick", "I feel better", "I have a headache", "I have a fever",
        "I need to see a doctor", "I am happy", "I am sad", "I am angry",
        "I am excited", "I am worried", "I am surprised", "I am confused",

        # Technology and communication
        "I have a phone", "I use a computer", "I send an email", "I watch videos",
        "I use the internet", "My phone is broken", "I need to charge my phone",
        "Can you call me?", "I will call you later", "Send me a message",

        # Objects and descriptions
        "The book is on the table", "The car is red", "The house is big",
        "The room is small", "The food is hot", "The water is cold",
        "She is beautiful", "He is tall", "It is new", "It is old",
        "This is good", "This is bad", "It is clean", "It is dirty",

        # Actions and verbs
        "I can swim", "I can sing", "I can dance", "I can cook", "I can drive",
        "I cannot speak Sinhala well", "I am learning", "I understand", "I don't understand",
        "Can you help me?", "I will help you", "Let's go", "Wait for me", "Hurry up",

        # Numbers and counting
        "I have one book", "I have two sisters", "I have three friends",
        "I need four chairs", "I bought five apples", "There are six people",
        "I have seven days", "I work eight hours", "I sleep nine hours", "I have ten fingers"
    ]

    sinhala_sentences = [
        # Basic greetings and common phrases
        "ආයුබෝවන්", "හායි", "සුභ උදෑසනක්", "සුභ මධ්‍යාහ්නයක්", "සුභ සන්ධ්‍යාවක්", "සුභ රාත්‍රියක්",
        "ඔබ කෙසේද?", "මම හොඳයි", "ස්තූතියි", "ඔබට ස්තූතියි", "කරුණාකර", "සමාවන්න",
        "සමාවන්න", "ආයුබෝවන්", "ඉදිරියෙදී හමුවෙමු", "හොඳ දවසක් ගත කරන්න", "ඔබව හමුවීමට සතුටුයි",

        # Personal information
        "ඔබගේ නම මොකක්ද?", "මගේ නම ජෝන්", "ඔබට වයස කීයද?", "මට වයස 25යි",
        "ඔබ කොහෙන්ද?", "මම ශ්‍රී ලංකාවෙන්", "මම ඇමරිකාවෙන්", "මම කොළඹ ජීවත් වෙනවා",
        "ඔබ මොනවා කරනවාද?", "මම ශිෂ්‍යයෙක්", "මම ගුරුවරයෙක්", "මම කාර්යාලයක වැඩ කරනවා",

        # Family and relationships
        "මේ මගේ පවුල", "මට සහෝදරයෙක් ඉන්නවා", "මට සහෝදරියක් ඉන්නවා", "මගේ තාත්තා වෛද්‍යවරයෙක්",
        "මගේ අම්මා ගුරුවරියක්", "මම මගේ පවුලට ආදරෙයි", "අපි යාළුවෝයි", "ඔහු මගේ යාළුවා",
        "ඇය මගේ සහකාරිය", "ඔවුන් මගේ දෙමාපියන්",

        # Daily activities
        "මම උදේ 6ට අවදි වෙනවා", "මම වැඩට යනවා", "මම ගෙදර එනවා", "මම උදෑසන කෑම කනවා",
        "මම දිවා කෑම කනවා", "මම රාත්‍රී කෑම කනවා", "මම නිදාගන්නවා", "මම ටීවී බලනවා", "මම පොත් කියවනවා",
        "මම සංගීතය අහනවා", "මම ක්‍රීඩා කරනවා", "මම ව්‍යායාම කරනවා", "මම සාප්පු යනවා",

        # Food and drinks
        "මට කිඩෙන් හිතෙනවා", "මට තිරගැන්මක් හිතෙනවා", "මට කන්න ඕන", "මට වතුර බොන්න ඕන",
        "කෑම රසයි", "මට බත් කමතියි", "මට කරි කමතියි", "මම තේ බොනවා",
        "මම කෝපි බොනවා", "මේක ඇඹුල්", "මේක මිහිරි", "මම සම්පූර්ණයි",

        # Weather and time
        "වේලාව කීයද?", "වේලාව 3යි", "අද සඳුදා", "හෙට අඟහරුවාදා",
        "හිරු එළියෙන් ඉන්නවා", "වැස්ස එනවා", "රස්නෙයි", "සීතලයි", "කාලගුණය හොඳයි",
        "මට මේ කාලගුණය කමතියි", "අද කුමන දවසද?", "අද දිනය මොකක්ද?",

        # Places and directions
        "රෝහල කොහෙද?", "පාසල කොහෙද?", "පොළ කොහෙද?",
        "කෙලින් යන්න", "වමට හරවන්න", "දකුණට හරවන්න", "එය මෙතන ලඟයි", "එය මෙතන ඈතයි",
        "මට මුහුදු තීරයට යන්න ඕන", "මට කන්දට යන්න ඕන", "මම නගරයේ ජීවත් වෙනවා",

        # Shopping and money
        "මේක කීයද?", "එය මිල අධිකයි", "එය මිල අඩුයි", "මට මේක ගන්න ඕන",
        "ඔබ ළඟ සුදුවක් තියෙනවාද?", "මට සල්ලි ඕන", "බැංකුව කොහෙද?", "මට ගෙවන්න ඕන",
        "මට වට්ටමක් ලබා ගන්න පුළුවන්ද?", "මේක හරිම මිල අධිකයි", "මම මේක ගන්නම්",

        # Transportation
        "මම බස් එකේ යනවා", "මම කාර් එකේ යනවා", "මම කෝච්චියේ යනවා", "මම පයින් යනවා", "මම පාපැදියේ යනවා",
        "බස් නැවතුම කොහෙද?", "බස් එක කවදාද එන්නෙ?", "මම බස් එක මග හැරුණා",
        "කැබ් එකක් කෝල් කරන්න", "මට ගුවන්තොටුපළට යන්න ඕන", "කාලය කීයක් ගන්නවාද?",

        # Education and work
        "මම පාසලට යනවා", "මම මහන්සියෙන් ඉගෙන ගන්නවා", "මට විභාගයක් තියෙනවා", "මම විභාගයෙන් පාස් උණා",
        "මම විභාගයෙන් ෆේල් උණා", "මම උපාධිය ගත්තා", "මම මහන්සියෙන් වැඩ කරනවා", "මම කාර්යබහුලයි", "මම නිදහස්",
        "මට රැස්වීමක් තියෙනවා", "මම මගේ වැඩ ඉවර කළා", "මට කඩාවැටෙනවා",

        # Health and feelings
        "මම අසනීපයි", "මට සනීප හිතෙනවා", "මට හිස රිදෙනවා", "මට උණ",
        "මට වෛද්‍යවරයෙක් දකින්න ඕන", "මට සතුටුයි", "මට දුකයි", "මට කේන්තියි",
        "මම උද්‍යෝගිමත්", "මම කනස්සල්ලෙන්", "මම පුදුම හිතෙන්", "මම ව්‍යාකූල",

        # Technology and communication
        "මට ෆෝන් එකක් තියෙනවා", "මම පරිගණකයක් පාවිච්චි කරනවා", "මම ඊමේල් එකක් එවනවා", "මම වීඩියෝ බලනවා",
        "මම අන්තර්ජාලය පාවිච්චි කරනවා", "මගේ ෆෝන් එක කැඩුණා", "මට මගේ ෆෝන් එක චාජ් කරන්න ඕන",
        "ඔබට මට කෝල් කරන්න පුළුවන්ද?", "මම ඔබට පස්සේ කෝල් කරන්නම්", "මට පණිවිඩයක් එවන්න",

        # Objects and descriptions
        "පොත මේසය උඩින් තියෙනවා", "කාර් එක රතුයි", "ගෙදර විශාලයි",
        "කාමරය කුඩායි", "කෑම උණුසුම්", "වතුර සීතලයි",
        "ඇය ලස්සනයි", "ඔහු උසයි", "එය අලුත්", "එය පරණයි",
        "මේක හොඳයි", "මේක නරකයි", "එය පිරිසිදුයි", "එය අපිරිසිදුයි",

        # Actions and verbs
        "මට පිහිනනවා පුළුවන්", "මට කියන්න පුළුවන්", "මට නටන්න පුළුවන්", "මට උයන්න පුළුවන්", "මට ධාවනය කරන්න පුළුවන්",
        "මට සිංහල හොඳට කතා කරන්න බෑ", "මම ඉගෙන ගන්නම්", "මට තේරෙනවා", "මට තේරෙන්නේ නෑ",
        "ඔබට මට උදව් කරන්න පුළුවන්ද?", "මම ඔබට උදව් කරන්නම්", "අපි යමු", "මට ඉන්න දෙන්න", "ඉක්මන් කරන්න",

        # Numbers and counting
        "මට පොත්තක් තියෙනවා", "මට සහෝදරියන් දෙන්නෙක් ඉන්නවා", "මට යාළුවෝ තුන්දෙනෙක් ඉන්නවා",
        "මට පුටු හතරක් ඕන", "මම ඇපල් පහක් ගත්තා", "මනුෂ්‍යයෝ හය්දෙනෙක් ඉන්නවා",
        "මට දින හතක් තියෙනවා", "මම පැය අටක් වැඩ කරනවා", "මම පැය නවයක් නිදාගන්නවා", "මට ඇඟිලි දහයක් තියෙනවා"
    ]

    return english_sentences, sinhala_sentences

def create_basic_sample_dataset():
    """
    Create a basic sample dataset as final fallback
    """
    english_sentences = [
        "Hello", "How are you?", "Good morning", "Good evening", "Thank you",
        "You are welcome", "I am fine", "What is your name?", "Nice to meet you",
        "How old are you?", "Where are you from?", "I love you", "Goodbye",
        "See you later", "Have a nice day", "I am hungry", "I am tired",
        "What time is it?", "It is raining", "The weather is nice"
    ]

    sinhala_sentences = [
        "ආයුබෝවන්", "ඔබ කෙසේද?", "සුභ උදෑසනක්", "සුභ සන්ධ්‍යාවක්", "ස්තූතියි",
        "ඔබට ස්තූතියි", "මම හොඳයි", "ඔබගේ නම මොකක්ද?", "ඔබව හමුවීමට සතුටුයි",
        "ඔබට වයස කීයද?", "ඔබ කොහෙන්ද?", "මම ඔබට ආදරෙයි", "ආයුබෝවන්",
        "ඉදිරියෙදී හමුවෙමු", "හොඳ දවසක් ගත කරන්න", "මට කිඩෙන් හිතෙනවා", "මට කඩාවැටෙනවා",
        "වේලාව කීයද?", "වැස්ස එනවා", "කාලගුණය හොඳයි"
    ]

    return english_sentences, sinhala_sentences

def validate_and_clean_data(english_sentences, sinhala_sentences):
    """
    Validate and clean the dataset
    """
    print(f"\nValidating dataset with {len(english_sentences)} sentence pairs...")

    # Remove empty pairs and pairs that are too short or too long
    cleaned_english = []
    cleaned_sinhala = []

    for en, si in zip(english_sentences, sinhala_sentences):
        en = en.strip()
        si = si.strip()

        # Skip if either sentence is empty
        if not en or not si:
            continue

        # Skip if sentences are too short (less than 2 characters)
        if len(en) < 2 or len(si) < 2:
            continue

        # Skip if sentences are too long (more than 200 characters)
        if len(en) > 200 or len(si) > 200:
            continue

        cleaned_english.append(en)
        cleaned_sinhala.append(si)

    print(f"After cleaning: {len(cleaned_english)} valid sentence pairs")
    return cleaned_english, cleaned_sinhala

def save_dataset(english_sentences, sinhala_sentences, filename="english_sinhala_dataset"):
    """
    Save the dataset to files for later use
    """
    # Create a pandas DataFrame
    df = pd.DataFrame({
        'english': english_sentences,
        'sinhala': sinhala_sentences
    })

    # Save as CSV
    df.to_csv(f"{filename}.csv", index=False, encoding='utf-8')

    # Save as JSON
    data = {
        'english': english_sentences,
        'sinhala': sinhala_sentences,
        'count': len(english_sentences)
    }

    with open(f"{filename}.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Dataset saved as {filename}.csv and {filename}.json")


# Download and prepare the dataset
english_sentences, sinhala_sentences = download_and_prepare_data()

# Validate and clean the data
english_sentences, sinhala_sentences = validate_and_clean_data(english_sentences, sinhala_sentences)

# If we have a small dataset, duplicate it with some variations to make it larger
if len(english_sentences) < 1000:
    print(f"\nDataset is small ({len(english_sentences)} pairs). Expanding with variations...")

    # Create variations by adding punctuation, changing case, etc.
    expanded_english = english_sentences[:]
    expanded_sinhala = sinhala_sentences[:]

    # Add variations
    for en, si in zip(english_sentences, sinhala_sentences):
        # Add punctuation variations
        if not en.endswith('.') and not en.endswith('?') and not en.endswith('!'):
            expanded_english.append(en + '.')
            expanded_sinhala.append(si + '.')

        # Add variations with different cases
        if en.islower():
            expanded_english.append(en.capitalize())
            expanded_sinhala.append(si)

    english_sentences = expanded_english
    sinhala_sentences = expanded_sinhala

    print(f"Expanded dataset to {len(english_sentences)} sentence pairs")

# Save the dataset
save_dataset(english_sentences, sinhala_sentences)

# Display sample data
print("\nSample data from the final dataset:")
sample_size = min(10, len(english_sentences))
for i in range(sample_size):
    print(f"English: {english_sentences[i]}")
    print(f"Sinhala: {sinhala_sentences[i]}")
    print("---")

print(f"\nFinal dataset ready with {len(english_sentences)} sentence pairs!")
print("You can now use this data to train your transformer model.")

# Return the data for use in your transformer training
print("\nTo use this data in your transformer training, you can access:")
print("- english_sentences: List of English sentences")
print("- sinhala_sentences: List of corresponding Sinhala sentences")

Attempting to load English-Sinhala translation datasets...

1. Trying OPUS-100 dataset...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


✗ OPUS-100 failed: Invalid pattern: '**' can only be an entire path component

2. Trying alternative OPUS-100 loading method...
✗ Alternative OPUS-100 failed: Invalid pattern: '**' can only be an entire path component

3. Trying NLPC-UOM sentence alignment dataset...
✗ NLPC-UOM failed: Invalid pattern: '**' can only be an entire path component

4. Trying to download from Tatoeba Challenge dataset...
✓ Using comprehensive sample dataset with 181 sentence pairs

Validating dataset with 181 sentence pairs...
After cleaning: 181 valid sentence pairs

Dataset is small (181 pairs). Expanding with variations...
Expanded dataset to 342 sentence pairs
Dataset saved as english_sinhala_dataset.csv and english_sinhala_dataset.json

Sample data from the final dataset:
English: Hello
Sinhala: ආයුබෝවන්
---
English: Hi
Sinhala: හායි
---
English: Good morning
Sinhala: සුභ උදෑසනක්
---
English: Good afternoon
Sinhala: සුභ මධ්‍යාහ්නයක්
---
English: Good evening
Sinhala: සුභ සන්ධ්‍යාවක්
---
English: Good n

In [5]:
# Fixed Dataset class
class TranslationDataset(Dataset):
    def __init__(self, english_sentences, sinhala_sentences):
        # Ensure inputs are lists
        if not isinstance(english_sentences, list):
            english_sentences = list(english_sentences)
        if not isinstance(sinhala_sentences, list):
            sinhala_sentences = list(sinhala_sentences)

        self.english_sentences = english_sentences
        self.sinhala_sentences = sinhala_sentences

        # Validate that both lists have the same length
        assert len(self.english_sentences) == len(self.sinhala_sentences), \
            f"Mismatch in sentence counts: {len(self.english_sentences)} vs {len(self.sinhala_sentences)}"

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        # Ensure idx is an integer
        if isinstance(idx, (list, tuple)):
            idx = idx[0]

        # Validate index
        if idx >= len(self.english_sentences) or idx < 0:
            raise IndexError(f"Index {idx} out of range for dataset of size {len(self.english_sentences)}")

        return self.english_sentences[idx], self.sinhala_sentences[idx]

def create_vocabulary(sentences, min_freq=1):
    # Character-level tokenization with cleaning
    char_counter = Counter()

    for sentence in sentences:
        # Clean the sentence and count characters
        cleaned_sentence = sentence.strip()
        for char in cleaned_sentence:
            char_counter[char] += 1

    # Create vocabulary with special tokens first
    vocab = ['<PAD>', '<START>', '<END>', '<UNK>']

    # Add special characters that might appear in special tokens
    special_chars = ['<', '>']
    for char in special_chars:
        if char not in vocab:
            vocab.append(char)

    # Add characters sorted by frequency (most common first)
    for char, freq in char_counter.most_common():
        if freq >= min_freq and char not in vocab:
            vocab.append(char)

    # Create index mappings
    char_to_index = {char: i for i, char in enumerate(vocab)}
    index_to_char = {i: char for i, char in enumerate(vocab)}

    return char_to_index, index_to_char, vocab


# Create vocabularies
print("Creating vocabularies...")
english_to_index, index_to_english, english_vocab = create_vocabulary(english_sentences)
sinhala_to_index, index_to_sinhala, sinhala_vocab = create_vocabulary(sinhala_sentences)

print(f"English vocab size: {len(english_vocab)}")
print(f"Sinhala vocab size: {len(sinhala_vocab)}")
print(f"Sample English vocab: {english_vocab[:20]}")
print(f"Sample Sinhala vocab: {sinhala_vocab[:20]}")

Creating vocabularies...
English vocab size: 54
Sinhala vocab size: 66
Sample English vocab: ['<PAD>', '<START>', '<END>', '<UNK>', '<', '>', ' ', 'e', 'a', 'o', 'i', 't', 's', 'I', 'r', 'n', 'h', '.', 'd', 'm']
Sample Sinhala vocab: ['<PAD>', '<START>', '<END>', '<UNK>', '<', '>', ' ', 'ම', 'න', '්', 'ය', 'ව', 'ක', 'ි', 'ා', '.', 'ු', 'ර', 'ට', 'ත']


##Initializing Model

In [6]:
# Define special tokens
START_TOKEN = '<START>'
END_TOKEN = '<END>'
PADDING_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'

# Model hyperparameters (optimized for small dataset)
d_model = 128
ffn_hidden = 256
num_heads = 4
drop_prob = 0.1
num_layers = 3
max_sequence_length = 64
learning_rate = 0.001
batch_size = 8
num_epochs = 100

model = Transformer(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
    max_sequence_length=max_sequence_length,
    si_vocab_size=len(sinhala_vocab),
    english_to_index=english_to_index,
    sinhala_to_index=sinhala_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN
).to(device)

# Print model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total model parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Create train/validation split
train_size = int(0.8 * len(english_sentences))
val_size = len(english_sentences) - train_size

# Shuffle data before splitting
combined = list(zip(english_sentences, sinhala_sentences))
random.shuffle(combined)
english_sentences, sinhala_sentences = zip(*combined)

train_english = list(english_sentences[:train_size])
train_sinhala = list(sinhala_sentences[:train_size])
val_english = list(english_sentences[train_size:])
val_sinhala = list(sinhala_sentences[train_size:])

print(f"Train samples: {len(train_english)}")
print(f"Validation samples: {len(val_english)}")

# Create datasets and dataloaders
train_dataset = TranslationDataset(train_english, train_sinhala)
val_dataset = TranslationDataset(val_english, val_sinhala)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=sinhala_to_index[PADDING_TOKEN])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

Total model parameters: 1,017,666
Trainable parameters: 1,017,666
Train samples: 273
Validation samples: 69




In [7]:
# Helper function to create attention masks
def create_masks(src_len, tgt_len, device):
    # Create decoder self-attention mask (causal mask)
    decoder_mask = torch.triu(torch.ones(tgt_len, tgt_len), diagonal=1).bool()
    decoder_mask = decoder_mask.unsqueeze(0).unsqueeze(0).to(device)
    return None, decoder_mask, None

# Improved training function with better error handling
def train_epoch_improved(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    num_batches = 0

    # Use tqdm for progress bar
    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    for batch_idx, batch in enumerate(progress_bar):
        english_batch = batch["english"]
        sinhala_batch = batch["sinhala"]

        try:
            # Clear gradients
            optimizer.zero_grad()

            # Prepare input and target sequences
            target_input = [START_TOKEN + sentence for sentence in sinhala_batch]
            target_output = [sentence + END_TOKEN for sentence in sinhala_batch]

            # Forward pass
            outputs = model(
                english_batch,
                target_input,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False
            )

            # Convert target sequences to indices
            target_indices = []
            for sentence in target_output:
                indices = []
                for char in sentence:
                    if char in sinhala_to_index:
                        indices.append(sinhala_to_index[char])
                    else:
                        indices.append(sinhala_to_index[UNK_TOKEN])

                # Pad or truncate to max_sequence_length
                if len(indices) > max_sequence_length:
                    indices = indices[:max_sequence_length]
                else:
                    while len(indices) < max_sequence_length:
                        indices.append(sinhala_to_index[PADDING_TOKEN])
                target_indices.append(indices)

            # Convert to tensor
            target_tensor = torch.tensor(target_indices, dtype=torch.long).to(device)

            # Calculate loss
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), target_tensor.reshape(-1))

            # Backward pass
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Update parameters
            optimizer.step()

            # Update statistics
            total_loss += loss.item()
            num_batches += 1

            # Update progress bar
            progress_bar.set_postfix({'Loss': f'{loss.item():.4f}'})

        except Exception as e:
            print(f"Error in batch {batch_idx}: {e}")
            print(f"English batch: {english_batch}")
            print(f"Sinhala batch: {sinhala_batch}")
            continue

    return total_loss / num_batches if num_batches > 0 else 0

# Fixed validation function
def validate_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    num_batches = 0

    with torch.no_grad():
        for english_batch, sinhala_batch in dataloader:
            try:
                target_input = [START_TOKEN + sentence for sentence in sinhala_batch]
                target_output = [sentence + END_TOKEN for sentence in sinhala_batch]

                outputs = model(
                    english_batch,
                    target_input,
                    encoder_self_attention_mask=None,
                    decoder_self_attention_mask=None,
                    decoder_cross_attention_mask=None,
                    enc_start_token=False,
                    enc_end_token=False,
                    dec_start_token=False,
                    dec_end_token=False
                )

                target_indices = []
                for sentence in target_output:
                    indices = []
                    for char in sentence:
                        if char in sinhala_to_index:
                            indices.append(sinhala_to_index[char])
                        else:
                            indices.append(sinhala_to_index[UNK_TOKEN])

                    if len(indices) > max_sequence_length:
                        indices = indices[:max_sequence_length]
                    else:
                        while len(indices) < max_sequence_length:
                            indices.append(sinhala_to_index[PADDING_TOKEN])
                    target_indices.append(indices)

                target_tensor = torch.tensor(target_indices, dtype=torch.long).to(device)
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), target_tensor.reshape(-1))

                total_loss += loss.item()
                num_batches += 1

            except Exception as e:
                print(f"Error in validation batch: {e}")
                continue

    return total_loss / num_batches if num_batches > 0 else float('inf')

In [8]:
# Create datasets with proper error handling
def create_datasets(english_sentences, sinhala_sentences, train_split=0.8):
    """
    Create train and validation datasets with proper error handling
    """
    print(f"Creating datasets from {len(english_sentences)} sentence pairs...")

    # Validate inputs
    if not english_sentences or not sinhala_sentences:
        raise ValueError("Empty sentence lists provided")

    if len(english_sentences) != len(sinhala_sentences):
        raise ValueError(f"Sentence count mismatch: {len(english_sentences)} vs {len(sinhala_sentences)}")

    # Create indices for train/validation split
    num_samples = len(english_sentences)
    indices = list(range(num_samples))

    # Shuffle indices
    import random
    random.shuffle(indices)

    # Split data
    train_size = int(train_split * num_samples)
    train_indices = indices[:train_size]
    val_indices = indices[train_size:]

    # Create train dataset
    train_english = [english_sentences[i] for i in train_indices]
    train_sinhala = [sinhala_sentences[i] for i in train_indices]

    # Create validation dataset
    val_english = [english_sentences[i] for i in val_indices]
    val_sinhala = [sinhala_sentences[i] for i in val_indices]

    # Create dataset objects
    train_dataset = TranslationDataset(train_english, train_sinhala)
    val_dataset = TranslationDataset(val_english, val_sinhala)

    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")

    return train_dataset, val_dataset

##Training

In [9]:

# Fixed training loop
def train_model(model, train_dataset, val_dataset, num_epochs=100, batch_size=32, learning_rate=0.001):
    """
    Complete training function with proper error handling
    """
    # Create data loaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,  # Set to 0 for Colab compatibility
        pin_memory=True if torch.cuda.is_available() else False
    )

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,  # Set to 0 for Colab compatibility
        pin_memory=True if torch.cuda.is_available() else False
    )

    # Initialize optimizer and criterion
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss(ignore_index=sinhala_to_index[PADDING_TOKEN])

    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5, verbose=True
    )

    # Training tracking
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_counter = 0
    max_patience = 10

    print("Starting training...")

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        # Training
        train_loss = train_epoch_improved(model, train_dataloader, optimizer, criterion, device)
        train_losses.append(train_loss)

        # Validation
        val_loss = validate_epoch(model, val_dataloader, criterion, device)
        val_losses.append(val_loss)

        # Learning rate scheduling
        scheduler.step(val_loss)

        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0

            # Save model
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'val_loss': val_loss,
                'train_loss': train_loss,
                'english_to_index': english_to_index,
                'sinhala_to_index': sinhala_to_index,
                'index_to_english': index_to_english,
                'index_to_sinhala': index_to_sinhala,
                'english_vocab': english_vocab,
                'sinhala_vocab': sinhala_vocab
            }, 'best_transformer_model.pth')

            print(f"New best model saved! Val Loss: {val_loss:.4f}")
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= max_patience:
            print(f"Early stopping triggered after {patience_counter} epochs without improvement!")
            break

    print(f"Training completed! Best validation loss: {best_val_loss:.4f}")

    # Save vocabularies separately
    with open('vocabularies.pkl', 'wb') as f:
        pickle.dump({
            'english_to_index': english_to_index,
            'index_to_english': index_to_english,
            'english_vocab': english_vocab,
            'sinhala_to_index': sinhala_to_index,
            'index_to_sinhala': index_to_sinhala,
            'sinhala_vocab': sinhala_vocab
        }, f)

    return train_losses, val_losses

In [10]:
# Create datasets
train_dataset, val_dataset = create_datasets(english_sentences, sinhala_sentences)

# Train the model
train_losses, val_losses = train_model(
    model=model,  # Your transformer model
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_epochs=100,
    batch_size=16,  # Reduced for Colab memory constraints
    learning_rate=0.001
)

print("Training completed successfully!")

Creating datasets from 342 sentence pairs...
Train dataset: 273 samples
Validation dataset: 69 samples
Starting training...

Epoch 1/100




TypeError: tuple indices must be integers or slices, not str

In [20]:
# Start training
print("Starting training...")
train_losses = []
val_losses = []
best_val_loss = float('inf')

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # Training
    train_loss = train_epoch_improved(model, train_dataloader, optimizer, criterion, device)
    train_losses.append(train_loss)

    # Validation (similar improvements)
    model.eval()
    val_loss = 0
    val_batches = 0

    with torch.no_grad():
        for english_batch, sinhala_batch in val_dataloader:
            target_input = [START_TOKEN + sentence for sentence in sinhala_batch]
            target_output = [sentence + END_TOKEN for sentence in sinhala_batch]

            outputs = model(
                english_batch,
                target_input,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False
            )

            target_indices = []
            for sentence in target_output:
                indices = []
                for char in sentence:
                    if char in sinhala_to_index:
                        indices.append(sinhala_to_index[char])
                    else:
                        indices.append(sinhala_to_index[UNK_TOKEN])

                if len(indices) > max_sequence_length:
                    indices = indices[:max_sequence_length]
                else:
                    while len(indices) < max_sequence_length:
                        indices.append(sinhala_to_index[PADDING_TOKEN])
                target_indices.append(indices)

            target_tensor = torch.tensor(target_indices, dtype=torch.long).to(device)
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), target_tensor.reshape(-1))

            val_loss += loss.item()
            val_batches += 1

    val_loss = val_loss / val_batches
    val_losses.append(val_loss)

    # Learning rate scheduling
    scheduler.step(val_loss)

    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch,
            'val_loss': val_loss,
            'train_loss': train_loss,
            'english_to_index': english_to_index,
            'sinhala_to_index': sinhala_to_index,
            'index_to_english': index_to_english,
            'index_to_sinhala': index_to_sinhala,
            'english_vocab': english_vocab,
            'sinhala_vocab': sinhala_vocab
        }, 'best_transformer_model.pth')
        print(f"New best model saved! Val Loss: {val_loss:.4f}")

    # Early stopping
    if epoch > 20 and val_loss > best_val_loss * 1.1:
        print("Early stopping triggered!")
        break

print(f"Training completed! Best validation loss: {best_val_loss:.4f}")

# Save vocabularies separately
with open('vocabularies.pkl', 'wb') as f:
    pickle.dump({
        'english_to_index': english_to_index,
        'index_to_english': index_to_english,
        'english_vocab': english_vocab,
        'sinhala_to_index': sinhala_to_index,
        'index_to_sinhala': index_to_sinhala,
        'sinhala_vocab': sinhala_vocab
    }, f)

print("Training completed successfully!")

Starting training...

Epoch 1/100


Training:   0%|          | 0/35 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not list

In [13]:
# Fixed translation functions with proper bounds checking
def translate_greedy(model, english_sentence, english_to_index, sinhala_to_index, index_to_sinhala, max_length=128):
    model.eval()
    with torch.no_grad():
        # Start with START token
        translated = START_TOKEN

        for _ in range(max_length):
            # Forward pass
            outputs = model(
                [english_sentence],
                [translated],
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False
            )

            # Check if we've reached the output sequence limit
            if len(translated) >= outputs.shape[1]:
                print(f"Warning: Reached maximum output sequence length ({outputs.shape[1]})")
                break

            # Get the next character prediction
            next_char_logits = outputs[0, len(translated), :]
            next_char_id = torch.argmax(next_char_logits).item()
            next_char = index_to_sinhala[next_char_id]

            # Stop if END token is generated
            if next_char == END_TOKEN:
                break

            # Add to translated sentence
            translated += next_char

        # Remove START token
        translated = translated.replace(START_TOKEN, '')
        return translated

def translate_beam_search(model, english_sentence, english_to_index, sinhala_to_index, index_to_sinhala, beam_size=3, max_length=128):
    model.eval()

    with torch.no_grad():
        # Initialize beam with START token
        beams = [(START_TOKEN, 0.0)]  # (sequence, score)

        for _ in range(max_length):
            new_beams = []

            for sequence, score in beams:
                # Check if sequence ended
                if sequence.endswith(END_TOKEN):
                    new_beams.append((sequence, score))
                    continue

                # Forward pass
                outputs = model(
                    [english_sentence],
                    [sequence],
                    encoder_self_attention_mask=None,
                    decoder_self_attention_mask=None,
                    decoder_cross_attention_mask=None,
                    enc_start_token=False,
                    enc_end_token=False,
                    dec_start_token=False,
                    dec_end_token=False
                )

                # Check if we've reached the output sequence limit
                if len(sequence) >= outputs.shape[1]:
                    new_beams.append((sequence + END_TOKEN, score))
                    continue

                # Get probabilities for next character
                next_char_logits = outputs[0, len(sequence), :]
                next_char_probs = torch.softmax(next_char_logits, dim=-1)

                # Get top k candidates
                top_k_probs, top_k_indices = torch.topk(next_char_probs, beam_size)

                # Create new beam candidates
                for prob, idx in zip(top_k_probs, top_k_indices):
                    next_char = index_to_sinhala[idx.item()]
                    new_sequence = sequence + next_char
                    new_score = score + torch.log(prob).item()
                    new_beams.append((new_sequence, new_score))

            # Select top beam_size beams
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

            # Check if all beams ended
            if all(seq.endswith(END_TOKEN) for seq, _ in beams):
                break

        # Return best sequence
        best_sequence = beams[0][0]
        # Remove special tokens
        best_sequence = best_sequence.replace(START_TOKEN, '').replace(END_TOKEN, '')
        return best_sequence

# Alternative approach: Use a smaller max_length that matches your model's training
def translate_greedy_safe(model, english_sentence, english_to_index, sinhala_to_index, index_to_sinhala, max_length=60):
    """
    Safer version that uses a max_length smaller than the model's sequence limit
    """
    model.eval()
    with torch.no_grad():
        # Start with START token
        translated = START_TOKEN

        for step in range(max_length):
            # Forward pass
            outputs = model(
                [english_sentence],
                [translated],
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False,
                dec_end_token=False
            )

            # Get the next character prediction
            current_pos = len(translated)
            if current_pos >= outputs.shape[1]:
                print(f"Warning: Reached model's sequence limit at step {step}")
                break

            next_char_logits = outputs[0, current_pos, :]
            next_char_id = torch.argmax(next_char_logits).item()
            next_char = index_to_sinhala[next_char_id]

            # Stop if END token is generated
            if next_char == END_TOKEN:
                break

            # Add to translated sentence
            translated += next_char

        # Remove START token
        translated = translated.replace(START_TOKEN, '')
        return translated

# Load the best model
print("Loading best model...")
checkpoint = torch.load('best_transformer_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Loaded model from epoch {checkpoint['epoch']} with validation loss {checkpoint['val_loss']:.4f}")

Loading best model...
Loaded model from epoch 17 with validation loss 2.5662


##Testing

In [14]:
# Test translation function
print("\n" + "="*50)
print("TESTING TRANSLATION")
print("="*50)

# Test with training samples
test_indices = random.sample(range(len(val_english)), min(5, len(val_english)))

for idx in test_indices:
    english_text = val_english[idx]
    expected_sinhala = val_sinhala[idx]

    translation_greedy = translate_greedy(model, english_text, english_to_index, sinhala_to_index, index_to_sinhala)

    print(f"English: {english_text}")
    print(f"Expected: {expected_sinhala}")
    print(f"Generated: {translation_greedy}")
    print("-" * 50)


TESTING TRANSLATION
English: How much does it cost?
Expected: මේක කීයද?
Generated: ><UNK><UNK><UNK><UNK><UNK>>><UNK>><UNK><UNK>>><UNK>><UNK>
--------------------------------------------------
English: I am sad
Expected: මට දුකයි
Generated: <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
--------------------------------------------------
English: Thank you
Expected: ස්තූතියි
Generated: <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
--------------------------------------------------
English: They are students
Expected: ඔවුන් සිසුන්
Generated: <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>ිි
--------------------------------------------------
English: I like this food
Expected: මට මේ කෑම කමතියි
Generated: ><UNK><UNK><UNK><UNK><UNK>><UNK>><UNK><UNK>>>><UNK><UNK>ය
--------------------------------------------------


In [15]:
# Interactive translation
print("\n" + "="*50)
print("INTERACTIVE TRANSLATION")
print("="*50)
print("Enter English text to translate (type 'quit' to exit):")

while True:
    user_input = input("\nEnglish: ").strip()
    if user_input.lower() == 'quit':
        break

    if not user_input:
        continue

    try:
        # Try both translation methods
        translation_greedy = translate_greedy(model, user_input, english_to_index, sinhala_to_index, index_to_sinhala)
        print(f"Sinhala (Greedy): {translation_greedy}")

        # Uncomment the line below for beam search (slower but potentially better)
        # translation_beam = translate_beam_search(model, user_input, english_to_index, sinhala_to_index, index_to_sinhala)
        # print(f"Sinhala (Beam): {translation_beam}")

    except Exception as e:
        print(f"Error: {e}")
        print("This might happen if the input contains characters not in the vocabulary.")

print("Translation session ended!")


INTERACTIVE TRANSLATION
Enter English text to translate (type 'quit' to exit):

English: Hi
Sinhala (Greedy): <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>

English: quit
Translation session ended!


In [None]:
# Save final model info
print("\n" + "="*50)
print("TRAINING SUMMARY")
print("="*50)
print(f"Total epochs: {num_epochs}")
print(f"Best validation loss: {best_val_loss:.4f}")
print(f"Final training loss: {train_losses[-1]:.4f}")
print(f"Final validation loss: {val_losses[-1]:.4f}")
print(f"English vocabulary size: {len(english_vocab)}")
print(f"Sinhala vocabulary size: {len(sinhala_vocab)}")
print(f"Model parameters: {total_params:,}")
print("="*50)