In [5]:
from datasets import load_dataset

# Fetch the English-Tamil ("en-ta") configuration of OPUS-100
dataset = load_dataset("Helsinki-NLP/opus-100", "en-ta")
train_split = dataset["train"]

# Peek at the first record: a dict whose 'translation' entry maps
# 'en' and 'ta' to the two sides of one sentence pair
print(train_split[0])

# Dump only the Tamil side, one record per line, into a .ta file
with open("tamil_sentences.ta", "w", encoding="utf-8") as f:
    for ex in train_split:
        tamil_text = ex["translation"]["ta"]
        f.write(tamil_text + "\n")

{'translation': {'en': 'The likeness of those who disbelieve in their Lord: their works are like ashes, in a fierce wind, on a stormy day. They have no control over anything they have earned. That is the utmost misguidance.', 'ta': 'எவர்கள் தங்களுடைய இறைவனை நிராகரிக்கிறார்களோ, அவர்களுக்கு உதாரணமாவது அவர்களுடைய செயல்கள் சாம்பல் போன்றவை புயல் காற்று கடினமாக வீசம் நாளில் அச்சாம்பலைக் காற்று அடித்துக் கொண்டு போய்விட்டது. (அவ்வாறே) தாங்கள் சம்பாதித்த பொருள்களில் எதன் மீதும் அவர்களுக்கு அதிகாரம் இராது இதுவே வெகு தூரமான வழிகேடாகும்.'}}


In [11]:
import re

# Character-class fragment covering code points U+0B80 through U+0BFF
TAMIL_UNICODE_RANGE = r'\u0B80-\u0BFF'
# Character-class fragment for the punctuation kept by the cleaner:
# period, comma and the three bracket pairs
ALLOWED_SYMBOLS = r'\.,()\[\]{}'

# Matches any single character that is not in the Tamil range, not an
# allowed symbol and not whitespace, i.e. everything that gets stripped
remove_non_tamil = re.compile(
    fr'[^{TAMIL_UNICODE_RANGE}{ALLOWED_SYMBOLS}\s]'
)

# Matches an opening bracket, a (possibly empty) run of non-Tamil
# characters, and a closing bracket: brackets left with no Tamil inside
remove_bad_brackets = re.compile(
    fr'''
    [\(\[\{{]               # opening bracket
    [^{TAMIL_UNICODE_RANGE}]*  # anything but Tamil
    [\)\]\}}]               # closing bracket
    ''',
    flags=re.VERBOSE,
)

# Pull the whole file into memory first, because it is overwritten below
with open("tamil_sentences.ta", "r", encoding="utf-8") as infile:
    lines = infile.readlines()

cleaned_lines = []
for line in lines:
    # Step 1: strip every character that is neither Tamil, an allowed
    # symbol nor whitespace
    without_foreign = remove_non_tamil.sub('', line)

    # Step 2: drop brackets that ended up with no Tamil inside, such as
    # (), (,) or {}, then trim surrounding whitespace
    cleaned_line = remove_bad_brackets.sub('', without_foreign).strip()

    # Lines that became empty are discarded
    if not cleaned_line:
        continue
    cleaned_lines.append(cleaned_line)

# Replace the original file with the cleaned lines, one per line
with open("tamil_sentences.ta", "w", encoding="utf-8") as outfile:
    for line in cleaned_lines:
        print(line, file=outfile)


In [23]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Kaggle dataset slug, written as "owner/dataset-name"
dataset = "vijayabhaskar96/tamil-news-classification-dataset-tamilmurasu"

# Target folder for the download: the current working directory
cwd = os.getcwd()

# Create the Kaggle client and authenticate it before requesting files
api = KaggleApi()
api.authenticate()

# Download the dataset into cwd; unzip=True requests extraction of the archive
api.dataset_download_files(dataset, path=cwd, unzip=True)


Dataset URL: https://www.kaggle.com/datasets/vijayabhaskar96/tamil-news-classification-dataset-tamilmurasu


In [24]:
import pandas as pd

# Load the Tamil Murasu news CSV
df = pd.read_csv("tamilmurasu_dataset.csv")

# Text-bearing columns to export, selected by name
columns_to_extract = ["news_category", "news_title", "news_article"]

# pandas reads empty cells as NaN, and formatting NaN into an f-string
# would write the literal text "nan" into the corpus. Blank them instead.
df_selected = df[columns_to_extract].fillna("")

# Write one line per row: "title | article | category"
with open("tamil_sentences-2.ta", "w", encoding="utf-8") as f:
    # itertuples(index=False, name=None) yields plain tuples whose fields
    # follow the column order of columns_to_extract
    for category, title, article in df_selected.itertuples(index=False, name=None):
        line = f"{title} | {article} | {category}"
        f.write(line + "\n")

In [25]:
from datasets import load_dataset

# Fetch the Tamil Kavithai dataset from the Hugging Face hub
ds = load_dataset("abishekmahi/tamil-kavithai")

# Write the "Content" field of every training example to a .ta file,
# skipping examples where that field is absent or None
with open("tamil_kavithai.ta", "w", encoding="utf-8") as f:
    for example in ds["train"]:
        content = example.get("Content")
        if content is None:
            continue
        f.write(content + "\n")

In [7]:
import os
import glob
import argparse
from tokenizers import Tokenizer
from tokenizers.trainers import WordPieceTrainer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Sequence, NFC, Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import decoders
from tokenizers.processors import TemplateProcessing

In [8]:
# Special tokens registered with the tokenizer, keyed by role.
# Insertion order matters because the values are passed to the trainer
# as a list.
special_token_dict = dict(
    unknown_token="[UNK]",
    pad_token="[PAD]",
    start_token="[BOS]",
    end_token="[EOS]",
)

In [26]:
def train_tokenizer(path_to_data_root, vocab_size=32000,
                    save_path="trained_tokenizer/tamil_wp.json"):
    """
    Train a WordPiece tokenizer on Tamil text files and save it as JSON.

    Every ``*.ta`` file found recursively under ``path_to_data_root`` is used
    as training data. The four entries of ``special_token_dict`` are
    registered as special tokens:

    - unknown_token: Used when the tokenizer encounters out-of-vocabulary words.
    - pad_token: Intended for padding Tamil sentences during batching.
    - start_token: Intended to be prepended so a decoder knows where to start.
    - end_token: Intended to be appended to signal where decoding should stop.

    Unicode normalization:
    Tamil letters can have alternate Unicode representations (such as
    combining forms). NFC normalization gives a consistent representation
    using composed forms.

    Args:
        path_to_data_root: Directory searched recursively for ``.ta`` files.
        vocab_size: Target vocabulary size handed to the WordPiece trainer.
        save_path: Where the tokenizer JSON is written. Its parent directory
            is created if it does not exist.

    Raises:
        FileNotFoundError: If no ``.ta`` file exists under ``path_to_data_root``.
    """

    # Locate the corpus first so an empty or mistyped data root fails fast
    # instead of training on nothing. Sorting makes the input order
    # independent of the filesystem's listing order.
    tamil_files = sorted(
        glob.glob(os.path.join(path_to_data_root, "**/*.ta"), recursive=True)
    )
    if not tamil_files:
        raise FileNotFoundError(
            f"No .ta files found under {path_to_data_root!r}; nothing to train on."
        )

    # Define the tokenizer with a WordPiece model and its unknown token
    tokenizer = Tokenizer(WordPiece(unk_token=special_token_dict["unknown_token"]))

    # Apply Unicode normalization and lowercasing (lowercasing is optional
    # for Tamil but kept for consistency)
    tokenizer.normalizer = Sequence([NFC(), Lowercase()])

    # Pre-tokenize with the library's Whitespace pre-tokenizer, which splits
    # on whitespace and also separates punctuation from words
    tokenizer.pre_tokenizer = Whitespace()

    # Configure the trainer with the vocab size and the special tokens
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=list(special_token_dict.values())
    )

    # Train the tokenizer on the Tamil files
    tokenizer.train(tamil_files, trainer)

    # Save the trained tokenizer, creating the output directory when the
    # path has one
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    tokenizer.save(save_path)

In [27]:
# Search root for training data: the current directory. train_tokenizer
# globs recursively for *.ta files beneath it.
path_to_data_root = "."
# Train the WordPiece tokenizer; the result is written to
# trained_tokenizer/tamil_wp.json
train_tokenizer(path_to_data_root)