In [2]:
from pathlib import Path

import pandas as pd

# Later cells (the existence check and the SentencePiece trainer) read the
# corpus from exactly this location, so write it there directly instead of
# into the working directory, and make sure the folder exists first.
corpus_path = Path("/content/tamil_texts/tamil.txt")
corpus_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv("/content/tamil_tanglish.csv")

# Keep the non-null lyrics as strings and remove every "[", "]" and "'"
# character from them.
tamil_texts = (
    df["Tamillyrics"]
    .dropna()
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
)

# One lyric entry per line, UTF-8 encoded.
with open(corpus_path, "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in tamil_texts)

print("Texts successfully saved in tamil.txt ✅")


Texts successfully saved in tamil.txt ✅


In [3]:
!pip install sentencepiece



In [4]:
from fastai.text import *
from pathlib import *
from functools import partial
from tqdm import tqdm
import re
import string
import shutil
import hashlib

In [5]:
from fastai.text import *

In [6]:
!mkdir tamil_texts

In [7]:
# Collect every entry found directly inside /content/tamil_texts and show it
base_dir = Path("/content")
flist = [entry for entry in (base_dir / "tamil_texts").glob("*")]
print(flist)

[PosixPath('/content/tamil_texts/tamil.txt')]


In [8]:
# Marker symbols; the comma-joined form is what gets passed to the trainer
# as --user_defined_symbols.
custom_symbols = [
    "xxfld",
    "xxmaj",
    "xxup",
    "xxrep",
    "xxwrep",
]
str_specialcases = ",".join(custom_symbols)
print(str_specialcases)


xxfld,xxmaj,xxup,xxrep,xxwrep


In [None]:
#spm.SentencePieceTrainer.Train(f'--input={flist} --model_prefix=taen_spm --vocab_size=8000 --input_sentence_size=22500000 --unk_id=0 --bos_id=1 --eos_id=2 --pad_id=3 --unk_piece={text.transform.UNK} --bos_piece={text.transform.BOS} --eos_piece={text.transform.EOS} --pad_piece={text.transform.PAD} --user_defined_symbols={str_specialcases}')


In [9]:
# Corpus file handed to the SentencePiece trainer through its --input flag
file_path = "/content/tamil_texts/tamil.txt"

In [10]:
from pathlib import Path

# Point flist at the single corpus file and report whether it is present
flist = Path("/content/tamil_texts/tamil.txt")

if flist.exists():
    print(f"File found: {flist}")
else:
    print(f"Error: File not found at {flist}")


File found: /content/tamil_texts/tamil.txt


In [11]:
import sentencepiece as spm

In [14]:
# Piece strings assigned to the four reserved ids (unk=0, bos=1, eos=2, pad=3)
special_tokens = dict(UNK="<unk>", BOS="<s>", EOS="</s>", PAD="<pad>")

# Train the SentencePiece model. The flags are listed one per entry and then
# joined with single spaces into the one argument string Train() receives.
spm.SentencePieceTrainer.Train(" ".join([
    f"--input={file_path}",
    "--model_prefix=taen_spm",
    "--vocab_size=15191",
    "--input_sentence_size=22500000",
    "--unk_id=0",
    "--bos_id=1",
    "--eos_id=2",
    "--pad_id=3",
    f"--unk_piece={special_tokens['UNK']}",
    f"--bos_piece={special_tokens['BOS']}",
    f"--eos_piece={special_tokens['EOS']}",
    f"--pad_piece={special_tokens['PAD']}",
    f"--user_defined_symbols={str_specialcases}",
]))


In [15]:
# Vocabulary size; same value as the --vocab_size flag given to the trainer
vs = 15191

In [16]:
# Create an empty SentencePiece processor; a model is loaded into it next
sp = spm.SentencePieceProcessor()


In [17]:
# Load the model saved under the trainer's model_prefix (taen_spm)
sp.Load("taen_spm.model")

True

In [18]:
# Quick check: segment a short lower-cased English phrase into pieces
sp.EncodeAsPieces('how are you'.lower())


['▁ho', 'w', '▁', 'are', '▁you']

In [19]:
# Quick check: segment a longer English sentence after lower-casing it
sp.EncodeAsPieces('I am not satisfied with you'.lower())


['▁i', '▁am', '▁not', '▁sa', 't', 'is', 'fie', 'd', '▁wi', 'th', '▁you']

In [20]:
# Piece string for every id from 0 up to (not including) vs, in id order
itos = [sp.IdToPiece(piece_id) for piece_id in range(vs)]


In [21]:
# Display the whole id-to-piece list (long output)
itos

['<unk>',
 '<s>',
 '</s>',
 '<pad>',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 ',',
 '!',
 '▁,',
 '?',
 '...',
 '▁நீ',
 '▁',
 '▁என்',
 '்',
 '▁நான்',
 '▁உன்',
 'ும்',
 '....',
 ':',
 'ே',
 'ா',
 'ம்',
 '▁வா',
 '▁-',
 'ு',
 '▁ஒரு',
 'க்',
 '?...',
 '▁காதல்',
 'ப்',
 '▁என்ன',
 '...,',
 '?....',
 'ை',
 'ாய்',
 'ன்',
 'ில்',
 '▁ஹே',
 '▁ராஜா',
 'ோ',
 'த்',
 'ல்',
 'ச்',
 '▁ஏன்',
 ')',
 '▁எந்தன்',
 '▁உந்தன்',
 '▁(',
 '▁மீனா',
 '▁விஜய்',
 '▁தான்',
 'ின்',
 '▁இந்த',
 '▁இது',
 '▁ப',
 'கள்',
 'தான்',
 '▁அது',
 '▁க',
 '▁எல்லாம்',
 '▁போல',
 'து',
 '▁அந்த',
 '▁:',
 '▁ஓ',
 '▁பவித்ரா',
 '▁இல்லை',
 '▁கொஞ்சம்',
 '▁நீங்க',
 'ல',
 'டா',
 '▁என்னை',
 '▁அவன்',
 '▁ஆட',
 '▁உன்னை',
 '▁போ',
 '▁இல்ல',
 '▁சந்தோஷ்',
 '▁நானும்',
 '▁என',
 '▁போலே',
 '▁டா',
 '.',
 '▁ம',
 '▁வந்து',
 '▁நீயும்',
 'ன்னு',
 '▁என்று',
 'ேன்',
 'யே',
 '▁ஓர்',
 '▁எனை',
 'ி',
 'ெல்லாம்',
 '▁போதும்',
 '.....',
 'ங்க',
 '▁கூட',
 '!...',
 '▁சித்தப்பா',
 '▁வானம்',
 '▁யார்',
 'வா',
 '▁உனை',
 'u',
 '▁உங்க',
 '▁த',
 'க்கு',
 '▁உள்ளே',
 'ங்கள

In [22]:
# Show the pieces that are at most one character long
[piece for piece in itos if len(piece) <= 1]


[',',
 '!',
 '?',
 '▁',
 '்',
 ':',
 'ே',
 'ா',
 'ு',
 'ை',
 'ோ',
 ')',
 'ல',
 '.',
 'ி',
 'u',
 'y',
 ';',
 't',
 '’',
 'ன',
 'க',
 's',
 '\\',
 'ற',
 'ஆ',
 'e',
 'உ',
 'ய',
 'த',
 '-',
 'அ',
 'ட',
 'வ',
 'i',
 'ர',
 'm',
 'f',
 'o',
 'a',
 'n',
 'd',
 'l',
 'k',
 'ீ',
 'ம',
 'g',
 'ப',
 'L',
 'G',
 'h',
 'ெ',
 'z',
 'ூ',
 'J',
 'O',
 'w',
 'p',
 'ஏ',
 'A',
 'ச',
 'j',
 'N',
 'ā',
 'ஓ',
 'ள',
 'ண',
 'r',
 '(',
 'T',
 'W',
 '0',
 'ஹ',
 'ந',
 'M',
 'P',
 'B',
 'v',
 'ஸ',
 'ஷ',
 'ழ',
 '1',
 'b',
 '2',
 'I',
 'D',
 'Y',
 'ொ',
 'S',
 'ஐ',
 'C',
 'c',
 'ௌ',
 '8',
 'ஞ',
 'ஃ',
 'ங',
 'ஜ',
 '–',
 'H',
 'ஈ',
 'ஊ',
 'ஒ',
 'இ',
 'எ']

In [23]:
# Enable the autoreload extension in mode 2 (reload modules before running code)
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook
%matplotlib inline

In [24]:
import fastai, torch
# Show the installed fastai and torch versions
fastai.__version__ , torch.__version__

('2.7.18', '2.5.1+cu124')

In [25]:
# Print the current working directory
!pwd

/content


In [26]:
from fastai.text import *
from typing import List

In [27]:
# Define preprocessing functions
def lower_case_everything(t: str) -> str:
    """Return ``t`` converted to lower case."""
    lowered = t.lower()
    return lowered

def replace_all_caps(tokens: List[str]) -> List[str]:
    """Mark fully upper-case tokens with the ``xxup`` prefix.

    A token for which ``str.isupper()`` is true becomes the single string
    ``'xxup <token in lower case>'``; every other token is kept unchanged.
    The result has one entry per input token, in the same order.
    """
    marked = []
    for tok in tokens:
        if tok.isupper():
            marked.append(f'xxup {tok.lower()}')
        else:
            marked.append(tok)
    return marked

def deal_caps(tokens: List[str]) -> List[str]:
    """Mark title-cased tokens with the ``xxmaj`` prefix.

    A token for which ``str.istitle()`` is true becomes the single string
    ``'xxmaj <token>'`` (the token itself is not lower-cased here); every
    other token is kept unchanged. Order and length are preserved.
    """
    marked = []
    for tok in tokens:
        if tok.istitle():
            marked.append(f'xxmaj {tok}')
        else:
            marked.append(tok)
    return marked

def handle_all_caps(t: str) -> str:
    """Apply ``replace_all_caps`` to the whitespace-separated words of ``t``.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace in the input are collapsed.
    """
    return ' '.join(replace_all_caps(t.split()))

def handle_upper_case_first_letter(t: str) -> str:
    """Apply ``deal_caps`` to the whitespace-separated words of ``t``.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace in the input are collapsed.
    """
    return ' '.join(deal_caps(t.split()))

In [28]:
# Fresh processor loaded from the absolute model path, then the id-to-piece
# list rebuilt for ids 0 .. vs-1.
sp = spm.SentencePieceProcessor()
sp.Load("/content/taen_spm.model")
itos = [sp.IdToPiece(piece_id) for piece_id in range(vs)]

In [29]:
# Number of pieces collected; equals vs because itos is built from range(vs)
len(itos)

15191

In [30]:
# Peek at the first 20 pieces
itos[:20]

['<unk>',
 '<s>',
 '</s>',
 '<pad>',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 ',',
 '!',
 '▁,',
 '?',
 '...',
 '▁நீ',
 '▁',
 '▁என்',
 '்',
 '▁நான்',
 '▁உன்']

In [31]:
from fastai.text.all import Numericalize


In [32]:
from fastai.text.all import BaseTokenizer


In [33]:
class CodeMixedTamilTokenizer(BaseTokenizer):
    """Tokenizer that splits text into SentencePiece pieces.

    Args:
        lang: Language tag; stored on the instance and not otherwise used by
            this class.
        model_path: Location of the SentencePiece ``.model`` file to load.
            Defaults to ``/content/taen_spm.model``.
    """

    def __init__(self, lang: str, model_path: str = "/content/taen_spm.model"):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(model_path))

    def __call__(self, items):  # Ensure FastAI recognizes it as a callable tokenizer
        """Return one list of pieces for each text in ``items``."""
        return [self.sp.EncodeAsPieces(t) for t in items]

    def tokenizer(self, items):
        """Same as calling the instance; kept for callers that use ``.tokenizer``."""
        # Delegate so the encoding logic lives in exactly one place.
        return self(items)

In [34]:
# Load the trained model into a new processor and rebuild the id-to-piece
# list that is used as the vocabulary in the next cell.
sp = spm.SentencePieceProcessor()
sp.Load("/content/taen_spm.model")
itos = [sp.IdToPiece(piece_id) for piece_id in range(vs)]

In [35]:
# Numericalize transform that uses the SentencePiece piece list as its vocab
taen_vocab = Numericalize(vocab=itos)

In [36]:
# Order matters: the rules are applied left to right. Lower-casing has to come
# last; if it ran first, isupper()/istitle() could never match and the
# xxup / xxmaj markers would never be inserted.
custom_pre_rules = [handle_all_caps, handle_upper_case_first_letter, lower_case_everything]

In [37]:
from fastai.text.all import Tokenizer


In [38]:
# Wrap the SentencePiece-based tokenizer in fastai's Tokenizer, using the
# custom text rules instead of the defaults.
tokenizer = Tokenizer(
    CodeMixedTamilTokenizer(lang="taen"),
    rules=custom_pre_rules,
)


In [39]:
# Display the Tokenizer transform's repr
tokenizer

Tokenizer:
encodes: (Path,object) -> encodes
(str,object) -> encodes
decodes: (object,object) -> decodes

In [41]:
# Run one Tamil-script and one Latin-script sample through the tokenizer and
# print the resulting pieces, one sample per line.
token1, token2 = (
    tokenizer(sample)
    for sample in (
        'அவன் தான் வேலைக்கு எழுதி போட்டிருக்கானில்ல.',
        'bro saptingala',
    )
)
print(token1, token2, sep="\n")

['▁அவன்', '▁தான்', '▁வேலைக்கு', '▁எழுதி', '▁போட்டிருக்க', 'ானில்ல', '.']
['▁bro', '▁sa', 'pt', 'ing', 'ala']


In [42]:
# Same Tamil sentence through the raw SentencePiece processor, for comparison
sp.EncodeAsPieces("அவன் தான் வேலைக்கு எழுதி போட்டிருக்கானில்ல.")

['▁அவன்', '▁தான்', '▁வேலைக்கு', '▁எழுதி', '▁போட்டிருக்க', 'ானில்ல', '.']