# Step 1: Install Necessary Libraries

You might want to use libraries like nltk (Natural Language Toolkit) for handling tokenization. You can install it using:

In [None]:
pip install nltk

# Step 2: Write a Basic Tokenizer for Sinhala/Tamil
A simple tokenizer splits text into words based on spaces or punctuation. Since Sinhala and Tamil have specific punctuation rules, this basic version will handle standard punctuation and spaces.

Here’s a basic Python tokenizer:

In [11]:
import re

# Sample Sinhala input: a question followed by a statement, so the tokenizer
# has to deal with both spaces and punctuation ('?' and '.').
sinhala_text = "සංජුල ඔබට කෙසේද? මම නම් කාර්යබහුලයි."

# Sample Tamil input with the same shape (a question followed by a statement).
tamil_text = "நீங்கள் எப்படி இருக்கிறீர்கள்? நான் பிஸியாக இருக்கிறேன்."

# Basic word-based tokenizer function
def basic_tokenizer(text):
    r"""Return all ``\b\w+\b`` matches found in ``text``, in order of appearance.

    Args:
        text: The string to tokenize.

    Returns:
        A list of matched substrings; punctuation and whitespace are dropped.

    Note:
        Python's ``\w`` does not match combining marks such as vowel signs,
        so words written in Sinhala or Tamil are cut at every such mark.
        This version is kept as-is to demonstrate that limitation.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(text)]

# Tokenize both samples with the same function, then show each result.
sinhala_tokens, tamil_tokens = map(basic_tokenizer, (sinhala_text, tamil_text))

print("Sinhala Tokens:", sinhala_tokens)
print("Tamil Tokens:", tamil_tokens)


Sinhala Tokens: ['ස', 'ජ', 'ල', 'ඔබට', 'ක', 'ස', 'ද', 'මම', 'නම', 'ක', 'ර', 'යබහ', 'ලය']
Tamil Tokens: ['ந', 'ங', 'கள', 'எப', 'பட', 'இர', 'க', 'க', 'ற', 'ர', 'கள', 'ந', 'ன', 'ப', 'ஸ', 'ய', 'க', 'இர', 'க', 'க', 'ற', 'ன']


# Step 3: Explanation of Tokenization Process

- **Regex:** The regular expression `\b\w+\b` matches runs of word characters delimited by word boundaries, so the text is split into words and punctuation symbols are removed.
- **Output:** You’ll see the input sentence broken down into individual tokens.


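**Note:** The issue arises because the regular expression `\b\w+\b` is tailored for Latin-based languages and doesn't handle complex scripts like Sinhala or Tamil correctly. In Sinhala and Tamil, consonants combine with vowel signs and other combining marks to form syllables, but Python's `\w` does not match those combining marks, so every word is cut at each mark (as the output above shows). The next cell fixes this by matching whole runs of characters from the Sinhala (U+0D80–U+0DFF) and Tamil (U+0B80–U+0BFF) Unicode blocks.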

In [10]:
import re

# Sample Sinhala input (same sentence pair as in the earlier cell), redefined
# here so this cell can be run on its own.
sinhala_text = "සංජුල ඔබට කෙසේද? මම නම් කාර්යබහුලයි."

# Sample Tamil input (same sentence pair as in the earlier cell).
tamil_text = "நீங்கள் எப்படி இருக்கிறீர்கள்? நான் பிஸியாக இருக்கிறேன்."

# Basic word-based tokenizer function for Sinhala and Tamil
def basic_tokenizer(text):
    r"""Split ``text`` into word tokens, keeping Sinhala and Tamil words intact.

    A token is one of the following, tried in this order:

    1. a run of characters from the Sinhala block (U+0D80-U+0DFF),
    2. a run of characters from the Tamil block (U+0B80-U+0BFF),
    3. a run of ``\w`` characters (fallback for every other script).

    Inside a Sinhala or Tamil run, zero-width joiners / non-joiners
    (U+200D / U+200C) are accepted *between* script characters. Sinhala
    conjunct forms are written with U+200D; without this allowance such a
    word would be cut in two at the joiner.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance. Punctuation and
        whitespace are dropped; an empty input yields an empty list.
    """
    tokens = re.findall(
        # Only non-capturing groups are used so findall returns whole matches.
        r'[\u0D80-\u0DFF]+(?:[\u200C\u200D]+[\u0D80-\u0DFF]+)*'
        r'|[\u0B80-\u0BFF]+(?:[\u200C\u200D]+[\u0B80-\u0BFF]+)*'
        r'|\w+',
        text,
    )
    return tokens

# Tokenize both samples with the script-aware tokenizer, then show each result.
sinhala_tokens, tamil_tokens = map(basic_tokenizer, (sinhala_text, tamil_text))

print("Sinhala Tokens:", sinhala_tokens)
print("Tamil Tokens:", tamil_tokens)


Sinhala Tokens: ['සංජුල', 'ඔබට', 'කෙසේද', 'මම', 'නම්', 'කාර්යබහුලයි']
Tamil Tokens: ['நீங்கள்', 'எப்படி', 'இருக்கிறீர்கள்', 'நான்', 'பிஸியாக', 'இருக்கிறேன்']


# Step 4: Convert Tokens to Numerical Representation

You can assign each token a unique index number. This is the simplest numerical representation, where each token corresponds to a unique integer.

Here’s how you can implement this in Python:


In [14]:
def build_vocabulary(token_list):
    """Map each distinct token to a contiguous integer id.

    Ids are assigned in order of first appearance, starting at 0. A token
    that occurs more than once keeps the id of its first occurrence, so the
    id space has no gaps.

    Args:
        token_list: An iterable of hashable tokens (typically strings).

    Returns:
        A dict mapping token -> id. Empty input yields an empty dict.
    """
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return {token: idx for idx, token in enumerate(dict.fromkeys(token_list))}


# Token list from the previous step
tokens = ["සංජුල","ඔබට", "කෙසේද", "මම", "නම්", "කාර්යබහුලයි"]

# Create a dictionary to assign an index to each distinct token
token_to_id = build_vocabulary(tokens)

# Convert tokens to their numerical representation
numerical_representation = [token_to_id[token] for token in tokens]

print("Token to Numerical Representation:", numerical_representation)


Token to Numerical Representation: [0, 1, 2, 3, 4, 5]


# Step 5: Binary Representation of Tokens
    
To convert these numerical values into binary (the base-2 form in which a computer ultimately stores numbers), we can use Python's built-in `bin()` function, which returns each integer as a binary string prefixed with `0b`.

In [15]:
# Render every token id as a binary string, e.g. 5 -> '0b101'.
binary_representation = list(map(bin, numerical_representation))

print("Token to Binary Representation:", binary_representation)


Token to Binary Representation: ['0b0', '0b1', '0b10', '0b11', '0b100', '0b101']


# Step 6: Word Embedding Model (Word2Vec)

For a more advanced approach, you can use Word2Vec, which converts each word (token) into a vector of real numbers (an embedding). This captures the semantic meaning of words. To implement Word2Vec, you can use the gensim library:

In [None]:
pip install gensim

In [None]:
from gensim.models import Word2Vec

# Sample Sinhala/Tamil sentences tokenized
sentences = [["සංජුල","ඔබට", "කෙසේද"], ["මම", "නම්", "කාර්යබහුලයි"]]

# Train the Word2Vec model.
# min_count=1 keeps tokens that occur only once; every token in this tiny
# corpus appears exactly once, so a higher threshold would drop them all and
# the lookups below would fail.
model = Word2Vec(sentences, vector_size=10, window=5, min_count=1, workers=4)

# Get vector representation of every token, in corpus order.
# One loop replaces a hand-written lookup/print pair per token, so the cell
# keeps working when `sentences` changes.
for sentence in sentences:
    for token in sentence:
        word_vector = model.wv[token]
        print(f"Vector representation for '{token}':", word_vector, "\n")