Filter words in the transliteration dictionary based on the BERT vocabulary.

In [None]:
import ast
from transformers import AutoTokenizer
import json

# Load the tokenizer named by model_directory and copy its vocabulary keys
# into a set for fast membership checks.
model_directory = "Ransaka/sinhala-bert-medium-v2"
tokenizer = AutoTokenizer.from_pretrained(model_directory)
bert_vocab = set(tokenizer.vocab.keys())

# Input dictionary (absolute, machine-specific path) and output file
# (relative path, resolved against the current working directory).
input_file = "E:/4th Year/FYP/IMPLEMENTATION/data/dictionary.txt"
output_file = "filtered_sinhala_words.txt"

# Helper: does every token produced for the word belong to the vocabulary?
def is_word_in_vocab(word, vocab):
    """Return True when every token the global ``tokenizer`` yields for *word* is in *vocab*.

    A word that tokenizes to no tokens at all is reported as True.

    NOTE(review): depending on the tokenizer, an unknown word may be tokenized
    to an unknown-token placeholder that is itself a vocabulary entry, in which
    case this check would accept it - confirm the filter behaves as intended.
    """
    for piece in tokenizer.tokenize(word):
        if piece not in vocab:
            return False
    return True

# Parse each "key: [words]" line and keep only the words accepted by the
# vocabulary check; entries left with no words are dropped.
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line_number, raw_line in enumerate(fin, start=1):
        text = raw_line.strip()
        head, colon, tail = text.partition(':')
        if not colon:
            print(f"Skipping line {line_number} (no colon): {text}")
            continue
        try:
            candidates = ast.literal_eval(tail.strip())
            if isinstance(candidates, list):
                kept = [candidate for candidate in candidates if is_word_in_vocab(candidate, bert_vocab)]
                if kept:
                    # json.dumps with ensure_ascii=False keeps the Sinhala text readable
                    fout.write(f"{head.strip()}: {json.dumps(kept, ensure_ascii=False)}\n")
            else:
                print(f"Skipping line {line_number} (value is not a list): {text}")
        except Exception as e:
            # Best-effort pass: report the bad line and carry on with the rest.
            print(f"Error on line {line_number}: {text} -> {e}")


- Filter the output file using the Madura API to get "Filtered_Sinhala_Dictionary.txt".

- Because the Madura API calls failed for several words, those words were checked manually for spelling mistakes -> manually_checked_madura_filtered.txt


- Generated the modified_lemma.txt file, which lists each Sinhala word with its lemma, from verified_word_list_lemma_analysis.txt.

In [None]:
# Function to reformat the lemma analysis data
def reformat_lemma_analysis(input_file, output_file):
    """Flatten a lemma-analysis file into one ``word: lemma`` line per word.

    Each non-blank input line is expected to look like
    ``lemma: <anything> ['word1', 'word2', ...]``: the lemma is the text before
    the first colon, and the words are the comma-separated items between the
    first ``[`` after that colon and the following ``]``. Single quotes are
    removed from the words. Blank lines and empty word lists produce no output.

    Args:
        input_file: Path of the UTF-8 lemma-analysis file to read.
        output_file: Path of the UTF-8 file to write (overwritten).

    Raises:
        ValueError: If a non-blank line has no colon or no ``[`` after it.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line_number, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)

            lemma_part, colon, words_part = line.partition(':')
            _, open_bracket, after_bracket = words_part.partition('[')
            if not colon or not open_bracket:
                raise ValueError(
                    f"{input_file}, line {line_number}: "
                    f"expected 'lemma: [words]', got {line!r}"
                )

            lemma = lemma_part.strip()
            inner = after_bracket.split(']')[0].replace("'", "")

            # Split on bare commas and strip, so "a,b" and "a, b" both work;
            # skipping empty items keeps "[]" from emitting a ": lemma" row.
            for word in inner.split(','):
                word = word.strip()
                if word:
                    outfile.write(f"{word}: {lemma}\n")

# Source lemma-analysis file and the flattened "word: lemma" file to produce
# (absolute, machine-specific paths).
input_file = 'E:/4th Year/FYP/IMPLEMENTATION/WSD/verified_word_list_lemma_analysis.txt'
output_file = 'E:/4th Year/FYP/IMPLEMENTATION/WSD/modified_lemma.txt'

# Run the reformatting; output_file is overwritten.
reformat_lemma_analysis(input_file, output_file)

print(f"Reformatted data has been written to {output_file}")

- Generated the all_unique_words_in_verified_lemma file from the words (not the lemmas) in the verified_word_list_lemma_analysis.txt file.


In [None]:
# Function to extract unique Sinhala words
def extract_unique_words(input_file, output_file):
    """Collect every distinct word from a lemma-analysis file, one per line.

    Each non-blank input line is expected to look like
    ``lemma: <anything> ['word1', 'word2', ...]``. Only the bracketed words are
    collected; the lemma itself is deliberately not added. Single quotes are
    removed from the words, and empty items (e.g. from ``[]``) are ignored.

    Args:
        input_file: Path of the UTF-8 lemma-analysis file to read.
        output_file: Path of the UTF-8 file to write (overwritten); words are
            written in sorted order.

    Raises:
        ValueError: If a non-blank line has no colon or no ``[`` after it.
    """
    unique_words = set()

    with open(input_file, 'r', encoding='utf-8') as infile:
        for line_number, line in enumerate(infile, 1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)

            _, colon, words_part = line.partition(':')
            _, open_bracket, after_bracket = words_part.partition('[')
            if not colon or not open_bracket:
                raise ValueError(
                    f"{input_file}, line {line_number}: "
                    f"expected 'lemma: [words]', got {line!r}"
                )

            inner = after_bracket.split(']')[0].replace("'", "")
            for word in inner.split(','):
                word = word.strip()
                if word:  # never record the empty string as a word
                    unique_words.add(word)

    # Write the unique words in sorted (code point) order
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(f"{word}\n" for word in sorted(unique_words))

# Source lemma-analysis file and the unique-word list to produce
# (absolute, machine-specific paths).
input_file = 'E:/4th Year/FYP/IMPLEMENTATION/WSD/verified_word_list_lemma_analysis.txt'
output_file = 'E:/4th Year/FYP/IMPLEMENTATION/WSD/all_unique_words_in_verified_lemma.txt'

# Run the extraction; output_file is overwritten.
extract_unique_words(input_file, output_file)

print(f"Unique Sinhala words have been written to {output_file}")

- Filter the manually checked dictionary (derived from Filtered_Sinhala_Dictionary) against all_unique_words_in_verified_lemma.

- That output file was filtered again with the words in the all_unique_words_in_verified_lemma.txt file -> dictionary_all_unique_verified_filtered.txt


In [None]:
import ast

# Read the word list into a set for fast lookup
with open('E:/4th Year/FYP/FYP-TEST/all_unique_words_in_verified_lemma.txt', 'r', encoding='utf-8') as file:
    word_list = set(line.strip() for line in file)

# Read the dictionary file and keep only the candidates found in the word list
filtered_dictionary = {}
with open('E:/4th Year/FYP/FYP-TEST/manually_checked_madura_filtered.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines (e.g. a trailing newline)

        # Split on the first colon only, so a colon inside the list part
        # cannot break the unpacking.
        romanized_part, sinhala_part = line.split(':', 1)
        romanized_word = romanized_part.strip()
        # literal_eval parses the list literal without executing arbitrary
        # code from the file (this replaces a plain eval).
        sinhala_words = ast.literal_eval(sinhala_part.strip())

        # Filter the Sinhala words to keep only those present in the word list
        filtered_sinhala_words = [word for word in sinhala_words if word in word_list]

        # Keep the entry only when at least two candidates remain.
        # NOTE(review): entries with a single remaining candidate are dropped
        # here - confirm that this is intended.
        if len(filtered_sinhala_words) > 1:
            filtered_dictionary[romanized_word] = filtered_sinhala_words

# Write the filtered dictionary to a new file
with open('E:/4th Year/FYP/FYP-TEST/dictionary_all_unique_verified_filtered.txt', 'w', encoding='utf-8') as file:
    for romanized_word, sinhala_words in filtered_dictionary.items():
        file.write(f"{romanized_word}: {sinhala_words}\n")

# The message now names the file that is actually written above.
print("Filtered dictionary has been saved to 'dictionary_all_unique_verified_filtered.txt'.")

- Created the lemmatized_final_dictionary.txt file by grouping the entries of dictionary_all_unique_verified_filtered.txt using modified_lemma.txt.


- Create the lemmatized dictionary from the filtered dictionary (BERT vocabulary, dictionary check, word list) and the modified lemma file.

In [None]:
from collections import defaultdict
import ast

def load_dictionary(file_path):
    """Load the Romanized Sinhala to Sinhala dictionary from a text file.

    Each non-blank line must have the form ``key: <python literal>``; the
    literal (normally a list of strings and/or tuples) is parsed with
    ``ast.literal_eval``. Blank lines are skipped. If a key occurs more than
    once, the last occurrence wins.

    Args:
        file_path: Path of the UTF-8 dictionary file.

    Returns:
        dict mapping each key to its parsed value.

    Raises:
        ValueError: If a non-blank line has no ``': '`` separator, or if the
            value is not a valid literal.
        SyntaxError: If the value cannot be parsed as Python syntax.
    """
    dictionary = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            key, separator, value = line.partition(': ')
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: "
                    f"expected 'key: [values]', got {line!r}"
                )
            dictionary[key] = ast.literal_eval(value)  # Convert string list to actual list
    return dictionary

def load_lemmas(file_path):
    """Load the Sinhala words and their lemmas from a text file.

    Each non-blank line must have the form ``word: lemma``. The line is split
    on the first ``': '`` only, so any later ``': '`` stays part of the lemma.
    Blank lines are skipped. If a word occurs more than once, the last
    occurrence wins.

    Args:
        file_path: Path of the UTF-8 lemma file.

    Returns:
        dict mapping each word to its lemma.

    Raises:
        ValueError: If a non-blank line has no ``': '`` separator.
    """
    lemmas = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            word, separator, lemma = line.partition(': ')
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: "
                    f"expected 'word: lemma', got {line!r}"
                )
            lemmas[word] = lemma
    return lemmas

def group_by_lemma(dictionary, lemmas):
    """Bundle the words of each entry that share a lemma into tuples.

    For every key, the words are bucketed by ``lemmas.get(word, word)`` (a word
    with no known lemma acts as its own lemma). Buckets holding one word
    contribute that word as-is; buckets holding several words contribute a
    tuple of them. The resulting list has all plain words first, followed by
    all tuples, each in order of first appearance.

    Args:
        dictionary: Mapping of key -> iterable of words.
        lemmas: Mapping of word -> lemma.

    Returns:
        dict mapping each key to its regrouped list.
    """
    result = {}

    for romanized, candidates in dictionary.items():
        # A plain dict keeps insertion order, so buckets stay in the order
        # their lemma was first seen.
        buckets = {}
        for candidate in candidates:
            buckets.setdefault(lemmas.get(candidate, candidate), []).append(candidate)

        members = list(buckets.values())
        lone_words = [bucket[0] for bucket in members if len(bucket) == 1]
        bundles = [tuple(bucket) for bucket in members if len(bucket) > 1]

        result[romanized] = lone_words + bundles  # tuples always go last

    return result

def save_dictionary(file_path, dictionary):
    """Write *dictionary* to *file_path* as UTF-8, one ``key: value`` line per entry.

    Values are rendered with their default string form; the file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.writelines(
            "{}: {}\n".format(entry_key, entry_value)
            for entry_key, entry_value in dictionary.items()
        )

# File paths (absolute, machine-specific): the filtered dictionary, the
# "word: lemma" file, and the lemma-grouped dictionary to produce.
romanized_sinhala_dict_path = 'E:/4th Year/FYP/FYP-TEST/dictionary_all_unique_verified_filtered.txt'
lemma_file_path = 'E:/4th Year/FYP/FYP-TEST/modified_lemma.txt'
output_file_path = 'E:/4th Year/FYP/FYP-TEST/lemmatized_final_dictionary.txt'

# Load both inputs into memory
dictionary = load_dictionary(romanized_sinhala_dict_path)
lemmas = load_lemmas(lemma_file_path)

# Group the words of each entry that share a lemma into tuples
processed_dictionary = group_by_lemma(dictionary, lemmas)

# Save output (overwrites output_file_path)
save_dictionary(output_file_path, processed_dictionary)

print("Processed dictionary saved to", output_file_path)

- Single-element Sinhala lists were removed from lemmatized_final_dictionary.txt -> manually_checked_lemmatized_final_dictionary.txt (I don't remember why I used "manually" in the name).


Code to filter the dictionary by removing entries that have fewer than 2 Sinhala words.

In [None]:

import ast

# Define the input and output file names
input_file = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dic_verified_word_list2.txt"
output_file = 'E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/final_dictionary_2.txt'

# Copy to the output file only those lines whose Sinhala list passes the
# length check below.
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)

        # Split on the first colon only, so a colon inside the list part
        # cannot break the unpacking.
        romanized_word, sinhala_words = line.strip().split(':', 1)

        # literal_eval parses the list literal without executing arbitrary
        # code from the file (this replaces a plain eval).
        sinhala_words_list = ast.literal_eval(sinhala_words.strip())

        # Keep only entries with exactly 3 Sinhala words.
        # NOTE(review): the file names say "_2" and the surrounding notes talk
        # about removing entries with fewer than 2 words, yet the condition is
        # "== 3" - confirm which threshold is intended.
        if len(sinhala_words_list) == 3:
            # Write the original line unchanged
            outfile.write(line)

print(f"Filtered dictionary has been saved to {output_file}")

- Expand the tuple variations in manually_checked_lemmatized_final_dictionary.txt into one line per combination -> manually_checked_lemmatized_final_dictionary2.txt


In [None]:
import ast
import itertools


input_file = 'E:/4th Year/FYP/FYP-TEST/manually_checked_lemmatized_final_dictionary.txt'
output_file = 'E:/4th Year/FYP/FYP-TEST/manually_checked_lemmatized_final_dictionary2.txt'


# input_file = 'E:/4th Year/FYP/FYP-TEST/t1.txt'
# output_file = 'E:/4th Year/FYP/FYP-TEST/t2.txt'

# Expand every "key: [elements]" line into one line per combination: each tuple
# element contributes one of its members, each plain element contributes itself.
with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8') as outfile:

    for line in infile:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Split the line into key and the list part (first colon only)
        key_part, colon, list_part = line.partition(':')
        if not colon:
            # A line without a colon used to crash the whole run.
            print(f"Skipping line due to parsing error: {line}")
            continue
        key = key_part.strip()
        list_str = list_part.strip()

        try:
            # Parse the list using ast.literal_eval
            original_list = ast.literal_eval(list_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Only the errors literal_eval documents for malformed input are
            # caught; a bare except would also swallow KeyboardInterrupt.
            print(f"Skipping line due to parsing error: {line}")
            continue

        if not isinstance(original_list, (list, tuple)):
            # A scalar or string here would be iterated incorrectly or crash.
            print(f"Skipping line due to parsing error: {line}")
            continue

        # One option list per element: the members of a tuple, or the element itself
        options = [
            list(elem) if isinstance(elem, tuple) else [elem]
            for elem in original_list
        ]

        # Generate all possible combinations using the Cartesian product
        for combination in itertools.product(*options):
            outfile.write(f"{key}: {repr(list(combination))}\n")

- Created dictionary_2 and dictionary_3 by dividing manually_checked_lemmatized_final_dictionary2.txt into the entries that have 2 Sinhala words and the entries that have 3.


In [None]:
# # Define the input and output file names
# input_file = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dic_verified_word_list2.txt"
# output_file = 'E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/final_dictionary_2.txt'

# # Open the input file for reading and the output file for writing
# with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
#     # Iterate over each line in the input file
#     for line in infile:
#         # Split the line into the Romanized Sinhala word and the corresponding Sinhala words
#         romanized_word, sinhala_words = line.strip().split(':')
        
#         # Evaluate the Sinhala words as a Python list
#         sinhala_words_list = eval(sinhala_words.strip())
        
#         # Check if the Sinhala words list has at least 1 element
#         if len(sinhala_words_list) == 3:
#             # Write the line to the output file if it meets the condition
#             outfile.write(line)

# print(f"Filtered dictionary has been saved to {output_file}")

- calculating and sorting frequencies for each Romanized Word

In [None]:
# import ast

# # Read Sinhala word frequencies from word_list.si
# frequency_dict = {}
# with open('E:/4th Year/FYP/FYP-TEST/verified_word_list_200K.si', 'r', encoding='utf-8') as f:
#     for line in f:
#         parts = line.strip().split()
#         if len(parts) < 2:
#             continue
#         word = parts[0]
#         try:
#             freq = int(parts[1])
#             frequency_dict[word] = freq
#         except (IndexError, ValueError):
#             continue

# # Read Romanized-Sinhala dictionary and process each entry
# entries = []
# with open('E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dictionary_3.txt', 'r', encoding='utf-8') as f:
#     for line in f:
#         line = line.strip()
#         if not line or ':' not in line:
#             continue
#         roman_part, sinhala_part = line.split(':', 1)
#         roman = roman_part.strip()
#         sinhala_part = sinhala_part.strip()
#         try:
#             sinhala_list = ast.literal_eval(sinhala_part)
#             if not isinstance(sinhala_list, list):
#                 continue
#         except (SyntaxError, ValueError):
#             continue
#         # Calculate total frequency for this entry
#         total = sum(frequency_dict.get(word, 0) for word in sinhala_list)
#         entries.append((roman, sinhala_list, total))

# # Sort entries by descending total, then Roman word, then Sinhala list
# sorted_entries = sorted(entries, key=lambda x: (-x[2], x[0], str(x[1])))

# # Write the sorted entries to a file
# with open('E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/frequency_dict3.txt', 'w', encoding='utf-8') as f:
#     for roman, sinhala_list, freq in sorted_entries:
#         sinhala_str = str(sinhala_list)  # Converts list to string representation
#         f.write(f"{roman}: {sinhala_str}: {freq}\n")

- Remove words from the dictionary that are not present in the verified word lists.

In [None]:
import ast

# Load the frequency list from word_list.si into a set
def load_frequency_list(freq_file):
    """Return the set of words found in the first column of *freq_file*.

    Each line is split on whitespace and its first field is taken as the word;
    any further columns are ignored. Blank or whitespace-only lines are
    skipped (they previously raised IndexError).

    Args:
        freq_file: Path of the UTF-8 word/frequency file.

    Returns:
        set of words.
    """
    freq_words = set()
    with open(freq_file, 'r', encoding='utf-8') as f:
        for line in f:
            columns = line.split()
            if columns:  # nothing to take from an empty line
                freq_words.add(columns[0])
    return freq_words

# Load and filter the Romanized Sinhala dictionary
def filter_dictionary(dict_file, freq_words, output_file):
    """Drop from each dictionary entry the words not found in *freq_words*.

    Every non-blank line of *dict_file* must have the form
    ``key: <python list literal>``. Words absent from *freq_words* are removed,
    and entries left with no words are omitted. Entries are written in input
    order, one line each. Lines that repeat a key are all kept: the tuple
    expansion step writes one line per variant under the same key, and storing
    entries in a dict would silently keep only the last variant.

    Args:
        dict_file: Path of the UTF-8 dictionary file to read.
        freq_words: Container of accepted words (a set gives fast lookups).
        output_file: Path of the UTF-8 file to write (overwritten).

    Raises:
        ValueError: If a non-blank line has no colon, or its value is not a
            valid literal.
        SyntaxError: If a value cannot be parsed as Python syntax.
    """
    filtered_entries = []  # (key, kept words) pairs, in input order

    with open(dict_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # Skip empty lines

            key, value = line.split(':', 1)  # Split only on the first colon
            words = ast.literal_eval(value.strip())  # Convert string list to actual list

            # Filter words based on frequency list
            kept = [word for word in words if word in freq_words]

            if kept:
                filtered_entries.append((key.strip(), kept))

    # Save the filtered dictionary
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{key}: {values}\n" for key, values in filtered_entries)

    print(f"Filtered dictionary saved to {output_file}")

# File paths (absolute, machine-specific): the dictionary to filter, the
# verified word list whose first column supplies the accepted words, and the
# filtered dictionary to produce.
romanized_dict_file = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dictionary_3.txt"
frequency_list_file = "E:/4th Year/FYP/FYP-TEST/verified_word_lists/verified_word_list_3K.si"
output_file = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dic_verified_word_list3.txt"


# Load the accepted words, then filter the dictionary (output_file is overwritten)
freq_words = load_frequency_list(frequency_list_file)
filter_dictionary(romanized_dict_file, freq_words, output_file)


In [None]:
import ast

# Define the input and output file names
input_file = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dic_verified_word_list3.txt"
output_file = 'E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/final_dictionary_3.txt'

# Copy to the output file only those lines whose Sinhala list has more than
# one element.
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)

        # Split on the first colon only, so a colon inside the list part
        # cannot break the unpacking.
        romanized_word, sinhala_words = line.strip().split(':', 1)

        # literal_eval parses the list literal without executing arbitrary
        # code from the file (this replaces a plain eval).
        sinhala_words_list = ast.literal_eval(sinhala_words.strip())

        # Keep only entries that have at least 2 Sinhala words
        if len(sinhala_words_list) > 1:
            # Write the original line unchanged
            outfile.write(line)

print(f"Filtered dictionary has been saved to {output_file}")

- Code to build a dictionary that groups together the Singlish words that share the same Sinhala word list (ignoring order).

In [None]:
from collections import defaultdict

def normalize_word_list(word_list):
    """Return the words as a sorted tuple, so lists that differ only in order compare equal."""
    ordered = list(word_list)
    ordered.sort()
    return tuple(ordered)

def read_dictionary(file_path):
    """Read the dictionary file and group romanized words by their Sinhala word list.

    Every non-blank line must have the form ``romanized: <python list literal>``.
    The list is normalized with ``normalize_word_list`` so that lists differing
    only in order share one group.

    Args:
        file_path: Path of the UTF-8 dictionary file.

    Returns:
        defaultdict mapping each normalized (sorted tuple) word list to the
        list of romanized words that map to it, in file order.

    Raises:
        ValueError: If a non-blank line has no colon, or its value is not a
            valid literal.
        SyntaxError: If a value cannot be parsed as Python syntax.
    """
    import ast  # local import: this cell does not import ast itself

    grouped_dict = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            # Split on the first colon only, so a colon inside the list part
            # cannot break the unpacking.
            romanized, sinhala_str = line.split(':', 1)
            # literal_eval parses the list literal without executing arbitrary
            # code from the file (this replaces a plain eval).
            sinhala_words = ast.literal_eval(sinhala_str.strip())

            normalized_key = normalize_word_list(sinhala_words)
            grouped_dict[normalized_key].append(romanized.strip())

    return grouped_dict

def write_grouped_dictionary(output_path, grouped_dict):
    """Write each group as ``rom1, rom2, ...: [sinhala words]`` on its own line.

    The romanized words are joined with ``', '`` and the group's key is
    rendered as a list. The UTF-8 output file is overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        out.writelines(
            "{}: {}\n".format(', '.join(names), list(words))
            for words, names in grouped_dict.items()
        )

# Group the romanized words of the input dictionary by Sinhala word list and
# write the result (absolute, machine-specific paths; output is overwritten).
# NOTE(review): this reads dictionary_3.txt rather than the
# final_dictionary_3.txt produced by the previous cell - confirm the input.
file_path = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/dictionary_3.txt"
output_path = "E:/4th Year/FYP/FYP-TEST/lemmatized_dictionaries/grouped_final_dictionary_3.txt"
grouped_dictionary = read_dictionary(file_path)
write_grouped_dictionary(output_path, grouped_dictionary)

# Echo the written file back for a quick visual check
with open(output_path, 'r', encoding='utf-8') as f:
    print(f.read())

- manually clean the file "grouped_final_dictionary_3"