<div class="alert alert-block alert-info">

# Pre-process abbreviations in Tamil 
</div>

In [1]:
import re
tamil_other_abbreviation_pattern = re.compile(r'(?:[\u0B80-\u0BFF]{1,3}\.){2,}')
tamil_month_abbreviations = [r'ஜன\.', r'ஜனவரி\.', r'பிப்\.', r'பிப்ரவரி\.', r'மார்ச்\.', r'ஏப்\.', r'ஏப்ரல்\.', r'மே\.', r'ஜூன்\.', r'ஜூலை\.', r'ஆக\.', r'ஆகஸ்ட்\.', r'ஆகஸ்டு\.', r'ஆகத்து\.', r'செப்\.', r'செப்டம்பர்\.' , r'அக்\.', r'அக்டோபர்\.', r'நவ\.', r'நவம்பர்\.', r'டிச\.', r'டிசம்பர்\.']
sentence_endings = re.compile(r'(?<=[.!?])\s')
tamil_month_abbreviation_pattern = re.compile(r'|'.join(tamil_month_abbreviations))

In [2]:
def processAbbreviations(chunk):
    """
    Function to process abbreviations within a chunk of text.
    Ensures that abbreviations are not misinterpreted as sentence boundaries.
    """
   
    month_matches = list(tamil_month_abbreviation_pattern.finditer(chunk))
    #print(month_matches)
    other_matches = list(tamil_other_abbreviation_pattern.finditer(chunk))
    matches = month_matches+other_matches
    #print(matches)
    for match in matches:
        
        abbr = match.group(0)
        #print(abbr)
        chunk = chunk.replace(abbr, abbr.replace('.', '<<PERIOD>>'))

    return chunk

<div class="alert alert-block alert-warning">

### Process abbreviations and write every sentence in a new line of a df.

</div>

In [3]:
s = "1980ஆம் ஆண்டு வாழ. 80 பரிசுகள் கொண்டு வா."
st = re.sub('\d+', '<NUM>', s)
print(st)

<NUM>ஆம் ஆண்டு வாழ. <NUM> பரிசுகள் கொண்டு வா.


In [4]:
unwanted_chars_pattern = re.compile(r'[\u200B-\u200D\uFEFF\uFFFC]|[^\w\s.\u0B80-\u0BFF]')
def read_tamil_sentences(file_path, output_file, chunk_size):  
    with open(file_path, 'r', encoding='utf-8') as file, open(output_file, 'w', encoding='utf-8') as out_file:
        while True:
            
            chunk = file.read(chunk_size)

            if not chunk:  
                break

            chunk = processAbbreviations(chunk)
            sentences = sentence_endings.split(chunk)

            
            for sentence in sentences:
                if sentence:  
                    sent = re.sub('\d+', '<NUM>', sentence)
                    sent = sent.replace("�","")
                    sent = re. sub(unwanted_chars_pattern,'',sent)
                    out_file.write(sent + '\n')

In [5]:
import time 
start = time.time()
file_path = 'E:\\CORPUS WITH TXT AND TEI XML FILES\\TamilCorpus.txt'  
output_file = 'tamilSentences.csv'  
chunk_size = 1024*1024 # In 1024 KB or 1 MB chunks
read_tamil_sentences(file_path, output_file, chunk_size)
print(f"Sentences have been written to {output_file}")
end = time.time()
print("The time of execution of above program is : ",(end-start) * 10**3, "ms")

Sentences have been written to tamilSentences.csv
The time of execution of above program is :  3160204.908847809 ms


<div class="alert alert-block alert-info">

# Build a Tamil dictionary  
</div>

In [4]:
from langid.langid import LanguageIdentifier, model
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

In [11]:
import pandas as pd
import json
import re

import time 
start = time.time()
# Load the CSV file
csv_file = "tamilSentences.csv"  
df = pd.read_csv(csv_file)
sentences = df.iloc[:, 0].dropna().tolist()
uniqueTamilWords = set()
for sentence in sentences:
    words = sentence.split()
    uniqueTamilWords.update(words)
wordList = sorted(uniqueTamilWords)
json_file = "E:\\tamilDictionary.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(wordList, f, ensure_ascii=False, indent=4)
print(f"Unique Tamil words saved to {json_file}")
end = time.time()
print("The time of execution of above program is : ",(end-start) * 10**3, "ms")


Unique Tamil words saved to E:\tamilDictionary.json
The time of execution of above program is :  841787.9774570465 ms


In [12]:
import json
import re


def process_line(line):
    # 1) Remove all dots (.)
    line = line.replace(".", "")
    
    # 2) Remove all English characters except "NUM" and "PERIOD"
    # Allow "NUM" and "PERIOD" (any number of occurrences) and remove everything else
    line = re.sub(r"(?!NUM|PERIOD)[A-Za-z]", "", line)
    
    # Remove any extra spaces that might result
    return line.strip()

# File paths
input_file = "E:\\tamilDictionary.json"  # Replace with your input JSON file
output_file = "E:\\output.json"  # Replace with your desired output JSON file

# Process the JSON file
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    output_lines = []
    for line in infile:
        line = line.strip()  # Remove leading/trailing whitespace
        if line:  # Process non-empty lines
            processed_line = process_line(line)
            output_lines.append(processed_line)
    
    # Write all processed lines to the output file in JSON format
    json.dump(output_lines, outfile, ensure_ascii=False, indent=4)

print(f"Processed lines saved to {output_file}")

Processed lines saved to E:\output.json


<div class="alert alert-block alert-info">

# Build a Word2Vec model 
</div>

In [40]:
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize
import string

### Step 1: Load the Preprocessed Data

In [41]:
file_path = 'D:\\Preprocessed\\TamilCorpusSampled615.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

### Step 2: Tokenize Sentences with indic-nlp-library

In [42]:
from indicnlp.tokenize import indic_tokenize

sentences = text_data.splitlines()  
tokenized_sentences = [list(indic_tokenize.trivial_tokenize(sentence, lang='ta')) for sentence in sentences]

### Step 3: Train the Word2Vec Model (skipgram model)

In [43]:
from gensim.models import Word2Vec

# vector_size: Each word will be represented as a dense vector with 200 components.
# window: The model considers 5 words before and 5 words after a target word to predict word relationships.
# min_count: Ignores words that appear fewer than 2 times in the corpus.
# workers: Specifies the number of worker threads to use for parallel training.
# sg=1: Skip-Gram model (focuses on predicting context words from a target word).
sg_model = Word2Vec(sentences=tokenized_sentences, vector_size=300, window=5, min_count=2, workers=4, sg=1)
sg_model.save("E:\\word2Vec model\\tamilWord2vec_SG.model")
print(f"total words: {sg_model.corpus_total_words}")
# Display the top 50 words in the vocabulary
top_50_words = list(sg_model.wv.index_to_key[1:51])
print("Top 50 words in the vocabulary:", top_50_words)

total words: 4149013
Top 50 words in the vocabulary: ['ஒரு', 'என்று', 'மற்றும்\u200c', 'மொத்தம்\u200c', 'இந்த', 'அல்லது', 'அரசு', 'வேண்டும்\u200c', 'என்ற', 'அந்த', 'கொண்டு', 'மற்றும்', 'உள்ள', 'ரூ', 'இது', 'பழங்குடியினர்\u200c', 'என்பது', 'பல', 'என', 'செய்து', 'சென்னை', 'வரவுகள்\u200c', 'மாவட்ட', 'ஏனைய', 'தனது', 'அது', 'போன்ற', 'மதிப்பீடு', 'வேண்டும்', 'இருந்து', 'விடுதி', 'பெருந்தலைப்பு', 'ஆதிதிராவிடர்\u200c', 'என்ன', 'திட்ட', 'அரியலூர்', 'பிரிவு', 'போது', 'நல', 'கல்வி', 'இல்லை', 'துறை', 'துணை', 'சில', 'கொண்ட', 'திரு', 'நான்\u200c', 'மூலம்\u200c', 'அவர்', 'தான்']


### Step 3.1: Train the Word2Vec Model (CBOW model)

In [44]:
# sg=0: CBOW model - Continuous Bag of Words, predicts a target word from surrounding context words
cbow_model = Word2Vec(sentences=tokenized_sentences, vector_size=200, window=5, min_count=2, workers=4, sg=0)
cbow_model.save("E:\\word2Vec model\\tamilWord2vec_CBOW.model")
print(f"total words: {cbow_model.corpus_total_words}")
# Display the top 50 words in the vocabulary
top_50_words = list(cbow_model.wv.index_to_key[1:51])
print("Top 50 words in the vocabulary:", top_50_words)

total words: 4149013
Top 50 words in the vocabulary: ['ஒரு', 'என்று', 'மற்றும்\u200c', 'மொத்தம்\u200c', 'இந்த', 'அல்லது', 'அரசு', 'வேண்டும்\u200c', 'என்ற', 'அந்த', 'கொண்டு', 'மற்றும்', 'உள்ள', 'ரூ', 'இது', 'பழங்குடியினர்\u200c', 'என்பது', 'பல', 'என', 'செய்து', 'சென்னை', 'வரவுகள்\u200c', 'மாவட்ட', 'ஏனைய', 'தனது', 'அது', 'போன்ற', 'மதிப்பீடு', 'வேண்டும்', 'இருந்து', 'விடுதி', 'பெருந்தலைப்பு', 'ஆதிதிராவிடர்\u200c', 'என்ன', 'திட்ட', 'அரியலூர்', 'பிரிவு', 'போது', 'நல', 'கல்வி', 'இல்லை', 'துறை', 'துணை', 'சில', 'கொண்ட', 'திரு', 'நான்\u200c', 'மூலம்\u200c', 'அவர்', 'தான்']


### Step 4: Analyze word similarities (An example using skipgram model)

In [45]:
word = 'சட்ட'  
if word in sg_model.wv.key_to_index:
    similar_words = sg_model.wv.most_similar(word, topn=15)
    print(f"Words most similar to '{word}':")
    for sim_word, score in similar_words:
        print(f"{sim_word}: {score}")
else:
    print(f"Word '{word}' not found in the vocabulary.")

Words most similar to 'சட்ட':
இந்திய: 0.9664392471313477
குற்றவியல்‌: 0.9620451927185059
செலவினமாகும்‌: 0.9597854018211365
விளக்கம்‌: 0.9593039751052856
பாதிக்கப்பட்டவருக்கு: 0.9582811594009399
சட்டப்‌: 0.9575212597846985
தண்டனைச்‌: 0.956930935382843
நுணுக்க: 0.9562482237815857
சட்டநடைமுறைகளில்‌: 0.9562100768089294
சாசன: 0.952883243560791
குற்றத்தினால்‌: 0.9522219300270081
நாணயம்‌: 0.9518848061561584
விரோதமான: 0.9510866403579712
தேவைப்‌: 0.9506672024726868
சம்பந்தப்பட்டவர்கள்‌: 0.9494932889938354


### Step 4.1: Analyze word similarities (An example using CBOW model)

In [46]:
word = 'துறை'  
if word in cbow_model.wv.key_to_index:
    similar_words = cbow_model.wv.most_similar(word, topn=15)
    print(f"Words most similar to '{word}':")
    for sim_word, score in similar_words:
        print(f"{sim_word}: {score}")
else:
    print(f"Word '{word}' not found in the vocabulary.")

Words most similar to 'துறை':
கணக்கு: 0.9991927146911621
தலைவர்‌: 0.9989467859268188
முதலமைச்சர்‌: 0.9986269474029541
மாண்புமிகு: 0.998553991317749
செயலாளரின்‌: 0.9984923601150513
அமைச்சர்‌: 0.9979434609413147
செயலகம்‌: 0.9976032376289368
திருவள்ளூர்‌: 0.9974076151847839
கழகம்‌: 0.9972704648971558
தாட்கோ: 0.9969611763954163
இயக்குநருக்கு: 0.9969244599342346
கட்டடங்கள்‌: 0.9967366456985474
அமைச்சரின்‌: 0.9967193603515625
இருப்புக்‌: 0.9967154860496521
அலுவலகம்‌: 0.9966514706611633
