In [1]:
import numpy as np
import re
import pickle as pkl
import pprint as pp
import random
from nltk import edit_distance, FreqDist

In [3]:
# Load the pickled DIACRITICS object from the shared utilities folder.
# NOTE(review): pickle.load can execute arbitrary code while loading - only
# use pickle files that come from a trusted source.
# NOTE(review): DIACRITICS is not referenced by any cell visible in this
# notebook - confirm it is still needed.
with open('../utilities/pickle_files/DIACRITICS.pickle', 'rb') as file:
    DIACRITICS = pkl.load(file)

# Tashkeel marks to strip: double damma, double fatha, double kasra, damma,
# fatha, kasra, sukoon, what appears to be a combined shadda/double-damma
# entry, and the bare shadda.
TASHKEEL_SET = {'ٌ', 'ً', 'ٍ', 'ُ', 'َ', 'ِ', 'ْ', 'ٌّ', 'ّ'}
# One alternation over every entry of TASHKEEL_SET.
DIACRITICS_REGEX = re.compile('|'.join(TASHKEEL_SET))


def remove_diacritics(data):
    '''
    Return `data` with every match of DIACRITICS_REGEX deleted.
    '''
    stripped = DIACRITICS_REGEX.sub('', data)
    return stripped

In [4]:
def preprocess(file):
    '''
    Read a UTF-8 text file and flatten it into one space-separated string.

    Only characters in the range U+0621-U+0655 and spaces survive, so the
    Arabic diacritic marks (U+064B-U+0652) are kept, not removed.
    '''
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Applied in order: line-feed runs become a space, everything outside
    # the kept range is dropped, then whitespace runs are squeezed to one space.
    substitutions = (
        (r'\u000a+', ' '),
        (r'[^\u0621-\u0655 ]+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [5]:
def createDictionary(input_file):
    '''
    Build the post-processing lookup tables from a text file.

    Returns a tuple (word_dict, unigram):
      - word_dict maps each undiacritized word to the set of diacritized
        spellings seen for it in the preprocessed file
      - unigram is a FreqDist over every word of the preprocessed file
    '''
    tokens = preprocess(input_file).split()
    unigram = FreqDist(tokens)

    word_dict = {}
    for token in tokens:
        bare = remove_diacritics(token)
        # A token that is unchanged by stripping carries no diacritics,
        # so it contributes nothing to the lookup table.
        if bare != token:
            word_dict.setdefault(bare, set()).add(token)

    return word_dict, unigram

In [7]:
def closestWords(possibilities_set, misspelled_word):
    '''
    Return every candidate whose edit distance to the misspelled word is minimal.

    Both strings are compared with their final character dropped. Candidates
    are returned in iteration order; an empty input gives an empty list.
    '''
    target = misspelled_word[:-1]
    scored = [(edit_distance(target, candidate[:-1]), candidate)
              for candidate in possibilities_set]
    if not scored:
        return []

    best = min(score for score, _ in scored)
    return [candidate for score, candidate in scored if score == best]

def wordEndings(word):
    '''
    Count the diacritic marks at the very end of `word` (at most two are checked).

    A character counts as a diacritic when it lies in U+064B-U+0652.

    Returns:
        2 if the last two characters are both diacritics,
        1 if only the last character is a diacritic,
        0 otherwise, including for an empty string.
    '''
    if re.match(r"[\u064b-\u0652]{2}", word[-2:]):
        return 2
    # Slice rather than index: word[-1] raises IndexError on an empty string,
    # while word[-1:] is simply '' and fails to match.
    if re.match(r"[\u064b-\u0652]", word[-1:]):
        return 1
    return 0

def matchLastDiacritic(closest_word, misspelled_word):
    '''
    Return `closest_word` carrying the word-final diacritics of `misspelled_word`.

    The trailing diacritics of the closest word (as counted by wordEndings)
    are removed and the trailing diacritics of the misspelled word are
    appended in their place. When the misspelled word ends without a
    diacritic, the result simply ends without one as well.
    '''
    keep_count = wordEndings(misspelled_word)
    strip_count = wordEndings(closest_word)

    # A zero count must not be used as a negative slice bound: [:-0] is empty.
    stem = closest_word[:-strip_count] if strip_count else closest_word
    ending = misspelled_word[-keep_count:] if keep_count else ''
    return stem + ending

def postProcessDict(word_dict, unigram, misspelled_word):
    '''
    Pick the most probable diacritized form for `misspelled_word`.

    word_dict maps undiacritized words to their known diacritized spellings
    and unigram gives the frequency of each spelling. Words whose
    undiacritized form is unknown are returned untouched. Among the closest
    spellings the most frequent one wins; a frequency tie is broken with
    random.choice. The chosen spelling keeps the final diacritics of the input.
    '''
    bare = remove_diacritics(misspelled_word)
    if bare not in word_dict:
        return misspelled_word

    candidates = closestWords(word_dict[bare], misspelled_word)
    if len(candidates) == 1:
        return matchLastDiacritic(candidates[0], misspelled_word)

    top_count = max(unigram[candidate] for candidate in candidates)
    front_runners = [candidate for candidate in candidates
                     if unigram[candidate] == top_count]
    if len(front_runners) > 1:
        chosen = random.choice(front_runners)
    else:
        chosen = front_runners[0]
    return matchLastDiacritic(chosen, misspelled_word)

In [9]:
# Build the undiacritized -> diacritized lookup and the unigram counts from the training text.
word_dict, unigram = createDictionary('../data/train.txt')


In [10]:
# Persist (word_dict, unigram) as a single pickled tuple.
# The path uses forward slashes: inside a normal string literal a backslash
# followed by "u" starts a unicode escape, so a Windows-style path beginning
# with ".\utilities" does not even parse.
with open("./utilities/POST_PROCESSING2.pickle", 'wb') as pickle_file:
    pkl.dump((word_dict, unigram), pickle_file)


In [11]:
# Smoke test: run the post-processor on a single undiacritized word.
# NOTE(review): the saved output under this cell contains two extra lines
# (a set and a list) that none of these statements print - presumably left
# over from debug prints that were since removed; re-run to refresh.
misspelled_word = "تقدمها"
corrected_word = postProcessDict(word_dict, unigram, misspelled_word)
print("Misspelled word:", misspelled_word)
print("Corrected word:", corrected_word)
# Edit distance between the input and the corrected word.
print(edit_distance(misspelled_word, corrected_word))


{'تُقَدِّمْهَا', 'تُقَدِّمُهَا', 'تَقَدُّمُهَا', 'تَقَدُّمِهَا', 'تَقَدّمَهَا', 'تَقَدَّمَهَا', 'تُقَدَّمُهَا', 'تَقَدُّمَهَا'}
['تَقَدّمَهَا']
Misspelled word: تقدمها
Corrected word: تَقَدّمَهَا
5


# Post-processing val_inference1.txt

In [None]:
# Read the raw model predictions and split them on single spaces only.
# A token that still contains a newline can never match a key of word_dict
# (the keys are built from str.split() output), so postProcessDict returns it
# unchanged and the line breaks of the input survive in the output.
with open('val_inference1.txt', 'r', encoding='utf-8') as file:
    lines = file.read()
words = lines.split(' ')

# Open the output once, in 'w' mode: re-running the cell then replaces the
# file instead of appending a second copy of the text, and the file is no
# longer reopened for every single word.
with open('val_inference1_postprocessed.txt', 'w', encoding='utf-8') as file:
    for word in words:
        corrected_word = postProcessDict(word_dict, unigram, word)
        file.write(corrected_word)
        file.write(' ')

In [13]:
# Scratch cell: prints a hand-written sample of one dictionary entry; it assigns nothing.
print("تقدموا': {'تَقْدَمُوا', 'تَقَدَّمُوا', 'تُقَدِّمُوا'},")

تقدموا': {'تَقْدَمُوا', 'تَقَدَّمُوا', 'تُقَدِّمُوا'},


In [14]:
def highlight_misclassified_words(true_text, predicted_text):
    '''
    Compare two texts word by word and annotate the words that differ.

    Words are paired positionally after whitespace splitting; surplus words
    of the longer text are ignored. Returns a tuple
    (annotated_text, total_edit_distance), where the total is the sum of the
    edit distances of all differing word pairs.

    NOTE(review): a later cell defines another function with this same name
    that returns only the annotated string - the later one wins once run.
    '''
    annotated = []
    total_distance = 0

    for expected, actual in zip(true_text.split(), predicted_text.split()):
        gap = edit_distance(expected, actual)
        if gap > 0:
            entry = f"{expected} [misclassified as: {actual}]\n"
            total_distance += gap
        else:
            entry = f"{expected}"
        annotated.append(entry)

    return ' '.join(annotated), total_distance

In [15]:
# Read true diacritized text from val.txt
# Load the gold-standard text together with the raw model output.
with open('val.txt', 'r', encoding='utf-8') as true_file, \
        open('val_inference1.txt', 'r', encoding='utf-8') as predicted_file:
    true_diacritized_text = true_file.read()
    predicted_diacritized_text = predicted_file.read()

# Total edit distance of the raw predictions (the annotated text is replaced below).
highlighted_output, mis1 = highlight_misclassified_words(true_diacritized_text, predicted_diacritized_text)

# Same measurement for the post-processed predictions.
with open('val_inference1_postprocessed.txt', 'r', encoding='utf-8') as predicted_file:
    predicted_diacritized_text = predicted_file.read()
highlighted_output, mis2 = highlight_misclassified_words(true_diacritized_text, predicted_diacritized_text)

# Save the annotated comparison of the post-processed predictions.
with open('misclassified_words_output.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(highlighted_output)

# Both figures are the summed edit distance as a percentage of the
# character length of val.txt (whitespace included).
reference_length = len(true_diacritized_text)
print("DER before postprocessing: ", mis1*100/reference_length)
print("DER after postprocessing : ", mis2*100/reference_length)

DER before postprocessing:  1.2106096934406576
DER after postprocessing :  1.0816278472341154


In [17]:
def highlight_misclassified_words(true_text, predicted_text):
    '''
    Annotate the words of `true_text` that were predicted wrongly.

    Words are paired positionally after whitespace splitting; surplus words
    of the longer text are ignored. A pair that differs only in its final
    character is treated as correct. Returns the annotated words joined by
    single spaces.

    NOTE(review): this reuses the name of an earlier function in this
    notebook that returns a (text, distance) tuple; once this cell runs,
    the earlier definition is replaced.
    '''
    def annotate(expected, actual):
        if expected == actual:
            return expected
        # Pairs that agree on everything but the last character are not flagged.
        if len(expected) > 1 and expected[:-1] == actual[:-1]:
            return expected
        return f"{expected} [misclassified as: {actual}]\n"

    pairs = zip(true_text.split(), predicted_text.split())
    return ' '.join(annotate(expected, actual) for expected, actual in pairs)

# Load the reference text and the post-processed predictions.
with open('val.txt', 'r', encoding="utf-8") as true_file, \
        open('val_inference1_postprocessed.txt', 'r', encoding="utf-8") as predicted_file:
    true_diacritized_text = true_file.read()
    predicted_diacritized_text = predicted_file.read()

# Annotate the mismatches and overwrite the report file with the result.
highlighted_output = highlight_misclassified_words(true_diacritized_text, predicted_diacritized_text)
with open('misclassified_words_output.txt', 'w', encoding="utf-8") as output_file:
    output_file.write(highlighted_output)

In [None]:
# Sample sentence.
# NOTE(review): the code visible below only uses `text` as a function
# parameter, never this module-level value - confirm whether it can be removed.
text = "وَكَانَ الطَّالِبُ الجميل الرائع العظيم عُمَرُ يَلْهُو فِي الْفَصْلِ"

def add_shadda_to_lam_shamsia(text):
    '''
    Put a shadda on every sun letter that directly follows "ال".

    A letter is left alone when it is already followed by a shadda, or by a
    fatha/damma/kasra that is itself followed by a shadda. The shadda is
    inserted immediately after the letter. Returns the modified text.
    '''
    sun_letter_without_shadda = re.compile(r'(?<=ال)[تثدذرزسشصضطظلن](?!(\u0651|\u064e\u0651|\u064f\u0651|\u0650\u0651))')
    return sun_letter_without_shadda.sub(lambda hit: hit.group(0) + 'ّ', text)

def add_sukoon_to_lam_qamaria(text):
    '''
    Insert a sukoon between "ال" and a moon letter that follows it directly.

    The pattern is zero-width: it matches the position right after "ال"
    when the next character is one of the listed letters, and the sukoon is
    placed at that position. Returns the modified text.
    '''
    moon_letter_after_al = re.compile(r'(?<=ال)(?=[أآإبجحخعغفقكمهوي])')
    return moon_letter_after_al.sub(lambda gap: gap.group(0) + 'ْ', text)

# Example usage
arabic_text = "الظابط الولد الطَّالِبُ الجميل الرَُائع العظيم عُمَرُ يَلْهُو فِي الْفَصْلِ"
text_with_shadda = add_shadda_to_lam_shamsia(arabic_text)

with open('outputs.txt', '+w', encoding='utf-8') as file:
    file.write(''.join(arabic_text))
    file.write('\n')
    file.write('\n')
    file.write(''.join(text_with_shadda))