In [96]:
import re
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd

In [97]:
def read_data(path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``path`` as one string."""
    with open(path, mode="r", encoding="utf-8") as source_file:
        contents = source_file.read()
    return contents

In [98]:
# filter_data takes the text as ONE string and strips unwanted patterns from it
def filter_data(data: str) -> str:
    """Clean raw text and return the cleaned string.

    The passes run in this order:
      1. delete every run of digits,
      2. delete the listed punctuation / special characters,
      3. delete ASCII Latin letters,
      4. collapse each run of whitespace other than newline into one space.

    Newlines, "." and the Arabic comma are left in place.
    """
    deletion_patterns = (
        r"\d+",
        r"[][//,;\?؟()$:\-{}_*؛:«»`–\"~!]",
        r"[a-zA-Z]",
    )
    for pattern in deletion_patterns:
        data = re.sub(pattern, "", data)
    # [^\S\n] == "whitespace that is not a newline", so line breaks survive
    return re.sub(r"([^\S\n])+", " ", data, flags=re.I)

In [99]:
def split_data_to_words(data: str) -> list:
    """Split ``data`` on runs of whitespace and return the pieces as a list.

    Follows ``re.split`` semantics: leading or trailing whitespace produces an
    empty string at the corresponding end of the result.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.split(data)

In [100]:
# Diacritic marks, one code point each (listed in code-point order)
FATHATAN = "\u064B"
DAMMATAN = "\u064C"
KASRATAN = "\u064D"
FATHA = "\u064E"
DAMMA = "\u064F"
KASRA = "\u0650"
SHADDA = "\u0651"
SUKUN = "\u0652"

# Two-mark combinations, stored as <vowel mark> followed by SHADDA
SHADDA_DAMMA = f"{DAMMA}{SHADDA}"
SHADDA_FATHA = f"{FATHA}{SHADDA}"
SHADDA_KASRA = f"{KASRA}{SHADDA}"
SHADDA_DAMMATAN = f"{DAMMATAN}{SHADDA}"
SHADDA_FATHATAN = f"{FATHATAN}{SHADDA}"
SHADDA_KASRATAN = f"{KASRATAN}{SHADDA}"

# "No diacritic"
EMPTY = ""

# Every diacritic value above; the order of this list is kept as-is
DIACRITICS = [
    DAMMA,
    FATHA,
    KASRA,
    DAMMATAN,
    FATHATAN,
    KASRATAN,
    SHADDA_DAMMA,
    SHADDA_FATHA,
    SHADDA_KASRA,
    SHADDA_DAMMATAN,
    SHADDA_FATHATAN,
    SHADDA_KASRATAN,
    SHADDA,
    SUKUN,
    EMPTY,
]

# Letters treated as the Arabic alphabet (identifier spelling kept for callers)
ARABIC_ALPHABIT = "اأآإئءبتةثجحخدذرزسشصضطظعغفقكلمنهوؤيى"

In [101]:
# Translate a diacritic (one mark, or SHADDA paired with a vowel mark) into its label name
def diacritic_to_str(diacritic):
    """Return the label name for ``diacritic``.

    Single marks map to their own name ("FATHA", "SUKUN", ...). A SHADDA paired
    with a vowel or tanween mark maps to "SHADDA_<MARK>" whichever of the two
    comes first. Any other value yields a single space.
    """
    label_by_mark = {
        SHADDA: "SHADDA",
        KASRA: "KASRA",
        DAMMA: "DAMMA",
        FATHA: "FATHA",
        KASRATAN: "KASRATAN",
        DAMMATAN: "DAMMATAN",
        FATHATAN: "FATHATAN",
        SUKUN: "SUKUN",
    }
    shadda_pairs = (
        (DAMMA, "SHADDA_DAMMA"),
        (FATHA, "SHADDA_FATHA"),
        (KASRA, "SHADDA_KASRA"),
        (DAMMATAN, "SHADDA_DAMMATAN"),
        (FATHATAN, "SHADDA_FATHATAN"),
        (KASRATAN, "SHADDA_KASRATAN"),
    )
    for vowel_mark, label in shadda_pairs:
        # accept both orderings of the two marks
        label_by_mark[vowel_mark + SHADDA] = label
        label_by_mark[SHADDA + vowel_mark] = label
    return label_by_mark.get(diacritic, " ")

In [102]:
# Read train.txt, clean it, then cut the text into segments at "." and "،".
# Only empty strings are discarded; a segment made purely of whitespace is kept.
training_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/train.txt")))
    if segment
]

In [103]:
# Sanity check: number of segments produced by the split on "." and "،"
len(training_set)

116499

In [104]:
# One placeholder per segment; each slot is overwritten by index when the
# segments are processed.
sentences = [""] * len(training_set)
original_labels = [""] * len(training_set)

In [105]:
# Label emitted for a word whose last character is not a diacritic. It is named
# after the EMPTY constant, the same way every other label is named after its mark.
NO_DIACRITIC_LABEL = "EMPTY"

# A single character can only ever equal a one-code-point entry of DIACRITICS
# (never a two-mark combination or ""), so membership is tested against those.
_SINGLE_MARKS = frozenset(mark for mark in DIACRITICS if len(mark) == 1)


def _split_word(word):
    """Return ``(bare_word, final_marks)`` for one whitespace-delimited token.

    ``bare_word`` is ``word`` with every diacritic removed. ``final_marks`` is
    the run of diacritics at the very end of ``word`` ("" if it ends in a
    non-diacritic character).
    """
    bare_word = "".join(ch for ch in word if ch not in _SINGLE_MARKS)
    cut = len(word)
    while cut > 0 and word[cut - 1] in _SINGLE_MARKS:
        cut -= 1
    return bare_word, word[cut:]


def _final_label(final_marks):
    """Map the trailing diacritic run of a word to exactly one label string."""
    if not final_marks:
        return NO_DIACRITIC_LABEL
    label = diacritic_to_str(final_marks)
    if label == " ":
        # Combination diacritic_to_str does not know: label the last mark alone.
        label = diacritic_to_str(final_marks[-1])
    return label


# Build, for every segment, the undiacritized sentence and one label per word.
#  * every kept word gets exactly one label, so words and labels stay aligned
#    (words ending without a diacritic get NO_DIACRITIC_LABEL instead of nothing);
#  * the whole trailing run of marks is labelled, so SHADDA + vowel endings
#    become SHADDA_<VOWEL> regardless of the order of the two marks;
#  * tokens that consist only of diacritics are dropped from both lists.
# The " ." appended to each sentence has no entry in the label list.
for i, line in enumerate(training_set):
    line_without_diacritics = []
    line_labels = []
    for word in line.split():  # split() never yields empty tokens
        bare_word, final_marks = _split_word(word)
        if not bare_word:
            continue
        line_without_diacritics.append(bare_word)
        line_labels.append(_final_label(final_marks))
    assert len(line_labels) == len(line_without_diacritics), "word/label misalignment"
    sentences[i] = " ".join(line_without_diacritics) + " ."
    original_labels[i] = " ".join(line_labels)

In [None]:
# Write sentences and labels to text files, one entry per line (no trailing newline)
for out_path, rows in (
    ("./Dataset/sentences.txt", sentences),
    ("./Dataset/labels.txt", original_labels),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(rows))