In [1]:
import re
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd

In [2]:
def read_data(path: str) -> str:
    """Return the entire contents of the UTF-8 text file at ``path`` as one string."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [3]:
# filter_data takes one string and strips unwanted patterns from it
def filter_data(data: str) -> str:
    """Remove digits, listed punctuation and Latin letters from ``data``.

    After the removals, every run of whitespace characters other than
    newline is collapsed into a single space; newlines are left untouched.
    """
    removal_patterns = (
        # every run of digits
        r"\d+",
        # brackets, slashes, quotes and the other listed punctuation marks
        r"[][//,;\?؟()$:\-{}_*؛:«»`–\"~!]",
        # ASCII English letters
        r"[a-zA-Z]",
    )
    cleaned = data
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    # Squeeze repeated non-newline whitespace down to one space
    return re.sub(r"([^\S\n])+", " ", cleaned, flags=re.I)

In [4]:
def split_data_to_words(data: str) -> list:
    """Split ``data`` on every run of whitespace and return the pieces.

    Leading or trailing whitespace yields an empty string at the
    corresponding end of the returned list.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.split(data)

In [35]:
# Single-code-point diacritic marks
KASRA = "\u0650"
DAMMA = "\u064F"
FATHA = "\u064E"
KASRATAN = "\u064D"
DAMMATAN = "\u064C"
FATHATAN = "\u064B"
SUKUN = "\u0652"
SHADDA = "\u0651"

# Two-mark combinations with SHADDA, defined in both orderings
DAMMA_SHADDA = DAMMA + SHADDA
SHADDA_DAMMA = SHADDA + DAMMA
FATHA_SHADDA = FATHA + SHADDA
SHADDA_FATHA = SHADDA + FATHA
KASRA_SHADDA = KASRA + SHADDA
SHADDA_KASRA = SHADDA + KASRA
DAMMATAN_SHADDA = DAMMATAN + SHADDA
SHADDA_DAMMATAN = SHADDA + DAMMATAN
FATHATAN_SHADDA = FATHATAN + SHADDA
SHADDA_FATHATAN = SHADDA + FATHATAN
KASRATAN_SHADDA = KASRATAN + SHADDA
SHADDA_KASRATAN = SHADDA + KASRATAN

# Empty string (presumably stands for "no diacritic" -- confirm with callers)
EMPTY = ""

# All values above, in definition order: singles, SHADDA pairs, then EMPTY
DIACRITICS = [
    KASRA, DAMMA, FATHA, KASRATAN, DAMMATAN, FATHATAN, SUKUN, SHADDA,
    DAMMA_SHADDA, SHADDA_DAMMA,
    FATHA_SHADDA, SHADDA_FATHA,
    KASRA_SHADDA, SHADDA_KASRA,
    DAMMATAN_SHADDA, SHADDA_DAMMATAN,
    FATHATAN_SHADDA, SHADDA_FATHATAN,
    KASRATAN_SHADDA, SHADDA_KASRATAN,
    EMPTY,
]

# Arabic letters, including hamza/alef variants
ARABIC_ALPHABIT = "اأآإئءبتةثجحخدذرزسشصضطظعغفقكلمنهوؤيى"

In [36]:
# Translate a diacritic (or a SHADDA combination) into its textual label
def diacritic_to_str(diacritic):
    """Return the label name for ``diacritic``.

    Both orderings of a SHADDA combination map to the same ``SHADDA_*``
    label. A value that matches none of the known diacritics yields a
    single space.
    """
    # (label, accepted values) pairs, checked in order
    label_table = (
        ("SHADDA", (SHADDA,)),
        ("KASRA", (KASRA,)),
        ("DAMMA", (DAMMA,)),
        ("FATHA", (FATHA,)),
        ("KASRATAN", (KASRATAN,)),
        ("DAMMATAN", (DAMMATAN,)),
        ("FATHATAN", (FATHATAN,)),
        ("SUKUN", (SUKUN,)),
        ("SHADDA_DAMMA", (DAMMA_SHADDA, SHADDA_DAMMA)),
        ("SHADDA_FATHA", (FATHA_SHADDA, SHADDA_FATHA)),
        ("SHADDA_KASRA", (KASRA_SHADDA, SHADDA_KASRA)),
        ("SHADDA_DAMMATAN", (DAMMATAN_SHADDA, SHADDA_DAMMATAN)),
        ("SHADDA_FATHATAN", (FATHATAN_SHADDA, SHADDA_FATHATAN)),
        ("SHADDA_KASRATAN", (KASRATAN_SHADDA, SHADDA_KASRATAN)),
    )
    for label, variants in label_table:
        if diacritic in variants:
            return label
    # Unknown input (including the empty string) maps to a blank label
    return " "

In [49]:
# Load the raw text (the path below points at val.txt)
raw_text = read_data("./Dataset/val.txt")
# Run the text through filter_data
filtered_text = filter_data(raw_text)
# Cut the text into segments at every "." or Arabic comma and drop empty
# segments. NOTE(review): whitespace-only segments are non-empty strings and
# are therefore kept.
training_set = [segment for segment in re.split(r"[.،]", filtered_text) if segment]

In [50]:
# Number of entries currently held in training_set
len(training_set)

5919

In [51]:
# One empty-string slot per entry of training_set for each output list
original_labels = [""] * len(training_set)
sentences = [""] * len(training_set)

In [52]:
def _strip_and_label(word):
    """Split one word into its non-diacritic characters and its final label.

    Returns ``(bare_word, label)``. ``bare_word`` is ``word`` with every
    character found in ``DIACRITICS`` removed. ``label`` is the
    ``diacritic_to_str`` name of the diacritic(s) that end the word, or
    ``None`` when the last character is not a diacritic.
    """
    bare_word = "".join(ch for ch in word if ch not in DIACRITICS)
    ends_with_mark = bool(word) and word[-1] in DIACRITICS
    if ends_with_mark and len(word) >= 2 and word[-2] in DIACRITICS:
        # Two trailing marks form ONE combined label. The previous loop also
        # re-labelled the final mark on its own, producing two labels per word.
        return bare_word, diacritic_to_str(word[-2:])
    if ends_with_mark:
        return bare_word, diacritic_to_str(word[-1])
    return bare_word, None


# Fill sentences[i] / original_labels[i] for every entry of training_set.
# Each sentence is the space-joined bare words followed by " ."; each label
# line is the space-joined labels of the words that end in a diacritic.
# NOTE(review): words without a trailing diacritic contribute no label, so a
# label line can be shorter than its sentence -- confirm this is intended.
for i, line in enumerate(training_set):
    line_without_diacritics = []
    line_labels = []
    # str.split() never yields empty strings, so no empty-word check is needed
    for word in line.split():
        word_without_diacritics, word_label = _strip_and_label(word)
        if word_label is not None:
            line_labels.append(word_label)
        line_without_diacritics.append(word_without_diacritics)
    sentences[i] = " ".join(line_without_diacritics) + " ."
    original_labels[i] = " ".join(line_labels)

In [53]:
# Write the sentences and the labels to their text files, one entry per line
for out_path, rows in (
    ("./Dataset/v_sentences.txt", sentences),
    ("./Dataset/v_labels.txt", original_labels),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(rows))

In [47]:
# Collect the unique words of the current sentences and merge them into
# ./Dataset/unique_words.txt
unique_words = set()
for line in sentences:
    unique_words.update(line.split())

unique_words_path = "./Dataset/unique_words.txt"
# Appending raw text (the old behaviour) glued the last entry of one run onto
# the first entry of the next, because nothing separated the two writes. It
# also duplicated entries across runs. Instead, merge with what is already on
# disk and rewrite the whole file.
merged_words = set(unique_words)
if os.path.exists(unique_words_path):
    with open(unique_words_path, "r", encoding="utf-8") as f:
        merged_words.update(f.read().split())
with open(unique_words_path, "w", encoding="utf-8") as f:
    # Sorted so the file content is deterministic between runs
    f.write("\n".join(sorted(merged_words)))

In [48]:
# Collect the unique labels of the current label lines and merge them into
# ./Dataset/unique_labels.txt
unique_labels = set()
for line in original_labels:
    unique_labels.update(line.split())

unique_labels_path = "./Dataset/unique_labels.txt"
# Appending raw text (the old behaviour) glued the last entry of one run onto
# the first entry of the next, because nothing separated the two writes. It
# also duplicated entries across runs. Instead, merge with what is already on
# disk and rewrite the whole file.
merged_labels = set(unique_labels)
if os.path.exists(unique_labels_path):
    with open(unique_labels_path, "r", encoding="utf-8") as f:
        merged_labels.update(f.read().split())
with open(unique_labels_path, "w", encoding="utf-8") as f:
    # Sorted so the file content is deterministic between runs
    f.write("\n".join(sorted(merged_labels)))