In [1]:
import re
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd ./drive/MyDrive/Colab\ Notebooks/NLP_Project/

In [3]:
def read_data(path: str) -> str:
    """Load a UTF-8 text file and return its whole content as one string.

    Args:
        path: Location of the file to read.

    Returns:
        The complete text of the file (not split into lines).
    """
    with open(path, mode="r", encoding="utf-8") as text_file:
        contents = text_file.read()
    return contents

In [4]:
def filter_data(data: str) -> str:
    """Strip unwanted material from a raw text string.

    The substitutions run in this order:
      1. every run of digits is deleted,
      2. the listed punctuation / symbol characters are deleted,
      3. Latin letters a-z and A-Z are deleted,
      4. each run of whitespace other than newlines becomes one space.

    Newline characters are never removed, so line structure is preserved.
    """
    substitutions = (
        (r"\d+", "", 0),
        (r"[][//,;\?؟()$:\-{}_*؛:«»`–\"~!]", "", 0),
        (r"[a-zA-Z]", "", 0),
        (r"([^\S\n])+", " ", re.I),
    )
    for pattern, replacement, flags in substitutions:
        data = re.sub(pattern, replacement, data, flags=flags)
    return data

In [5]:
def split_data_to_words(data: str) -> list:
    """Cut ``data`` at every run of whitespace and return the pieces.

    Whitespace at the very start or end of ``data`` produces an empty string
    at that end of the returned list.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.split(data)

In [30]:
# Arabic diacritic marks, listed by Unicode code point (U+064B .. U+0652).
FATHATAN = "\u064B"
DAMMATAN = "\u064C"
KASRATAN = "\u064D"
FATHA = "\u064E"
DAMMA = "\u064F"
KASRA = "\u0650"
SHADDA = "\u0651"
SUKUN = "\u0652"

# Shadda paired with a vowel mark; both orderings of the two code points
# are defined for every pair.
DAMMA_SHADDA = f"{DAMMA}{SHADDA}"
SHADDA_DAMMA = f"{SHADDA}{DAMMA}"
FATHA_SHADDA = f"{FATHA}{SHADDA}"
SHADDA_FATHA = f"{SHADDA}{FATHA}"
KASRA_SHADDA = f"{KASRA}{SHADDA}"
SHADDA_KASRA = f"{SHADDA}{KASRA}"
DAMMATAN_SHADDA = f"{DAMMATAN}{SHADDA}"
SHADDA_DAMMATAN = f"{SHADDA}{DAMMATAN}"
FATHATAN_SHADDA = f"{FATHATAN}{SHADDA}"
SHADDA_FATHATAN = f"{SHADDA}{FATHATAN}"
KASRATAN_SHADDA = f"{KASRATAN}{SHADDA}"
SHADDA_KASRATAN = f"{SHADDA}{KASRATAN}"

# Placeholder symbol; note that it is itself a member of DIACRITICS.
EMPTY = "_"

# Every recognised diacritic: the single marks, then the shadda pairs, then
# the placeholder.
DIACRITICS = [
    KASRA, DAMMA, FATHA,
    KASRATAN, DAMMATAN, FATHATAN,
    SUKUN, SHADDA,
    DAMMA_SHADDA, SHADDA_DAMMA,
    FATHA_SHADDA, SHADDA_FATHA,
    KASRA_SHADDA, SHADDA_KASRA,
    DAMMATAN_SHADDA, SHADDA_DAMMATAN,
    FATHATAN_SHADDA, SHADDA_FATHATAN,
    KASRATAN_SHADDA, SHADDA_KASRATAN,
    EMPTY,
]

# Arabic letters (including hamza and alef variants) as one string.
ARABIC_ALPHABIT = "اأآإئءبتةثجحخدذرزسشصضطظعغفقكلمنهوؤيى"

In [7]:
def diacritic_to_str(diacritic):
    """Translate a diacritic into its textual label.

    A single mark maps to its own name. Shadda combined with a vowel mark
    maps to ``"SHADDA_<VOWEL>"`` regardless of which of the two code points
    comes first. Anything else (EMPTY, a letter, an unknown pair) maps to
    ``"_"``.
    """
    # Single marks.
    if diacritic == SHADDA:
        return "SHADDA"
    if diacritic == KASRA:
        return "KASRA"
    if diacritic == DAMMA:
        return "DAMMA"
    if diacritic == FATHA:
        return "FATHA"
    if diacritic == KASRATAN:
        return "KASRATAN"
    if diacritic == DAMMATAN:
        return "DAMMATAN"
    if diacritic == FATHATAN:
        return "FATHATAN"
    if diacritic == SUKUN:
        return "SUKUN"
    # Shadda pairs: both orderings share one label.
    if diacritic in (DAMMA_SHADDA, SHADDA_DAMMA):
        return "SHADDA_DAMMA"
    if diacritic in (FATHA_SHADDA, SHADDA_FATHA):
        return "SHADDA_FATHA"
    if diacritic in (KASRA_SHADDA, SHADDA_KASRA):
        return "SHADDA_KASRA"
    if diacritic in (DAMMATAN_SHADDA, SHADDA_DAMMATAN):
        return "SHADDA_DAMMATAN"
    if diacritic in (FATHATAN_SHADDA, SHADDA_FATHATAN):
        return "SHADDA_FATHATAN"
    if diacritic in (KASRATAN_SHADDA, SHADDA_KASRATAN):
        return "SHADDA_KASRATAN"
    # EMPTY and every unrecognised input.
    return "_"

In [8]:
# Load train.txt, clean it with filter_data, cut the text at every "." or
# Arabic comma, and keep only the non-empty pieces.
training_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/train.txt")))
    if segment
]

In [9]:
# Load val.txt, clean it with filter_data, cut the text at every "." or
# Arabic comma, and keep only the non-empty pieces.
validation_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/val.txt")))
    if segment
]

In [10]:
# Load test.txt, clean it with filter_data, cut the text at every "." or
# Arabic comma, and keep only the non-empty pieces.
test_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/test.txt")))
    if segment
]

In [11]:
# Number of segments in the training set after cleaning and splitting.
len(training_set)

116499

In [12]:
def get_data_words(data_set):
    """Build word-level inputs and labels from diacritized text.

    For every whitespace-separated word in every line, the word is stripped
    of all characters found in DIACRITICS, and a single label is produced
    for the word's ending:

    * if the last two characters are both diacritics, the label is
      ``diacritic_to_str`` of that two-character pair;
    * otherwise the label is ``diacritic_to_str`` of the last character
      (which yields "_" when the word ends in a letter).

    Tokens that contain no letter at all (only diacritics) are skipped.
    Keeping them would emit an empty word next to a real label, and the
    empty word disappears when the saved sentence is re-split on whitespace,
    leaving the words and labels of that line misaligned.

    Args:
        data_set: Iterable of text lines (strings).

    Returns:
        ``(sentences, original_labels)``: two lists with one string per input
        line, holding the space-joined stripped words and the space-joined
        labels respectively.
    """
    sentences = []
    original_labels = []
    for line in data_set:
        line_words = []
        line_labels = []
        for word in line.split():
            bare_word = "".join(ch for ch in word if ch not in DIACRITICS)
            if not bare_word:
                # Diacritic-only token: nothing to label, drop it entirely.
                continue
            ends_with_pair = (
                len(word) >= 2
                and word[-2] in DIACRITICS
                and word[-1] in DIACRITICS
            )
            ending = word[-2:] if ends_with_pair else word[-1]
            line_words.append(bare_word)
            line_labels.append(diacritic_to_str(ending))
        sentences.append(" ".join(line_words))
        original_labels.append(" ".join(line_labels))
    return sentences, original_labels

In [13]:
def get_data_chars(data_set):
    """Build character-level inputs and labels from diacritized text.

    Every character that is not in DIACRITICS is emitted as an input
    character with the label ``diacritic_to_str(char)``. A diacritic (or two
    consecutive diacritics, taken together as a pair) that follows replaces
    the label of the most recent character of the same word.

    Diacritics that appear before any letter of their word are ignored.
    Attaching them would either overwrite the label of the previous word's
    last character or, at the very start of a line, fail on an empty label
    list.

    Args:
        data_set: Iterable of text lines (strings).

    Returns:
        ``(sentences, original_labels)``: two lists with one string per input
        line, holding the space-joined characters and the space-joined labels
        respectively. Word boundaries are not kept in the output.
    """
    sentences = []
    original_labels = []
    for line in data_set:
        line_chars = []
        line_labels = []
        for word in line.split():
            has_base_letter = False
            j = 0
            while j < len(word):
                if word[j] not in DIACRITICS:
                    line_chars.append(word[j])
                    line_labels.append(diacritic_to_str(word[j]))
                    has_base_letter = True
                    j += 1
                    continue
                # word[j] is a diacritic; consume the next character too
                # when it is also a diacritic, so the pair forms one label.
                is_pair = j + 1 < len(word) and word[j + 1] in DIACRITICS
                width = 2 if is_pair else 1
                if has_base_letter:
                    line_labels[-1] = diacritic_to_str(word[j : j + width])
                j += width
        sentences.append(" ".join(line_chars))
        original_labels.append(" ".join(line_labels))
    return sentences, original_labels

In [14]:
# Word-level training data: diacritic-stripped sentences and their labels.
t_sentences, t_labels = get_data_words(training_set)

In [15]:
# Character-level training data: space-separated characters and their labels.
t_chars, t_labels_chars = get_data_chars(training_set)

In [16]:
# Word-level validation data: diacritic-stripped sentences and their labels.
v_sentences, v_labels = get_data_words(validation_set)

In [17]:
# Character-level validation data: space-separated characters and their labels.
v_chars, v_labels_chars = get_data_chars(validation_set)

In [18]:
# Word-level test data: diacritic-stripped sentences and their labels.
test_sentences, test_labels = get_data_words(test_set)

In [19]:
# Character-level test data: space-separated characters and their labels.
test_chars, test_labels_chars = get_data_chars(test_set)

In [22]:
# Persist the character-level training inputs and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/characters/t_chars.txt", t_chars),
    ("./Dataset/characters/t_labels.txt", t_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Persist the character-level validation inputs and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/characters/v_chars.txt", v_chars),
    ("./Dataset/characters/v_labels.txt", v_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Persist the character-level test inputs and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/characters/test_chars.txt", test_chars),
    ("./Dataset/characters/test_labels.txt", test_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Persist the word-level training sentences and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/t_sentences.txt", t_sentences),
    ("./Dataset/t_labels.txt", t_labels),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Persist the word-level validation sentences and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/v_sentences.txt", v_sentences),
    ("./Dataset/v_labels.txt", v_labels),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Persist the word-level test sentences and labels, one entry per line.
for out_path, out_lines in (
    ("./Dataset/test_sentences.txt", test_sentences),
    ("./Dataset/test_labels.txt", test_labels),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(out_lines))

In [None]:
# Collect the vocabulary of the training and validation sentences and save it,
# one word per line, followed by the special "UNK" and "<pad>" entries.
unique_words = set()
for line in t_sentences + v_sentences:
    unique_words.update(line.split())

with open("./Dataset/unique_words.txt", "w", encoding="utf-8") as f:
    # Written in sorted order: the iteration order of a set of strings can
    # differ between interpreter runs, which would reshuffle the file (and any
    # index derived from its line order) every time the notebook is re-run.
    f.write("\n".join(sorted(unique_words)))
    f.write("\n")
    f.write("UNK")
    f.write("\n")
    f.write("<pad>")

In [23]:
# Collect the distinct characters of the character-level training and
# validation data and save them, one per line, followed by the special "UNK"
# and "<pad>" entries.
# This previously iterated over t_sentences + v_sentences, which put the word
# vocabulary into unique_chars.txt; the character-level lists are used now.
unique_chars = set()
for line in t_chars + v_chars:
    # Each line holds space-separated characters, so split() yields characters.
    unique_chars.update(line.split())

with open("./Dataset/characters/unique_chars.txt", "w", encoding="utf-8") as f:
    # Sorted so the file content is the same on every run; set iteration
    # order for strings can differ between interpreter runs.
    f.write("\n".join(sorted(unique_chars)))
    f.write("\n")
    f.write("UNK")
    f.write("\n")
    f.write("<pad>")

In [32]:
# Label name of every entry in DIACRITICS, in the same order (the two
# orderings of a shadda pair share a name, so names repeat).
DIACRITICS_LIST_OF_STRINGS = list(map(diacritic_to_str, DIACRITICS))

In [33]:
# Save the distinct label names for the character-level data, one per line.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# line order is the same on every run (a plain set does not guarantee this).
with open("./Dataset/characters/unique_labels.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(dict.fromkeys(DIACRITICS_LIST_OF_STRINGS)))

In [None]:
# Save the distinct label names for the word-level data, one per line.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# line order is the same on every run (a plain set does not guarantee this).
with open("./Dataset/unique_labels.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(dict.fromkeys(DIACRITICS_LIST_OF_STRINGS)))

In [29]:
# Sanity check: collect every label that occurs in the training labels but is
# missing from DIACRITICS_LIST_OF_STRINGS, and append those to the label file.
# NOTE(review): this scans the word-level t_labels while writing to the
# characters/ label file - confirm that t_labels_chars was not intended.
unique_labels = set()
for line in t_labels:
    for label in line.split():
        if label not in DIACRITICS_LIST_OF_STRINGS:
            unique_labels.add(label)
with open("./Dataset/characters/unique_labels.txt", "a", encoding="utf-8") as f:
    if unique_labels:
        # The label file is written without a trailing newline, so start on a
        # fresh line when it already has content; otherwise the first appended
        # label would be glued onto the last existing one.
        if f.tell() > 0:
            f.write("\n")
        # Sorted for a run-independent order.
        f.write("\n".join(sorted(unique_labels)))

In [47]:
# Scratch check of list behaviour: the first pop() removes the last item,
# append(4) adds a new one, and the final pop() returns that new item (4).
l = [1,2,3]
l.pop()
l.append(4)
l.pop()

4