In [1]:
import re
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import random as rnd

In [None]:

# from google.colab import drive
# drive.mount('/content/drive')
# %cd ./drive/MyDrive/Colab\ Notebooks/NLP_Project/

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1zPjf1cHfdKqObemkPReffGbQHU_wotr2/NLP_Project


In [None]:
import pickle

# Location of the pickled collection of Arabic letters
pickle_file_path = './Pickle files/arabic_letters.pickle'

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(pickle_file_path, 'rb') as file:
    # NOTE(review): if the pickled object is a set, the order produced by list()
    # may differ between runs, which would change the vocabulary order - confirm.
    data = np.array(list(pickle.load(file)))

# Append the two special tokens after the letters, in this order
for special_token in ('UNK', '<pad>'):
    data = np.append(data, special_token)
arabic_letters = data

# Show the final vocabulary and its size
print(data)
print(len(data))

['ي' 'ن' 'ز' 'ض' 'ظ' 'س' 'ق' 'ء' 'ئ' 'خ' 'ح' 'ر' 'ت' 'ف' 'و' 'غ' 'ل' 'ه'
 'ب' 'د' 'إ' 'م' 'ج' 'ش' 'ع' 'ا' 'ك' 'ؤ' 'ص' 'ى' 'ذ' 'ث' 'آ' 'ط' 'أ' 'ة'
 'UNK' '<pad>']
38


In [2]:
def read_data(path: str) -> str:
    """Return the whole content of the UTF-8 text file at `path` as one string."""
    with open(path, mode="r", encoding="utf-8") as text_file:
        contents = text_file.read()
    return contents

In [3]:
# filter_data takes one text string and removes unwanted patterns from it
def filter_data(data: str) -> str:
    """Strip numbers, punctuation and English letters from `data`.

    Runs of whitespace other than newlines are collapsed to a single space;
    newlines themselves are left in place.
    """
    substitutions = (
        # (pattern, replacement, flags), applied in this order
        (r"\d+", "", 0),  # all numbers
        (r"[][//,;\?؟()$:\-{}_*؛:«»`–\"~!']", "", 0),  # special characters
        (r"[a-zA-Z]", "", 0),  # English letters
        (r"([^\S\n])+", " ", re.I),  # multiple spaces -> single space
    )
    cleaned = data
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned

In [4]:
def split_data_to_words(data: str) -> list:
    """Split `data` on runs of whitespace and return the resulting pieces.

    Leading or trailing whitespace yields an empty string at that end of the list.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.split(data)

In [5]:
# Single diacritic marks, given as Unicode escapes
KASRA, DAMMA, FATHA = "\u0650", "\u064F", "\u064E"
KASRATAN, DAMMATAN, FATHATAN = "\u064D", "\u064C", "\u064B"
SUKUN, SHADDA = "\u0652", "\u0651"

# Shadda paired with another mark, in both possible character orders
DAMMA_SHADDA, SHADDA_DAMMA = DAMMA + SHADDA, SHADDA + DAMMA
FATHA_SHADDA, SHADDA_FATHA = FATHA + SHADDA, SHADDA + FATHA
KASRA_SHADDA, SHADDA_KASRA = KASRA + SHADDA, SHADDA + KASRA
DAMMATAN_SHADDA, SHADDA_DAMMATAN = DAMMATAN + SHADDA, SHADDA + DAMMATAN
FATHATAN_SHADDA, SHADDA_FATHATAN = FATHATAN + SHADDA, SHADDA + FATHATAN
KASRATAN_SHADDA, SHADDA_KASRATAN = KASRATAN + SHADDA, SHADDA + KASRATAN

# Placeholder label for "no diacritic" and the padding label
EMPTY = "_"
PAD_LABLE = "pad"

# Every known mark/combination, followed by the two special labels
DIACRITICS = [
    KASRA, DAMMA, FATHA,
    KASRATAN, DAMMATAN, FATHATAN,
    SUKUN, SHADDA,
    DAMMA_SHADDA, SHADDA_DAMMA,
    FATHA_SHADDA, SHADDA_FATHA,
    KASRA_SHADDA, SHADDA_KASRA,
    DAMMATAN_SHADDA, SHADDA_DAMMATAN,
    FATHATAN_SHADDA, SHADDA_FATHATAN,
    KASRATAN_SHADDA, SHADDA_KASRATAN,
    EMPTY, PAD_LABLE,
]
ARABIC_ALPHABIT = "اأآإئءبتةثجحخدذرزسشصضطظعغفقكلمنهوؤيى"

In [6]:
# Translate a diacritic (or a shadda combination in either order) into its label string
def diacritic_to_str(diacritic):
    """Return the label name for `diacritic`.

    Both character orders of a shadda combination map to the same
    "SHADDA_<mark>" label. The string "pad" maps to itself, and anything
    not recognised maps to "_".
    """
    # Single marks
    if diacritic == SHADDA:
        return "SHADDA"
    if diacritic == KASRA:
        return "KASRA"
    if diacritic == DAMMA:
        return "DAMMA"
    if diacritic == FATHA:
        return "FATHA"
    if diacritic == KASRATAN:
        return "KASRATAN"
    if diacritic == DAMMATAN:
        return "DAMMATAN"
    if diacritic == FATHATAN:
        return "FATHATAN"
    if diacritic == SUKUN:
        return "SUKUN"
    # Shadda combinations, accepted in either order
    if diacritic in (DAMMA_SHADDA, SHADDA_DAMMA):
        return "SHADDA_DAMMA"
    if diacritic in (FATHA_SHADDA, SHADDA_FATHA):
        return "SHADDA_FATHA"
    if diacritic in (KASRA_SHADDA, SHADDA_KASRA):
        return "SHADDA_KASRA"
    if diacritic in (DAMMATAN_SHADDA, SHADDA_DAMMATAN):
        return "SHADDA_DAMMATAN"
    if diacritic in (FATHATAN_SHADDA, SHADDA_FATHATAN):
        return "SHADDA_FATHATAN"
    if diacritic in (KASRATAN_SHADDA, SHADDA_KASRATAN):
        return "SHADDA_KASRATAN"
    if diacritic == "pad":
        return "pad"
    # EMPTY: nothing matched
    return "_"

In [7]:
# Read train.txt, clean it, split it into segments at '.' and '،',
# and keep only the non-empty segments
training_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/train.txt")))
    if segment
]

In [8]:
# Read val.txt, clean it, split it into segments at '.' and '،',
# and keep only the non-empty segments
validation_set = [
    segment
    for segment in re.split(r"[.،]", filter_data(read_data("./Dataset/val.txt")))
    if segment
]

In [10]:
# Read the test file, clean it, split it into segments at '.' and '،',
# and keep only the non-empty segments.
# Alternative inputs used previously:
# test_set = read_data("./Dataset/test.txt")
# test_set = read_data("./Dataset/test2.txt")
test_set = [
    segment
    for segment in re.split(
        r"[.،]", filter_data(read_data("./Dataset/test_no_diacritics_3.txt"))
    )
    if segment
]

In [11]:
# Number of non-empty training segments obtained by splitting on '.' and '،'
len(training_set)

116499

In [12]:
def get_data_words(data_set):
    """Build word-level sentences and final-diacritic labels from diacritized lines.

    For every whitespace-separated word the label is the diacritic at the end
    of the word: the last two characters combined when both are diacritics,
    otherwise `diacritic_to_str` of the last character (which yields "_" when
    the word ends with a plain letter). The word itself is emitted with all
    diacritic characters removed.

    Args:
        data_set: sequence of text lines (strings).

    Returns:
        (sentences, original_labels): two parallel lists of space-joined
        strings. Entry i of each list describes the same input line, and both
        have the same number of tokens. Lines that contribute no word are
        omitted from both lists.
    """
    # Hoisted once: membership is tested for every character of every word.
    diacritic_marks = frozenset(DIACRITICS)
    sentences = []
    original_labels = []
    for line in data_set:
        line_without_diacritics = []
        line_labels = []
        # str.split() with no argument never yields empty strings.
        for word in line.split():
            ends_with_two_marks = (
                len(word) >= 2
                and word[-2] in diacritic_marks
                and word[-1] in diacritic_marks
            )
            if ends_with_two_marks:
                label = diacritic_to_str(word[-2:])
            else:
                label = diacritic_to_str(word[-1])
            word_without_diacritics = "".join(
                char for char in word if char not in diacritic_marks
            )
            # A token made only of diacritics has no base word; emitting its
            # label would shift every following label off its word.
            if not word_without_diacritics:
                continue
            line_without_diacritics.append(word_without_diacritics)
            line_labels.append(label)
        # Drop empty lines from both lists together so they stay index-aligned.
        if line_without_diacritics:
            sentences.append(" ".join(line_without_diacritics))
            original_labels.append(" ".join(line_labels))
    return sentences, original_labels

In [13]:
def get_data_chars(data_set):
    """Build character-level sentences and per-character diacritic labels.

    Every non-diacritic character becomes one token whose initial label is
    `diacritic_to_str` of the character itself. A following diacritic, or two
    consecutive diacritics taken as one combination, replaces the label of
    the most recently emitted character.

    Args:
        data_set: sequence of text lines (strings).

    Returns:
        (sentences, original_labels): two parallel lists of space-joined
        strings with one label per character. Lines that contain no
        non-diacritic character are omitted from both lists.
    """
    # Hoisted once: membership is tested for every character.
    diacritic_marks = frozenset(DIACRITICS)
    sentences = []
    original_labels = []
    for line in data_set:
        line_without_diacritics = []
        line_labels = []
        for word in line.split():
            j = 0
            while j < len(word):
                char = word[j]
                if char not in diacritic_marks:
                    line_labels.append(diacritic_to_str(char))  # label of the character
                    line_without_diacritics.append(char)
                    j += 1
                    continue
                if j + 1 < len(word) and word[j + 1] in diacritic_marks:
                    # Two consecutive marks form one combined label.
                    label = diacritic_to_str(char + word[j + 1])
                    j += 2
                else:
                    label = diacritic_to_str(char)
                    j += 1
                if line_labels:
                    line_labels[-1] = label
                # else: the line starts with a diacritic that has no character
                # to attach to. Previously this raised IndexError (pop from an
                # empty list); the orphan mark is now ignored.
                # NOTE(review): a diacritic at the start of a later word still
                # overwrites the label of the previous word's last character,
                # as before - confirm whether that is intended.
        if line_without_diacritics:
            sentences.append(" ".join(line_without_diacritics))
            original_labels.append(" ".join(line_labels))
    return sentences, original_labels

In [14]:
# Word-level view of the training set: undiacritized sentences and one label per word
t_sentences, t_labels = get_data_words(training_set)

In [15]:
# Character-level view of the training set: characters and one label per character
t_chars, t_labels_chars = get_data_chars(training_set)

In [16]:
# Word-level view of the validation set: undiacritized sentences and one label per word
v_sentences, v_labels = get_data_words(validation_set)

In [17]:
# Character-level view of the validation set: characters and one label per character
v_chars, v_labels_chars = get_data_chars(validation_set)

In [18]:
# Word-level view of the test set: undiacritized sentences and one label per word
test_sentences, test_labels = get_data_words(test_set)

In [19]:
# Character-level view of the test set: characters and one label per character
test_chars, test_labels_chars = get_data_chars(test_set)

In [20]:
# Sanity check: the test set has as many label lines as character lines
len(test_chars) == len(test_labels_chars)

True

In [None]:
# Report any test entry that is blank (a single space, an empty string or a newline)
for entry in test_chars:
    if entry in (" ", "", "\n"):
        print("empty")

In [None]:
# Total count of whitespace-separated character tokens over all test lines
number_of_chars = sum(len(entry.split()) for entry in test_chars)


In [None]:
# Display the total number of character tokens counted in test_chars
number_of_chars

32033

In [None]:
"\n".join(test_chars).split("\n") == test_chars

True

In [21]:
# Write the character-level training sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/new_new_characters/t_chars.txt", t_chars),
    ("./Dataset/new_new_characters/t_labels.txt", t_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [22]:
# Write the character-level validation sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/new_new_characters/v_chars.txt", v_chars),
    ("./Dataset/new_new_characters/v_labels.txt", v_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [23]:
# Write the character-level test sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/new_new_characters/test_chars_3.txt", test_chars),
    ("./Dataset/new_new_characters/test_labels_3.txt", test_labels_chars),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [None]:
# Save the character vocabulary, one entry per line. `arabic_letters` already
# ends with the 'UNK' and '<pad>' tokens, so they are not written separately.
# NOTE(review): this writes under "new_characters" while the other character-level
# outputs in this notebook go to "new_new_characters" - confirm the directory.
unique_chars = arabic_letters
# Earlier approach: collect the characters seen in the training and validation data
# unique_chars = []
# for line in t_chars + v_chars:
#     for c in line.split():
#         if c not in unique_chars:
#           unique_chars.append(c)
print(len(unique_chars))
with open("./Dataset/new_characters/unique_chars.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(unique_chars))
    # f.write("\n")
    # f.write("UNK")
    # f.write("\n")
    # f.write("<pad>")

38


In [None]:
# Location of the pickled diacritic -> id mapping
pickle_file_path = './Pickle files/diacritic2id.pickle'

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open(pickle_file_path, 'rb') as file:
    data = pickle.load(file)

# Show the loaded object, then its size on the next line
print(data, len(data), sep="\n")

{'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14}
15


In [None]:
# Label names for the keys of the loaded mapping, in key order, plus the padding label
DIACRITICS_LIST_OF_STRINGS = list(map(diacritic_to_str, data.keys())) + ["pad"]
DIACRITICS_LIST_OF_STRINGS

['FATHA',
 'FATHATAN',
 'DAMMA',
 'DAMMATAN',
 'KASRA',
 'KASRATAN',
 'SUKUN',
 'SHADDA',
 'SHADDA_FATHA',
 'SHADDA_FATHATAN',
 'SHADDA_DAMMA',
 'SHADDA_DAMMATAN',
 'SHADDA_KASRA',
 'SHADDA_KASRATAN',
 '_',
 'pad']

In [None]:
# Number of label strings, including the extra "pad" entry
len(DIACRITICS_LIST_OF_STRINGS)

16

In [None]:
# Write the label vocabulary (DIACRITICS_LIST_OF_STRINGS), one label per line
with open("./Dataset/new_new_characters/unique_labels.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(DIACRITICS_LIST_OF_STRINGS))

In [None]:
# Print each label name next to the id stored for its diacritic
for mark, mark_id in data.items():
    print(f"{diacritic_to_str(mark)} : {mark_id}")

FATHA : 0
FATHATAN : 1
DAMMA : 2
DAMMATAN : 3
KASRA : 4
KASRATAN : 5
SUKUN : 6
SHADDA : 7
SHADDA_FATHA : 8
SHADDA_FATHATAN : 9
SHADDA_DAMMA : 10
SHADDA_DAMMATAN : 11
SHADDA_KASRA : 12
SHADDA_KASRATAN : 13
_ : 14


In [None]:
# Write the word-level training sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/t_sentences.txt", t_sentences),
    ("./Dataset/t_labels.txt", t_labels),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [None]:
# Write the word-level validation sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/v_sentences.txt", v_sentences),
    ("./Dataset/v_labels.txt", v_labels),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [None]:
# Write the word-level test sentences and their labels, one entry per line
for out_path, out_lines in (
    ("./Dataset/test_sentences.txt", test_sentences),
    ("./Dataset/test_labels.txt", test_labels),
):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

In [None]:
# Collect the training labels that are NOT in DIACRITICS_LIST_OF_STRINGS and
# append them to the labels file,
# in order to find out whether any label is missing from the known list.
# NOTE(review): despite its name, `unique_labels` is not de-duplicated - every
# occurrence of an unknown label is appended.
# NOTE(review): the file is opened in append mode and lives under "characters",
# whereas the label vocabulary above is written under "new_new_characters" -
# confirm both are intended.
unique_labels = []
for line in t_labels:
    for label in line.split():
        if label not in DIACRITICS_LIST_OF_STRINGS:
            unique_labels.append(label)
with open("./Dataset/characters/unique_labels.txt", "a", encoding="utf-8") as f:
    f.write("\n".join(unique_labels))

In [None]:
# Save the unique words of the training and validation sentences in a text file.
# dict.fromkeys de-duplicates in a single pass and keeps first-occurrence order,
# so the result matches the earlier `word not in unique_words` list scan, which
# was quadratic in the vocabulary size.
unique_words = list(
    dict.fromkeys(
        word for line in t_sentences + v_sentences for word in line.split()
    )
)

with open("./Dataset/unique_words.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(unique_words))
    # The special tokens go last, each on its own line
    f.write("\n")
    f.write("UNK")
    f.write("\n")
    f.write("<pad>")