# 1- Setup Project

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/ATD-WSD

# Create dir to for storing trained model
#!mkdir Baseline-w2v

/content/drive/MyDrive/ATD-WSD


In [3]:
!pip install numpy==1.23.1
!pip install tensorflow==2.14.0

Collecting numpy==1.23.1
  Using cached numpy-1.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Using cached numpy-1.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albucore 0.0.19 requires numpy>=1.24.4, but you have numpy 1.23.1 which is incompatible.
albumentations 1.4.20 requires numpy>=1.24.4, but you have numpy 1.23.1 which is incompatible.
bigframes 1.27.0 requires numpy>=1.24.0, but you have numpy 1.23.1 which is incompatible.
chex 0.1.87 requires numpy>=1.24.1, but you have numpy 1.23.1 which is incompatible.
ibis-framework 9.2.0 requires numpy<3,>=1.23.2, but

Collecting numpy>=1.23.5 (from tensorflow==2.14.0)
  Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, arg

In [1]:
import re
import pickle
import json
import numpy as np
import time
import random
import joblib

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from tensorflow.keras.layers import Input
from keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, InputLayer
from tensorflow.keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import Sequence
from keras.initializers import glorot_normal
from keras.callbacks import ModelCheckpoint
from rich import print_json

In [3]:
if tf.config.list_physical_devices('GPU'):
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("GPU device not found: working on CPU")

GPU device not found: working on CPU


# 2- Importing Dataset
The dataset that was prepared using Gemini which is 20% of the train, 100% val and testing taken from [original dataset](https://github.com/AliOsm/arabic-text-diacritization/tree/master/dataset)
```
[{
    "sentence": "some text in arabic",
    "words": [
      {
        "word": "word_1",
        "word_sense": "definition_1"
        "pos" : "part_of_speech_1"
      }
    ]
}]
```

***⚠️This baseline model will use the sentences without the sense***

In [179]:
# Helpers
def read_json(file_path):
  with open(file_path, mode="r", encoding="utf-8") as json_data:
    return json.load(json_data)

def get_sentences(data):
  sentences = []
  for s in data:
    sentences.append(s['sentence'])
  return sentences

def pprint(json_data):
  print_json(data=json_data, highlight=False)

In [181]:
train_data = read_json("/content/10485_train_wsd.json")
val_data = read_json("/content/2517_val_wsd.json")
train_data = train_data[:-5485]
# val_data = val_data[:-2100]
print('Training data length:', len(train_data))
print("Train Sample")
pprint(train_data[100])

print('Validation data length:', len(val_data))
print("Val Sample")
pprint(val_data[1])

Training data length: 5000
Train Sample


Validation data length: 2517
Val Sample


# 3- Constants

In [182]:
# Helpers
def CHAR_IDX(LIST):
    char2idx = {}
    idx2char = {}

    for i, char in enumerate(LIST):
        char2idx[char] = i
        idx2char[i] = char

    return char2idx, idx2char

In [183]:
ARABIC_CHAR = "ىعظحرسيشضقثلصطكآماإهزءأفؤغجئدةخوبذتن"
NUMBERS = "0123456789٠١٢٣٤٥٦٧٨٩"

# 15 possible Diacritics
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'

DIACRITICS = [
    "",              # No Diacritic
    FATHA,           # Fatha
    FATHATAN,        # Fathatah
    DAMMA,           # Damma
    DAMMATAN,        # Dammatan
    KASRA,           # Kasra
    KASRATAN,        # Kasratan
    SUKUN,           # Sukun
    SHADDA,          # Shadda
    SHADDA+FATHA,    # Shadda + Fatha
    SHADDA+FATHATAN, # Shadda + Fathatah
    SHADDA+DAMMA,    # Shadda + Damma
    SHADDA+DAMMATAN, # Shadda + Dammatan
    SHADDA+KASRA,    # Shadda + Kasra
    SHADDA+KASRATAN  # Shadda + Kasratan
]

PUNCTUATIONS = [
    ".",    "،",    ":",    "؛",
    "-",    "–",    "«",    "»",
    "~",    "؟",    "!",    "*",
    "(",    ")",    "[",    "]",
    "{",    "}",    ";",    "\n",
    "'",    "\"",   "`",    "/",
    ",",    "?",    '’',    '“',
    '…',    '﴾',    '﴿',    "+",
    "*",    "=",    "&",    "_",
    "\n",   "\u200d",       "\u200f"
]


# Special Tokens
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
SOSS_TOKEN = "<sense>"
EOSS_TOKEN = "</sense>"
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
SPECIAL_TOKENS = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, SOSS_TOKEN, EOSS_TOKEN]

# Combine
ARABIC_CHAR_SPACE = list(ARABIC_CHAR) + [' ']
ARABIC_CHAR_VALID = ARABIC_CHAR_SPACE + DIACRITICS
ALLCHARS = ARABIC_CHAR_SPACE + list(NUMBERS) + PUNCTUATIONS + SPECIAL_TOKENS

In [184]:
char_mapping, reverse_char_mapping = CHAR_IDX(ALLCHARS)
class_mapping, reverse_class_mapping = CHAR_IDX(DIACRITICS)

print("Char Mapping Size:", len(char_mapping))
print("Class Mapping Size:", len(class_mapping))

Char Mapping Size: 99
Class Mapping Size: 15


# 4- Helper Functions

In [188]:
def remove_diacritics_line(data):
    return data.translate(str.maketrans('', '', ''.join(DIACRITICS)))

def get_max_size(data):
  return max(len(remove_diacritics_line(item['sentence'].strip())) for item in data)

def get_min_size(data):
  return min(len(remove_diacritics_line(item['sentence'].strip())) for item in data)

def one_hot_matrix(data, size):
    one_hot_matrix = [[1 if j == i else 0 for j in range(size)] for i in data]
    return one_hot_matrix

def one_hot_vector(index , size):
    one_hot_vector = [1 if j == index else 0 for j in range(size)]
    return one_hot_vector

def get_words(l, line):
  last_word = ""
  first_word = ""
  list_of_words = []

  for i, w in enumerate(l.split()):
    if i == 0:
      first_word = w
    if w not in PUNCTUATIONS:
      last_word = w
  found_first = False
  if "words" not in line:
    return {"sentence": l, "words": []}
  for j, w in enumerate(line['words']):
    if "word" not in w:
      continue
    if w['word'] == first_word:
      found_first = True

    if found_first:
      list_of_words.append(w)
      if w['word'] == last_word:
        dic = {"sentence": l, "words": list_of_words}
        return dic
  dic = {"sentence": l, "words": list_of_words}
  return dic

def split_at(line, at='\n'):
  new_data = []
  for l in line['sentence'].split(at):
    new_data.append(get_words(l, line))
  return new_data

def punc_split(data):
  new_data = []
  for line in data:
    line['sentence'] = line['sentence'].replace('.', '.\n')
    line['sentence'] = line['sentence'].replace(',', ',\n')
    line['sentence'] = line['sentence'].replace('،', '،\n')
    line['sentence'] = line['sentence'].replace(':', ':\n')
    line['sentence'] = line['sentence'].replace(';', ';\n')
    line['sentence'] = line['sentence'].replace('؛', '؛\n')
    line['sentence'] = line['sentence'].replace('(', '\n\(')
    line['sentence'] = line['sentence'].replace(')', '\)\n')
    line['sentence'] = line['sentence'].replace('[', '\n[')
    line['sentence'] = line['sentence'].replace(']', ']\n')
    line['sentence'] = line['sentence'].replace('{', '\n{')
    line['sentence'] = line['sentence'].replace('}', '}\n')
    line['sentence'] = line['sentence'].replace('«', '\n«')
    line['sentence'] = line['sentence'].replace('»', '»\n')
    line['sentence'] = line['sentence'].replace('؟', '؟\n')
    line['sentence'] = line['sentence'].replace('?', '?\n')
    line['sentence'] = line['sentence'].replace('!', '!\n')
    line['sentence'] = line['sentence'].replace('-', '-\n')
    for l in line['sentence'].split('\n'):
      a = get_words(l, line)
      if a and len(a) > 0:
        new_data.append(a)
  return new_data

def split_on_length(sentence_data, max_len=200):
    split_sentences = []

    for sentence in punc_split(sentence_data):
       new_sentence = remove_diacritics_line(sentence['sentence'].strip())

       if len(new_sentence) != 0:
          if len(new_sentence) > 0 and len(new_sentence) <= max_len:
                  #sentence['sentence'] = sentence['sentence'].strip()
                  #split_sentences.append(sentence)
                  dic = {"sentence": sentence['sentence'].strip(), "words": sentence['words']}
                  split_sentences.append(dic)
          else:
            sentence_words = sentence['sentence'].split()
            temp_sentence = ''

            for word in sentence_words:
              if len(remove_diacritics_line(temp_sentence).strip()) + len(remove_diacritics_line(word).strip()) + 1 > max_len:
                  if len(remove_diacritics_line(temp_sentence).strip()) > 0:
                      a = re.sub(temp_sentence, f'{temp_sentence}\\n', sentence['sentence'])
                      dic = {"sentence": a.strip(), "words": sentence['words']}
                      n = split_at(dic)
                      for i in n:
                        if len(i['sentence']) <= max_len:
                          i['sentence'] = i['sentence'].strip()
                          split_sentences.append(i)
                  temp_sentence = word
              else:
                  temp_sentence = word if temp_sentence == '' else temp_sentence + ' ' + word

            if len(remove_diacritics_line(temp_sentence).strip()) > 0:
                  a = re.sub(temp_sentence, f'{temp_sentence}\\n', sentence['sentence'])
                  dic = {"sentence": a.strip(), "words": sentence['words']}

                  #sentence['sentence'] = re.sub(temp_sentence, f'{temp_sentence}\\n', sentence['sentence'])
                  n = split_at(dic)
                  for i in n:
                    if len(i['sentence']) <= max_len:
                      i['sentence'] = i['sentence'].strip()
                      split_sentences.append(i)

    return split_sentences
def get_all_senses(docs):
    sense = []
    for doc in docs:
      for word in doc['words']:
        if 'sense' in word:
          sense.append(word['sense'])
    return sense

def train_word_embeddings(docs):
    doc = get_sentences(docs)
    words = get_all_senses(docs)
    doc.extend(words)
    tokenizer = Tokenizer(oov_token='<OOV>')
    tokenizer.fit_on_texts(doc)

    sentences = [dc.split() for dc in doc ]
    sentences.append([UNK_TOKEN])
    word2vec_model = Word2Vec(sentences, vector_size = 100, window=5, min_count=1, workers=4)

    word_embeddings = word2vec_model.wv

    return word_embeddings, tokenizer

def get_word_embeddings(word):
    encoded_docs = tokenizer.texts_to_sequences(word)
    word_embeddings_for_sample = []
    for word_index in encoded_docs:
      if len(word_index) > 0:
        if word_index[0] in data_embeddings:
          word_embeddings_for_sample.append(data_embeddings[word_index[0]])
    return word_embeddings_for_sample

# 5- Prepare Data

In [189]:
split_length_train_data      = split_on_length(train_data)
split_length_val_data      = split_on_length(val_data)

print("Train Data Size:", len(split_length_train_data))
print('Training data max:', get_max_size(split_length_train_data))
print('Training data min:', get_min_size(split_length_train_data))
print("Train Sample:", split_length_train_data[0:1])
print()

print("Val Data Size:", len(split_length_val_data))
print('Validation data max:', get_max_size(split_length_val_data))
print('Validation data min:', get_min_size(split_length_val_data))
# print("Val Sample:", split_length_val_data[0:2])

Train Data Size: 17103
Training data max: 200
Training data min: 1
Train Sample: [{'sentence': 'وَلَوْ جَمَعَ ثُمَّ عَلِمَ تَرْكَ رُكْنٍ مِنْ الْأُولَى بَطَلَتَا وَيُعِيدُهُمَا جَامِعًا ،', 'words': [{'pos': 'conjunction', 'sense': 'even if', 'word': 'وَلَوْ'}, {'pos': 'verb', 'sense': 'to gather', 'word': 'جَمَعَ'}, {'pos': 'adverb', 'sense': 'then', 'word': 'ثُمَّ'}, {'pos': 'verb', 'sense': 'to know', 'word': 'عَلِمَ'}, {'pos': 'noun', 'sense': 'leaving', 'word': 'تَرْكَ'}, {'pos': 'noun', 'sense': 'pillar', 'word': 'رُكْنٍ'}, {'pos': 'preposition', 'sense': 'from', 'word': 'مِنْ'}, {'pos': 'adjective', 'sense': 'first', 'word': 'الْأُولَى'}, {'pos': 'verb', 'sense': 'to become invalid', 'word': 'بَطَلَتَا'}, {'pos': 'verb', 'sense': 'to return', 'word': 'وَيُعِيدُهُمَا'}, {'pos': 'adjective', 'sense': 'gathering', 'word': 'جَامِعًا'}]}]

Val Data Size: 15413
Validation data max: 200
Validation data min: 1


#### Data without diacritics and without punc and numbers

In [190]:
def clean_data(data, remove_dia=False):
  cleaned_data = []
  for text in data:
    temp = {'sentence': "", 'words':[]}
    sen = text['sentence']
    cleaned = ''.join(char for char in sen if char in ARABIC_CHAR_VALID)
    cleaned = cleaned.strip()
    if cleaned != "":
      if remove_dia:
        cleaned = remove_diacritics_line(cleaned)

      temp['sentence'] = cleaned
      temp['words'] = text['words']
      cleaned_data.append(temp)
  return cleaned_data


clean_diac_train_data = clean_data(split_length_train_data)
clean_diac_val_data = clean_data(split_length_val_data)

print('Training data length:', len(clean_diac_train_data))
print('Validation data length:', len(clean_diac_val_data))

pprint(clean_diac_train_data[0:2])

Training data length: 12862
Validation data length: 13403


#### Data with diacritics and without punc and numbers

In [191]:
# clean_train_data = [remove_diacritics_line(text) for text in clean_diac_train_data]
# clean_val_data = [remove_diacritics_line(text) for text in clean_diac_val_data]

clean_train_data = clean_data(split_length_train_data, remove_dia=True)
clean_val_data = clean_data(split_length_val_data, remove_dia=True)

print('Training data length:', len(clean_train_data))
print('Validation data length:', len(clean_val_data))

pprint(clean_train_data[0:2])

Training data length: 12862
Validation data length: 13403


#### Train word embedding

In [193]:
data_to_embeddings = clean_train_data + clean_val_data
data_embeddings, tokenizer = train_word_embeddings(data_to_embeddings)

# 6- Custom Data Generator

In [247]:
def get_sentence_classes(sentence):
  x = []
  y = []

  unk_emb = get_word_embeddings([UNK_TOKEN])[0]

  vec = []
  vec = one_hot_vector(char_mapping[SOS_TOKEN],len(char_mapping))
  vec.extend(unk_emb)
  x.append(vec)

  y.append(one_hot_vector(class_mapping[''],len(class_mapping)))
  split_sentence = [i for j in sentence['sentence'].split() for i in (j, ' ')][:-1]

  for word in split_sentence:
    emb2 = get_word_embeddings(remove_diacritics_line(word))

    if (len(emb2) == 0):
        emb = unk_emb
    else:
        emb = emb2[0]

    if word in PUNCTUATIONS:
      emb = unk_emb
    else:
      if (len(emb2) == 0):
        emb = unk_emb
      else:
        emb = emb2[0]

    for index, char in enumerate(word):
      if char not in DIACRITICS:
        vec = []
        vec = one_hot_vector(char_mapping[char],len(char_mapping))
        vec.extend(emb)
        x.append(vec)
        char_diacritic = ''
        sentence_len = len(sentence['sentence'])

        if index + 1 < sentence_len:
          if sentence['sentence'][index + 1] in DIACRITICS:
            char_diacritic = sentence['sentence'][index + 1]

            if index + 2 < sentence_len:
               char_diacritic = char_diacritic + sentence['sentence'][index + 2] if sentence['sentence'][index + 2] in DIACRITICS and (char_diacritic + sentence['sentence'][index + 2] in class_mapping) else sentence['sentence'][index + 2] + char_diacritic if sentence['sentence'][index + 2] in DIACRITICS and (sentence['sentence'][index + 2] + char_diacritic in class_mapping) else char_diacritic
        y.append(one_hot_vector(class_mapping[char_diacritic],len(class_mapping)))

  vec = []
  vec = one_hot_vector(char_mapping[EOS_TOKEN],len(char_mapping))
  vec.extend(unk_emb)
  x.append(vec)
  y.append(one_hot_vector(class_mapping[""],len(class_mapping)))

  emp_s_emb = get_word_embeddings([UNK_TOKEN])[0]
  vec = []
  vec = one_hot_vector(char_mapping[SOSS_TOKEN],len(char_mapping))
  vec.extend(emp_s_emb)
  x.append(vec)
  y.append(one_hot_vector(class_mapping[""],len(class_mapping)))

  for word in split_sentence:
    sens_emb2 = get_word_embeddings([UNK_TOKEN])
    for s in sentence['words']:
      if 'sense' in s:
        if 'word' in s and s['word'] == word:
          sens_emb2 = get_word_embeddings(s['sense'])

      if (len(sens_emb2) == 0):
          sens_emb = unk_emb
      else:
          sens_emb = sens_emb2[0]

      if word in PUNCTUATIONS:
        sens_emb = unk_emb
      else:
        if (len(sens_emb2) == 0):
          sens_emb = unk_emb
        else:
          sens_emb = sens_emb2[0]

      vec = []
      vec = one_hot_vector(char_mapping[PAD_TOKEN], len(char_mapping))
      vec.extend(sens_emb)
      x.append(vec)
      y.append(one_hot_vector(class_mapping[""],len(class_mapping)))

  vec = []
  vec = one_hot_vector(char_mapping[EOSS_TOKEN],len(char_mapping))
  vec.extend(emp_s_emb)
  x.append(vec)
  y.append(one_hot_vector(class_mapping[''],len(class_mapping)))

  assert(len(x) == len(y))

  return x, y

In [195]:
def get_classes(data):
  X = []
  Y = []

  for sentence in data:
    x, y = get_sentence_classes(sentence)
    X.append(x)
    Y.append(y)

  X = np.asarray(X)
  Y = np.asarray(Y)

  return X, Y

class custom_data_generator(Sequence):

    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, index):
        start_index = index * self.batch_size
        end_index = (index + 1) * self.batch_size

        batch = self.data[start_index : end_index]
        X_batch, Y_batch = get_classes(batch)

        max_length_X = np.max([len(x) for x in X_batch])
        max_length_Y = np.max([len(y) for y in Y_batch])

        assert(max_length_X == max_length_Y)

        vec = []
        vec = one_hot_vector(char_mapping[PAD_TOKEN],len(char_mapping))
        vec.extend(get_word_embeddings([PAD_TOKEN])[0])

        X = []
        for x in X_batch:
          padding_length = max_length_X - len(x)
          x = list(x)
          x.extend([vec] * (padding_length))
          X.append(np.asarray(x))

        Y = []
        for y in Y_batch:
          padding_length = max_length_Y - len(y)
          y = list(y)

          y.extend(one_hot_matrix([class_mapping['']] * (padding_length), len(class_mapping)))
          Y.append(np.asarray(y))

        X, Y = np.asarray(X), np.asarray(Y)
        return X, Y

In [198]:
def build_model():
   model = Sequential()
   model.add(InputLayer(input_shape=(None, 199)))

   model.add(Bidirectional(LSTM(units=64,return_sequences=True,kernel_initializer=glorot_normal(seed=500))))
   model.add(Dropout(0.5))
   model.add(Bidirectional(LSTM(units=64,return_sequences=True,kernel_initializer=glorot_normal(seed=500))))
   model.add(Dropout(0.5))
   model.add(Bidirectional(LSTM(units=128,return_sequences=True,kernel_initializer=glorot_normal(seed=500))))
   #model.add(TimeDistributed(Dense(units=128,activation='relu',kernel_initializer=glorot_normal(seed=500))))
   model.add(TimeDistributed(Dense(units=128,activation='relu',kernel_initializer=glorot_normal(seed=500))))
   model.add(TimeDistributed(Dense(units=len(class_mapping),activation='softmax',kernel_initializer=glorot_normal(seed=500))))
   model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
   return model

In [199]:
model = build_model()
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_30 (Bidirect  (None, None, 128)         135168    
 ional)                                                          
                                                                 
 dropout_20 (Dropout)        (None, None, 128)         0         
                                                                 
 bidirectional_31 (Bidirect  (None, None, 128)         98816     
 ional)                                                          
                                                                 
 dropout_21 (Dropout)        (None, None, 128)         0         
                                                                 
 bidirectional_32 (Bidirect  (None, None, 256)         263168    
 ional)                                                          
                                                     

In [200]:
def fit_model(model, epochs, batch_size, train_data, val_data):
    random.shuffle(train_data)
    random.shuffle(val_data)

    checkpoint_path = '/content/drive/MyDrive/ATD-WSD/Bilstm-wsd/epoch{epoch:02d}.ckpt'

    checkpoint_cb = ModelCheckpoint(checkpoint_path, verbose=0)

    training_generator = custom_data_generator(train_data, batch_size)
    val_generator = custom_data_generator(val_data, batch_size)

    history =  model.fit(training_generator,validation_data=val_generator,epochs=epochs,callbacks=[checkpoint_cb])
    return history

In [201]:
history = fit_model(model, 1, 32, clean_diac_train_data, clean_diac_val_data)

  X = np.asarray(X)
  Y = np.asarray(Y)




# 7- Checkpoint

In [202]:
%cd /content/drive/MyDrive/ATD-WSD
joblib.dump(model, 'bilstmWSD.joblib')
filename = 'bilstmWSD.sav'
pickle.dump(model, open(filename, 'wb'))

/content/drive/MyDrive/ATD-WSD


# 8- Predict

In [203]:
def predict(line, model):
    dic = {"sentence": remove_diacritics_line(line['sentence']), "words": line['words']}
    #line['sentence'] = remove_diacritics_line(line['sentence'])

    X, _ = get_classes([dic])
    predictions = model.predict(X).squeeze()

    output = ''
    for char, prediction in zip(line['sentence'], predictions):
        output += char
        if char not in ARABIC_CHAR:
            continue
        output += reverse_class_mapping[np.argmax(prediction)]
    return output

def predict_text(data, model, file_name):
  for idx, line in enumerate(data):
    output = predict(line, model)
    with open(f"{file_name}_out.txt", 'a') as file:
      file.write(output + "\n")

    with open(f"{file_name}_inp.txt", 'a') as file:
      file.write(line['sentence'] + "\n")

In [204]:
model_file_path = 'bilstmWSD.joblib'
model = joblib.load(model_file_path)

In [205]:
test_data = read_json("/content/2528_test_wsd.json")

print('Testing data length:', len(test_data))
print("Test Sample")
pprint(test_data[100])

Testing data length: 2528
Test Sample


In [206]:
predict_text(test_data, model, "Bilstm_wsd")



KeyError: 'word'

# 9- Error Calculation

In [128]:
!pip install diacritization_evaluation

Collecting diacritization_evaluation
  Downloading diacritization_evaluation-0.5-py3-none-any.whl.metadata (945 bytes)
Downloading diacritization_evaluation-0.5-py3-none-any.whl (7.2 kB)
Installing collected packages: diacritization_evaluation
Successfully installed diacritization_evaluation-0.5


In [249]:
def remove_illegal_diac(string):
  ss  = ""
  for s in string:
    if s in ARABIC_CHAR_SPACE:
      ss += s
      continue
    if s in PUNCTUATIONS:
      ss += s
  return ss

def calculate_der(original_path, predicted_path, case_ending=True ):
  with open(original_path, encoding="utf8") as file:
        original_content = file.read()

  with open(predicted_path, encoding="utf8") as file:
        predicted_content = file.read()

  return der.calculate_der(original_content, remove_incorrect_diac(predicted_content), case_ending=case_ending)
  # avg_der = 0
  # number_of_sentences = 0
  # for sentence in zip(original_content, predicted_content):
  #   try:
  #     avg_der += der.calculate_der(sentence[0], sentence[1], case_ending=case_ending)
  #     number_of_sentences += 1
  #   except:
  #     continue

  return avg_der / number_of_sentences

def calculate_wer(original_path, predicted_path, case_ending=False ):
  with open(original_path, encoding="utf8") as file:
        original_content = file.read()

  with open(predicted_path, encoding="utf8") as file:
        predicted_content = file.read()

  return wer.calculate_wer(original_path, predicted_path, case_ending=case_ending, include_non_arabic=True)
  # avg_wer = 0
  # number_of_sentences = 0
  # for sentence in zip(original_content, predicted_content):
  #   try:
  #     avg_wer += wer.calculate_wer(sentence[0], sentence[1], case_ending=case_ending)
  #     number_of_sentences += 1
  #   except:
  #     continue

  # return avg_wer / number_of_sentences

In [250]:
from diacritization_evaluation import wer, der
original_path = "/content/drive/MyDrive/ATD-WSD/Bilstm_wsd_inp.txt"
predicted_path  = "/content/drive/MyDrive/ATD-WSD/Bilstm_wsd_out.txt"

print(calculate_der(original_path, predicted_path))
print(calculate_wer(original_path, predicted_path))

59.72
0.0
