In [None]:
# imports
import re
import os
import numpy as np
import tensorflow as tf

In [None]:
# Load the dataset (Google Colab Environment)
! wget "https://www.cs.cmu.edu/%7Eark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip"
! unzip "/content/AQMAR_Arabic_NER_corpus-1.0.zip" -d "/content/corpus"

In [None]:
# Entity cleaner: unifies entity tags and fixes misspellings
def tags_cleaner(entity):
  """Map a raw tag read from the corpus to one of the nine canonical tags.

  Args:
    entity: Raw tag text; it may still carry the line terminator or other
      surrounding whitespace.

  Returns:
    The canonical tag ('B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS',
    'I-ORG', 'I-PER' or 'O'), or None when the tag is not recognised (a
    message is printed in that case).
  """
  canonical_tags = ('B-LOC', 'B-MIS', 'B-ORG', 'B-PER',
                    'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O')
  # Known misspellings / tag variants and the canonical tag each one stands for
  corrections = {
      'B-MIS0': 'B-MIS', 'B-MIS1': 'B-MIS', 'B-MIS2': 'B-MIS', 'B-MIS3': 'B-MIS',
      'B-MIS-1': 'B-MIS', 'B-MIS-2': 'B-MIS', 'B-MIS1`': 'B-MIS', 'B-MISS1': 'B-MIS',
      'I-MIS0': 'I-MIS', 'I-MIS1': 'I-MIS', 'I-MIS2': 'I-MIS', 'I-MIS3': 'I-MIS',
      'B-ENGLISH': 'O', 'B-SPANISH': 'O', 'OO': 'O', 'IO': 'O',
      'I--ORG': 'I-ORG',
  }

  # Strip all surrounding whitespace, not only '\n', so that tags read from
  # files with '\r\n' line endings are still recognised
  entity = entity.strip()
  if entity in canonical_tags:
    return entity
  if entity in corrections:
    return corrections[entity]

  print('Error with entity:', entity)
  return None


# Clean/Normalize Arabic Text
def clean_str(text):
    """Normalize a piece of Arabic text.

    Steps, in order: remove the diacritics in U+0617-U+061A and
    U+064B-U+0652, shorten every run of a repeated character to two
    characters, collapse a doubled waw/yaa/alef to a single letter, apply
    the literal substitutions below, and strip surrounding whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # (old, new) pairs; applied one after another in exactly this order
    substitutions = [
        ("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ة", "ه"),
        ("_", " "), ("-", " "), ("/", ""), (".", ""), ("،", ""),
        (" و ", " و"), (" يا ", " يا"),
        ('"', ""), ("ـ", ""), ("'", ""), ("ى", "ي"), ("\\", ""),
        ('\n', ' '), ('\t', ' '), ('&quot;', ' '),
        ('?', ' ? '), ('؟', ' ؟ '), ('!', ' ! '),
    ]

    # Remove tashkeel
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # Remove longation: a run of the same character is cut down to two
    text = re.sub(r'(.)\1+', r"\1\1", text)

    # What is left of a run of these three letters becomes a single letter
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    for old, new in substitutions:
        text = text.replace(old, new)

    # Trim
    return text.strip()

# Remove empty strings or strings that contain spaces only from sentences
def re_clean(old_sentence, old_tags):
  """Drop empty and whitespace-only words together with their tags.

  Args:
    old_sentence: List of words.
    old_tags: List of tags, where old_tags[j] belongs to old_sentence[j].

  Returns:
    A (new_sentence, new_tags) tuple of new lists that contain only the
    words with at least one non-whitespace character, and their tags.
  """
  # word.strip() is empty exactly for "" and for whitespace-only strings.
  # A word that merely starts with whitespace is kept.
  kept = [index for index, word in enumerate(old_sentence) if word.strip()]
  new_sentence = [old_sentence[index] for index in kept]
  new_tags = [old_tags[index] for index in kept]

  return new_sentence, new_tags


In [None]:
# Read sentences
sentences = []
tags = []
vocab = set()

corpus_path = "/content/corpus/"
# os.listdir returns names in arbitrary order; sorting makes every run
# process the files (and therefore build `sentences`) in the same order
for file in sorted(os.listdir(corpus_path)):
  if not file.endswith('.txt'): # Get txt files only
    continue
  print('Processing:', file)
  sentence = []
  entity = []
  # `with` closes the file when done; the explicit encoding keeps the Arabic
  # text readable regardless of the platform's default encoding
  with open(os.path.join(corpus_path, file), encoding='utf-8') as topic:
    for line in topic:
      if line.strip() == '': # Sentence end (blank line, whatever the line ending)
        new_sentence, new_tags = re_clean(sentence, entity)
        sentences.append(new_sentence)
        tags.append(new_tags)
        sentence = []
        entity = []
      else:
        fields = line.split(sep=' ')
        clean_word = clean_str(fields[0])       # Cleaning word
        vocab.add(clean_word)                   # Add word to the vocab
        sentence.append(clean_word)             # Add the word
        entity.append(tags_cleaner(fields[1]))  # Clean and add entity

  # Keep the last sentence of a file that does not end with a blank line
  if sentence:
    new_sentence, new_tags = re_clean(sentence, entity)
    sentences.append(new_sentence)
    tags.append(new_tags)


print('Done [Sentences:', len(sentences), ', Tags:', len(tags), ', Unique Words:', len(vocab))

Processing: Portugal_football_team.txt
Processing: X_window_system.txt
Processing: Atom.txt
Processing: Computer_Software.txt
Processing: Ibn_Tolun_Mosque.txt
Processing: Islamic_Golden_Age.txt
Processing: Periodic_Table.txt
Processing: Enrico_Fermi.txt
Processing: Summer_Olympics2004.txt
Processing: Light.txt
Processing: Physics.txt
Processing: Internet.txt
Processing: Islamic_History.txt
Processing: Crusades.txt
Processing: Damascus.txt
Processing: Solaris.txt
Processing: Raul_Gonzales.txt
Processing: Ummaya_Mosque.txt
Processing: Nuclear_Power.txt
Processing: Imam_Hussein_Shrine.txt
Processing: Linux.txt
Processing: Razi.txt
Processing: Real_Madrid.txt
Processing: Football.txt
Processing: Christiano_Ronaldo.txt
Processing: Richard_Stallman.txt
Processing: Computer.txt
Processing: Soccer_Worldcup.txt
Done [Sentences: 2687 , Tags: 2687 , Unique Words: 17481


In [None]:
# Make a mapping between words and their IDs
# `vocab` is a set, and the iteration order of a set of strings can change from
# one kernel session to the next; sorting first gives every word the same ID on
# every run.
# NOTE(review): ID 0 is assigned to a real word here, while encode_sentence also
# uses 0 for out-of-vocabulary words and the padding step pads with 0.
word2id = {word: index for index, word in enumerate(sorted(vocab))}
id2word = {index: word for word, index in word2id.items()}

**The unbalanced dataset problem**

I found that:
- max length of sentence is 290
- 2156 out of 2687 sentences have 40 words or less (80.2% of the data)
- 2485 out of 2687 sentences have 60 words or less (92.2% of the data)
- Some sentences contain empty strings/words left behind by the cleaning step, and these were tagged 'O'
- The 'O' tag represents 87.3% of the words

----
I cleaned the sentences again, removing all empty words and strings that contain only spaces. It made a slight difference, but it wasn't enough.

Results:
- max length of sentence is 271
- 2240 out of 2687 sentences have 40 words or less (83.4% of the data)
- 2514 out of 2687 sentences have 60 words or less (93.5% of the data)
- No empty strings
- The 'O' tag represents 86.4% of the words

----

I then computed the share of each tag as a function of sentence length, using bins of 20 words from 0 to 160 (8 bins).
```
# Percent of each tag per bin
'B-LOC': [1.5, 2.1, 2.2, 2.9, 2.4, 2.4, 3.6, 1.6]
'B-MIS': [5.2, 3.9, 3.5, 1.9, 1.6, 2.0, 2.2, 0.6]
'B-ORG': [0.6, 0.6, 0.4, 0.9, 1.1, 1.6, 1.6, 1.6]
'B-PER': [1.8, 1.9, 1.7, 3.1, 3.7, 4.2, 3.4, 4.7]
'I-LOC': [0.6, 0.9, 1.1, 1.4, 0.8, 0.8, 1.9, 0.2]
'I-MIS': [2.2, 2.2, 2.1, 1.3, 1.6, 2.1, 1.7, 0.6]
'I-ORG': [0.7, 0.7, 0.5, 0.9, 0.9, 1.0, 1.7, 1.3]
'I-PER': [0.9, 1.2, 1.0, 2.1, 2.5, 2.5, 2.0, 7.8]
'O': [86.6, 86.6, 87.4, 85.5, 85.4, 83.5, 82.0, 81.5]
```

As can be seen, all bins have roughly the same tag distribution.

----
The next step is to choose another padding size. I think a size of 40 would be best, as most sentences are 40 words or less.

In [None]:
from tensorflow.keras.utils import to_categorical

# Sentence encoder
def encode_sentence(old_sentence):
  """Turn a list of words into the list of their IDs from `word2id`.

  Args:
    old_sentence: List of (already cleaned) words.

  Returns:
    A list of integer IDs, one per word; a word that is missing from
    `word2id` is encoded as 0.
  """
  # 0 is the dummy ID for out-of-vocabulary words
  return [word2id.get(word, 0) for word in old_sentence]

# Encode Tags
# Each tag's ID is its position in this list
tags_encoding = {
    tag: index
    for index, tag in enumerate(
        ['B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O'])
}
def encode_tags(old_tags):
  """One-hot encode a list of tags.

  Args:
    old_tags: List of tag strings; every tag must be a key of `tags_encoding`.

  Returns:
    The result of `to_categorical` for the tag IDs with 9 classes.
  """
  tag_ids = list(map(tags_encoding.__getitem__, old_tags))
  return to_categorical(y = tag_ids, num_classes=9)

In [None]:
# Encoding
# One encoded entry per sentence: word IDs for the words, one-hot rows for the tags
sentences_encoded = list(map(encode_sentence, sentences))
tags_encoded = [encode_tags(tags[position]) for position in range(len(sentences))]

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Padding
MAX_SEQUENCE_LENGTH = 40

# Settings shared by both calls: pad and truncate at the end of each sequence
padding_options = dict(maxlen=MAX_SEQUENCE_LENGTH,
                       dtype='int32',
                       padding='post',
                       truncating='post')

# Word IDs are padded with 0
sentences_padded = pad_sequences(sequences = sentences_encoded,
                                 value = 0,
                                 **padding_options)
# Tag rows are padded with the one-hot vector of 'O' (index 8)
tags_padded = pad_sequences(sequences = tags_encoded,
                            value = np.array([0., 0., 0., 0., 0., 0., 0., 0., 1.]),
                            **padding_options)

In [None]:
from sklearn.model_selection import train_test_split

# Splitting data
# 80% of the padded sentences go to training, the rest to testing;
# the fixed random_state makes the split repeatable
split_parts = train_test_split(sentences_padded, tags_padded, train_size=0.8, random_state=42)
train_sentences, test_sentences, train_labels, test_labels = split_parts

In [None]:
# Download AraVec (Word2Vec Model) by Abu Bakr Soliman, Kareem Eissa, and Samhaa R.El-Beltagy.
! wget "https://archive.org/download/aravec2.0/wiki_cbow_300.zip"
! unzip "/content/wiki_cbow_300.zip" -d "/content/word2vec_model"

--2022-05-18 21:05:17--  https://archive.org/download/aravec2.0/wiki_cbow_300.zip
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip [following]
--2022-05-18 21:05:17--  https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip
Resolving ia803107.us.archive.org (ia803107.us.archive.org)... 207.241.232.157
Connecting to ia803107.us.archive.org (ia803107.us.archive.org)|207.241.232.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 364888893 (348M) [application/zip]
Saving to: ‘wiki_cbow_300.zip.1’

wiki_cbow_300.zip.1   4%[                    ]  16.49M   994KB/s    eta 1m 51s ^C
Archive:  /content/wiki_cbow_300.zip
replace /content/word2vec_model/wikipedia_cbow_300? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
import gensim

# Load the Word2Vec model
# NOTE(review): gensim's load() presumably unpickles the file — only load models from a trusted source
weights_path = "/content/word2vec_model/wikipedia_cbow_300"
araVec = gensim.models.Word2Vec.load(weights_path)

# Testing: print the nearest neighbours of a common name with their similarity scores
most_similar = araVec.wv.most_similar("محمد")
for neighbour in most_similar:
  print(*neighbour)

لمحمد 0.726012110710144
احمد 0.7142194509506226
عبدالرحمن 0.6745274066925049
ابراهيم 0.6723851561546326
مهدي 0.6686975955963135
محمود 0.664846658706665
يحي 0.637116551399231
اسماعيل 0.6307213306427002
حموده 0.6287057995796204
عبدالحميد 0.6267551183700562


In [None]:
# Create an embedding matrix for the embedding layer
# Row i holds the AraVec vector of the word whose ID is i; words that AraVec
# does not know keep an all-zero row.
num_words = len(vocab)
# Go through `araVec.wv` (as the similarity test does): indexing the model
# object directly is deprecated
embed_size = araVec.wv.vector_size
embedding_matrix = np.zeros(shape=(num_words, embed_size))

# `index` instead of `id`, so the builtin id() is not overwritten at module level
for word, index in word2id.items():
  if word in araVec.wv:
    embedding_matrix[index] = araVec.wv[word]

embedding_matrix.shape

  This is separate from the ipykernel package so we can avoid doing imports until
  


(17481, 300)

In [None]:
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding, TimeDistributed
from tensorflow.keras.models import Model, Sequential

tf.keras.backend.clear_session() # Makes sure old model was deleted if exists

# Sequence tagger: frozen AraVec embeddings -> LSTM -> per-word softmax over the 9 tags
lstm_model = Sequential()
# Adding Layers
lstm_model.add(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'))
lstm_model.add(Embedding(input_dim = len(vocab),              # Vocabulary Size (number of unique words for training)
                        output_dim = embed_size,              # Length of the vector for each word (embedding dimension)
                        input_length = MAX_SEQUENCE_LENGTH,   # Maximum length of a sequence
                        weights = [embedding_matrix],         # Send the needed AraVec Weights
                        trainable = False))                   # Keep the pre-trained vectors fixed

lstm_model.add(LSTM(units = embed_size, 
                    return_sequences=True,                    # One output per word, not one per sentence
                    dropout=0.5, 
                    recurrent_dropout=0.5))
lstm_model.add(TimeDistributed(Dense(9, activation='softmax')))

# Compile the model
# `learning_rate` replaces the deprecated `lr` argument name
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999), 
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 300)           5244300   
                                                                 
 lstm (LSTM)                 (None, 40, 300)           721200    
                                                                 
 time_distributed (TimeDistr  (None, 40, 9)            2709      
 ibuted)                                                         
                                                                 
Total params: 5,968,209
Trainable params: 723,909
Non-trainable params: 5,244,300
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Train for 10 epochs in batches of 10, validating on 15% of the training data
lstm_model.fit(x=train_sentences, y=train_labels, batch_size=10, epochs=10, validation_split=0.15)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb1ee3e0790>

In [None]:
# Evaluate on the held-out split; the result is [loss, accuracy], matching the compile settings.
# NOTE(review): test_labels come from tags_padded, where padded positions carry the 'O'
# one-hot vector, and no mask or sample weights are configured in this notebook, so the
# accuracy presumably also counts padded positions — confirm before comparing it with
# per-word scores.
lstm_model.evaluate(test_sentences, test_labels)



[0.16745567321777344, 0.9480947852134705]

In [None]:
def lstm_predict(sentence:str):
  """Print the predicted tag of every word of `sentence`, one "word : TAG" line per word.

  The words are cleaned with `clean_str`, encoded with `encode_sentence`, cut
  into pieces of at most MAX_SEQUENCE_LENGTH words (so words beyond the first
  MAX_SEQUENCE_LENGTH are tagged too instead of being dropped), padded and
  passed to `lstm_model`. The words are printed as they were typed, not in
  their cleaned form.

  Args:
    sentence: Text whose words are separated by whitespace.

  Returns:
    None. Nothing is printed for an empty or whitespace-only sentence.
  """
  # split() without arguments ignores repeated/leading/trailing whitespace,
  # so no empty "words" are sent to the model
  words = sentence.split()
  if not words:
    return

  # Clean and encode the words
  encoded = encode_sentence([clean_str(word) for word in words])

  # The model takes exactly MAX_SEQUENCE_LENGTH positions per sequence, so a long
  # sentence becomes several consecutive pieces; only the last piece needs padding
  pieces = [encoded[start:start + MAX_SEQUENCE_LENGTH]
            for start in range(0, len(encoded), MAX_SEQUENCE_LENGTH)]
  padded = pad_sequences(sequences = pieces,
                         maxlen=MAX_SEQUENCE_LENGTH,
                         dtype='int32',
                         padding='post',
                         truncating='post',
                         value = 0)

  # One row of 9 tag scores per position; put the pieces back end to end and
  # keep the rows of the actual words only
  scores = lstm_model.predict(padded)
  scores = scores.reshape(-1, scores.shape[-1])[:len(words)]

  # The one-hot vector closest to a row of scores is the one of its largest
  # score, so the predicted tag is simply the argmax
  id2tag = {index: tag for tag, index in tags_encoding.items()}
  for word, tag_index in zip(words, np.argmax(scores, axis=-1)):
    print(word, ':', id2tag[int(tag_index)])



In [None]:
lstm_predict('منشئ المسجد هو أحمد بن طولون مؤسس الدولة الطولونية في مصر والشام، تعود أصوله إلى قبيلة التغزغز التركية، وكانت أُسرته تقيم في بخاري.')

منشئ : O
المسجد : O
هو : O
أحمد : B-PER
بن : I-PER
طولون : I-PER
مؤسس : O
الدولة : B-LOC
الطولونية : O
في : O
مصر : O
والشام، : B-LOC
تعود : O
أصوله : O
إلى : O
قبيلة : O
التغزغز : O
التركية، : O
وكانت : O
أُسرته : O
تقيم : O
في : O
بخاري. : B-LOC


In [None]:
lstm_predict('محمود حسام ذهب الي مسجد')

محمود : B-PER
حسام : B-PER
ذهب : O
الي : O
مسجد : O


In [None]:
lstm_predict('عبدالرحمن خالد ذهب الي مسجد')

عبدالرحمن : O
خالد : O
ذهب : O
الي : O
مسجد : O


In [None]:
lstm_predict('دمشق، هي عاصمة الجمهورية العربية السورية، ومركز محافظة دمشق. وهي إحدى أقدم مدن العالم مع تاريخ غير منقطع منذ أحد عشر ألف عام تقريبًا، وأقدم مدينة - عاصمة في العالم. أصبحت عاصمة منطقة سوريا منذ عام 635.')

دمشق، : B-LOC
هي : O
عاصمة : O
الجمهورية : O
العربية : O
السورية، : O
ومركز : O
محافظة : O
دمشق. : B-LOC
وهي : O
إحدى : O
أقدم : O
مدن : O
العالم : O
مع : O
تاريخ : O
غير : O
منقطع : O
منذ : O
أحد : O
عشر : O
ألف : O
عام : O
تقريبًا، : O
وأقدم : O
مدينة : O
- : O
عاصمة : O
في : O
العالم. : O
أصبحت : O
عاصمة : O
منطقة : O
سوريا : B-LOC
منذ : O
عام : O
635. : O
