# **Downloading required data**

In [2]:
!wget --load-cookies /tmp/cookies.txt "http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip" -O AQMAR_Arabic_NER_corpus-1.0.zip && rm -rf /tmp/cookies.txt
import zipfile
with zipfile.ZipFile("AQMAR_Arabic_NER_corpus-1.0.zip", 'r') as zip_ref:
    zip_ref.extractall("AQMAR_UnZip")

Cannot open cookies file ‘/tmp/cookies.txt’: No such file or directory
--2023-04-07 02:43:59--  http://www.cs.cmu.edu/~ark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7815886 (7.5M) [application/zip]
Saving to: ‘AQMAR_Arabic_NER_corpus-1.0.zip’


2023-04-07 02:44:07 (999 KB/s) - ‘AQMAR_Arabic_NER_corpus-1.0.zip’ saved [7815886/7815886]



# **Downloading Word2Vec**

In [3]:
!wget --load-cookies /tmp/cookies.txt "https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip" -O full_uni_cbow_300_twitter.zip && rm -rf /tmp/cookies.txt
!unzip full_uni_cbow_300_twitter.zip 

Cannot open cookies file ‘/tmp/cookies.txt’: No such file or directory
--2023-04-07 02:44:07--  https://bakrianoo.ewr1.vultrobjects.com/aravec/full_uni_cbow_300_twitter.zip
Resolving bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)... 108.61.0.122, 2001:19f0:0:22::100
Connecting to bakrianoo.ewr1.vultrobjects.com (bakrianoo.ewr1.vultrobjects.com)|108.61.0.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2833686412 (2.6G) [application/zip]
Saving to: ‘full_uni_cbow_300_twitter.zip’


2023-04-07 02:45:14 (40.7 MB/s) - ‘full_uni_cbow_300_twitter.zip’ saved [2833686412/2833686412]

Archive:  full_uni_cbow_300_twitter.zip
  inflating: full_uni_cbow_300_twitter.mdl  
  inflating: full_uni_cbow_300_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_cbow_300_twitter.mdl.wv.vectors.npy  


# **Split file to text and label**

In [4]:
import numpy as np
def split_text_label(filename):
    f = open(filename, encoding="utf-8")
    split_labeled_text = []
    sentence = []
    for line in f:
        # If the line is empty or starts with "-DOCSTART" or is a newline character, 
        # then add the current sentence to split_labeled_text list if it is not empty, 
        # then reset the sentence list and continue to the next line.
        if len(line) == 0 or line.startswith("-DOCSTART") or line[0] == "\n":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        # Split the current line by tab character and store it in splits list
        splits = line.split("\t")
        # Remove the newline character from the last element of the splits list and store it in label variable
        label = splits[-1].rstrip("\n")

        #"O" indicates that there is no entity present in that word/token.
        #"B-MIS1" and "B-MIS2" indicate the beginning of an entity of type "MIS1" or "MIS2", respectively.
        #"I-ORG" indicates a continuation of an entity of type "ORG".
        if label == "IO" or label == "OO":
            label = "O"
        elif label == "B-MIS-1" or label == "B-MIS1`" or label == "B-MISS1":
            label = "B-MIS1"
        elif label == "B-MIS-2":
            label = "B-MIS2"
        elif label == "I--ORG":
            label = "I-ORG"
        elif label == "":
            continue
        sentence.append([splits[0], label])
    # If the last sentence is not empty, then add it to split_labeled_text list
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text


split_train = split_text_label("./AQMAR_UnZip/featureFiles/test/all.test.features.txt")
split_test = split_text_label("./AQMAR_UnZip/featureFiles/dev/all.dev.features.txt")


In [5]:
print(np.array(split_train).shape)
print(np.array(split_test).shape)

(1976,)
(711,)


  print(np.array(split_train).shape)
  print(np.array(split_test).shape)


# **Preprocessing data**

In [6]:
import re


def normalize_arabic(text):
    # This function replaces Arabic letters that have different forms in Unicode
    # with the standard form.
    # For example, it replaces Alef variants and Hamza with the standard Alef letter.


    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text


def preprocessing(data_list, model):
    # This function preprocesses the input data to prepare it for training or testing.
    # It performs the following steps:
    # - Remove non-Arabic words from each sentence
    # - Normalize the Arabic words
    # - Convert the labels to indices
    # - Pad the sequences to make them of equal length

    # loop over data_list and remove all non arabic words
    new_data_list = []
    for sentence in data_list:
        newSentence = []
        for word, label in np.array(sentence).reshape(-1, 2):
            if all(ord(char) in range(0x0600, 0x06FF) for char in word):
                newSentence.append([word, label])
        new_data_list.append(newSentence)

    # normalize arabic words (remove elongation)
    new_data_list = [
        [[normalize_arabic(word), label] for word, label in sent] for sent in new_data_list
    ]

    # get unique words and labels
    labelSet = set()
    wordSet = set()
    for data in new_data_list:
        for word, label in np.array(data).reshape(-1, 2):
            labelSet.add(label)
            wordSet.add(word.lower())

    # get index for each label
    labels2index = {t: i for i, t in enumerate(labelSet)}

    # X = data | Y = labels
    X = [[w[0] for w in s] for s in new_data_list]
    Y = [[w[1] for w in s] for s in new_data_list]

    # get w2v for the words that are available in the model
    X_w2v = []
    Y_w2v = [] 
    for j in range(len(X)):
        sentence_x = []
        sentence_y = []
        for i in range(len(X[j])):
            # if word is in the model add it to sentence []
            if X[j][i] in model.wv:
                sentence_x.append(np.array(model.wv[X[j][i]]))
                sentence_y.append(Y[j][i])
        # add w2v list (sentence_x) to list of all w2v's (X_train)
        Y_w2v.append(sentence_y)
        X_w2v.append(sentence_x)

    # Get biggest sentence len
    #     largest_sen = max(len(sen) for sen in X_w2v)

    # 180 is the max sentence length in the training data 
    # the max in testing data was smaller than 180 so we used the largest.
    largest_sen = 180
    print("biggest sentence has {} words".format(largest_sen))

    # padding (making each sentence the same len as largest_sen)
    new_X = []
    temp_y = []
    for seq_x, seq_y in zip(X_w2v, Y_w2v):
        new_seq_x = []
        new_seq_y = []
        for i in range(largest_sen):
            try:
                new_seq_x.append(seq_x[i])
                new_seq_y.append(seq_y[i])
            except:
                # if the sentence is smaller than the largest sentence
                # we will add words with zeros vector to the small sentence and make its label =("0")
                new_seq_x.append(np.zeros((300)))
                new_seq_y.append("O")

        new_X.append(new_seq_x)
        temp_y.append(new_seq_y)
        
    print(np.array(new_X).shape,np.array(temp_y).shape)
    
    trunc_X= []
    trunc_y=[]

    for x, y in zip(new_X, temp_y):
        count = 0
        for i in range(largest_sen):
            if y[i] == "O":
                count+=1
        if count != largest_sen:
            trunc_X.append(x)
            trunc_y.append(y)
    
    print(np.array(trunc_X).shape,np.array(trunc_y).shape)

    # convert labels to indices
    new_y = []
    for s in trunc_y:
        temp = []
        for i, _ in enumerate(s):
            temp.append(labels2index[s[i]])
        new_y.append(temp)

    return np.array(trunc_X), np.array(new_y), labels2index

# **Loading w2v model**

In [7]:
import gensim

model_w2v = gensim.models.Word2Vec.load('./full_uni_cbow_300_twitter.mdl')

# **Apply preprocessing**

In [20]:
import numpy as np

X_train,y_train,labels2index=preprocessing(split_train,model_w2v)
X_test,y_test,labels2index=preprocessing(split_test,model_w2v)

biggest sentence has 180 words
(1976, 180, 300) (1976, 180)
(1192, 180, 300) (1192, 180)
biggest sentence has 180 words
(711, 180, 300) (711, 180)
(494, 180, 300) (494, 180)


# **One-Hot encoding**

In [21]:
from keras.utils.np_utils import to_categorical

y_train = to_categorical(y_train, 28)
y_test = to_categorical(y_test, 28)

print("x_train shape: ",X_train.shape,"y_train shape: ",y_train.shape)


x_train shape:  (1192, 180, 300) y_train shape:  (1192, 180, 28)


In [22]:
vocabs = model_w2v.wv.index_to_key
vocab_size=len(vocabs)

# **Building the model**

In [23]:
import keras
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import concatenate
from keras.layers import Reshape
from keras.layers import SpatialDropout1D
from keras.layers import Dropout
from keras import Model, Input
from keras.layers import TimeDistributed, Bidirectional
from keras.regularizers import l2

In [24]:
input_word = Input(shape=(180, 300))
lstm1 = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.2))(input_word)
dropout1 = Dropout(0.2)(lstm1)
lstm2 = LSTM(units=64, return_sequences=True, recurrent_dropout=0.2)(dropout1)
dropout2 = Dropout(0.2)(lstm2)
dense_layer = TimeDistributed(Dense(units=10))(dropout2)
out = TimeDistributed(Dense(28, activation='softmax'))(dense_layer)
model = Model(input_word, out)

In [25]:
model.compile(
    optimizer="Adam", loss="categorical_crossentropy", metrics=["Accuracy"]
)

In [26]:
print(X_train.shape,y_train.shape,y_test.shape)


(1192, 180, 300) (1192, 180, 28) (494, 180, 28)


In [27]:
model.fit(X_train, y_train, epochs=5, batch_size=64,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb0af016640>

In [28]:
model.evaluate(X_test, y_test)



[0.19609348475933075, 0.9732006192207336]

In [29]:
y_pred = model.predict(X_test)



In [30]:
y_pred = np.argmax(y_pred, axis= -1).flatten()
y_test = np.argmax(y_test, axis= -1).flatten()

In [31]:
from sklearn.metrics import f1_score

f1=f1_score(y_test, y_pred,average="weighted")
print("F1 Score = ",f1)

F1 Score =  0.9600159142288612


In [33]:
model.save("my_model.h5")