# Required Installations

In [None]:
!pip install transformers datasets arabert

!pip install gensim spacy nltk

!pip install rarfile

# Importing Libraries

In [None]:
import nltk
from nltk import ngrams
import spacy
import gensim
import re
import string
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import zipfile
import os
import rarfile
from pprint import pprint

from transformers import AutoTokenizer, AutoModel, AutoModelWithLMHead, pipeline
from arabert.preprocess import ArabertPreprocessor
from datasets import load_dataset
import torch

from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

nltk.download('punkt')
nltk.download("stopwords")

In [3]:
from google.colab import drive
drive.mount("gdrive")

Mounted at gdrive


# Preprocessing the Data

In [9]:
def cosine_sim(A,B):
  return np.dot(A,B)/(norm(A)*norm(B))

In [10]:
arabic_stopwords = list(set(nltk.corpus.stopwords.words("arabic")))
len(arabic_stopwords)
arabic_stopwords[:10]

['عشر', 'كأنّ', 'هيا', 'أفريل', 'أعلم', 'ثمان', 'يوان', 'طَق', 'صار', 'أغسطس']

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def text_cleaning(text, stop_words=arabic_stopwords):
    """ 
    Args: 
      text : text to be cleaned of extra caracters and remove the stop words
    Returns:
      text : Cleaned text"""

    # Remove punctuation
    arabic_punctuations = '''`÷×؛<>،/:"؟'¦”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = "["+ arabic_punctuations + english_punctuations + "]"
    text = re.sub(punctuations_list,"",text)
    text = " ".join([word for word in text.split() if word not in arabic_stopwords])

    #Remove diacritics and normalize the chars
    text = text.replace("ى", "ي")
    text = text.replace("ؤ", "ء")
    text = text.replace("ئ", "ء")
    text = text.replace("ة", "ه")
    text = text.replace("گ", "ك")   
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = re.sub("[إأآا]", "ا", text)
    text = " ".join([word for word in text.split() if word not in arabic_stopwords])

    arabic_diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(arabic_diacritics,"", text)
    
    #Eliminate the repeated caracters
    text = re.sub(r'(.)\1+', r'\1', text)

    # Remove arabic stop words
    text = " ".join([word for word in text.split() if word not in arabic_stopwords])
    
    #Remove extra blank space 
    text = text.strip()

    return text

In [13]:
text = " أَلْعَبُ فى فداء الْمَدْرَسَة"
print(f"The intial text: {text}\nThe cleaned text: {text_cleaning(text)}")

The intial text:  أَلْعَبُ فى فداء الْمَدْرَسَة
The cleaned text: العب فداء المدرسه


# Aravec CBOW model

Download the needed model from this github reposiory:
**https://github.com/bakrianoo/aravec**

In [15]:
def load_spacy_model(path_to_zip_file):
  """
  Args:
    path_to_zip_file : A string that indicates the path to the file that we want to unzip it.
  Returns:
    model: The Spacy model needed to vectorize and tokenize the arabic text.
  """
  cmd = "unzip "+ path_to_zip_file  # command to be executed
  os.system(cmd)

  model_name = path_to_zip_file.split('/')[-1].replace('.zip','.mdl') # Extract model name.
  model = gensim.models.Word2Vec.load(model_name) # Initialize the gensim model using the tsored data in the zipfile.

  os.makedirs("spacyModel",exist_ok=True)  # if it exists, rewrite it.
  model.wv.save_word2vec_format("./spacyModel/aravec.txt")    # Save the word2vec model structure to this directory.

  os.system("gzip ./spacyModel/aravec.txt")
  os.system("python -m spacy init vectors ar ./spacyModel/aravec.txt.gz spacy.aravec.model")  #Initialize the spaCy model using AraVec vectors
  model = spacy.load("./spacy.aravec.model/")

  return model

#load_spacy_model(path_to_zip_file)

In [24]:
path_to_zip_file = "/content/gdrive/MyDrive/ENSI/NLP_Project/full_grams_cbow_100_wiki.zip"
spacy_model = load_spacy_model(path_to_zip_file)

In [25]:
text = text_cleaning("أستيقظ في الصباح الناكر")
embeddings = {token : spacy_model(token).vector for token in text.split()}
embeddings

{'استيقظ': array([-8.5617280e-01,  1.4508170e+00,  3.2188864e+00,  1.8047861e+00,
        -1.6170052e+00, -1.7977168e-01, -1.5039544e+00,  2.9143432e-01,
         2.6498339e-01,  1.5878360e-01, -1.4105649e+00,  7.4322271e-01,
         1.8661531e+00, -3.9422348e-01,  2.0158749e+00, -7.7945518e-01,
        -1.8646410e+00, -9.9249363e-01,  5.2158856e-01, -7.7072787e-01,
         2.6465938e+00, -2.0888078e+00,  2.0678005e-01, -4.8900780e-01,
        -1.5358924e+00, -7.3352861e-01, -6.9328147e-01,  1.7994665e-01,
         2.2669105e-02,  7.3703492e-01, -2.3650987e-01,  4.5114398e-01,
        -1.6006255e+00,  2.7566371e+00, -6.1341310e-01, -3.9447257e-01,
        -2.2452240e+00, -1.1137856e+00,  1.2636138e+00, -1.6905982e+00,
        -1.7127225e+00,  7.6200014e-01,  5.5258608e-01,  1.4505516e+00,
        -1.8226978e+00, -1.8717378e-02, -9.2193508e-01,  5.2964105e-04,
        -7.9593760e-01, -6.9468069e-01,  1.6134089e+00,  1.4142701e-01,
         1.7813829e+00,  4.6771833e-01, -1.0094752e+00

In [55]:
def context_embedding(context,embeddings):

  em_vector = embeddings[context[0]]
  for i in range(1,len(context)):
    em_vector += embeddings[context[i].strip()]

  return em_vector/len(context)

In [27]:
def Calculate_sim_w_c(text,spacy_model):
  """
  This function calculates the similarity between each tokens and the context of the sentence (sentence embedding)
  Args:
    text: The phrase that contains an error semantic word.
    spacy_model : the spacy model needed to clean and vectorize the tokens.
  Returns:
    similarities_token_context : Dictionnary contains the similarities between each token and it's context

  """
  text = text_cleaning(text).strip()

  embeddings = {token : spacy_model(token).vector for token in text.split()}
  similarities_token_context = dict()
  tokens = text.split()

  for token in tokens:
    context = [tk for tk in tokens if tk != token]
    context_emb = context_embedding(context,embeddings)
    cosine = cosine_sim(context_emb, embeddings[token])
    if np.isnan(cosine)==False:
      similarities_token_context[token] = cosine_sim(context_emb, embeddings[token])
    else:
      similarities_token_context[token] = 0
    

  return similarities_token_context

In [29]:
txt = "أستيقظ في الصباح الناكر"

Calculate_sim_w_c(txt,spacy_model)

{'استيقظ': 0.4532253, 'الصباح': 0.4532253, 'الناكر': 0}

In [60]:
def Detect_wrong_token(text,spacy_model=spacy_model):
  """
  Args:
    text: The phrase that contains an error semantic word.
    spacy_model : the spacy model needed to clean and vectorize the tokens.
  Returns:
    string: the wrong word

  """

  similarities_token_context = Calculate_sim_w_c(text,spacy_model)
  sorted_similarity_list = sorted(similarities_token_context.items(), key = lambda item: item[1])
  if sorted_similarity_list[0][1]<=0.3 :
    print("The wrong word in the sentence is : ", sorted_similarity_list[0][0])
    return sorted_similarity_list[0][0]
  else:
    return 'There is no wrong word'

wrong_word = Detect_wrong_token(txt)

The wrong word in the sentence is :  اعصاب


In [61]:
df = pd.read_csv('/content/gdrive/MyDrive/NLP_Project/arabic.txt',sep=",")
df.columns = ['labels','text']
df['labels'] = df['labels'].apply(lambda x:text_cleaning(x))
df.head()

Unnamed: 0,labels,text
0,عاصفه,عاصفة لبنان هي بيروت
1,الناكر,أستيقظ في الصباح الناكر
2,اعصاب,أعصاب الأشجار بها أوراق
3,الوقفه,كتب الطبيب الوقفة الطبية
4,مبتكرا,أنَا أَصْحُو مِن النَوم مبتكرا


In [62]:
df['w_aravec'] = df['text'].apply(lambda token : Detect_wrong_token(token))
df

The wrong word in the sentence is :  عاصفه
The wrong word in the sentence is :  الناكر
The wrong word in the sentence is :  اعصاب
The wrong word in the sentence is :  الوقفه
The wrong word in the sentence is :  مبتكرا
The wrong word in the sentence is :  يسكب
The wrong word in the sentence is :  لبثت
The wrong word in the sentence is :  مبتكرا
The wrong word in the sentence is :  لطيار
The wrong word in the sentence is :  يرعي
The wrong word in the sentence is :  فحوصا
The wrong word in the sentence is :  لكنا
The wrong word in the sentence is :  تعد
The wrong word in the sentence is :  ان
The wrong word in the sentence is :  الحياه
The wrong word in the sentence is :  اعطيت


Unnamed: 0,labels,text,w_aravec
0,عاصفه,عاصفة لبنان هي بيروت,عاصفه
1,الناكر,أستيقظ في الصباح الناكر,الناكر
2,اعصاب,أعصاب الأشجار بها أوراق,اعصاب
3,الوقفه,كتب الطبيب الوقفة الطبية,الوقفه
4,مبتكرا,أنَا أَصْحُو مِن النَوم مبتكرا,مبتكرا
5,يسكب,هو يسكب في منزل كبير,يسكب
6,لبثت,لبثت ثوبا جديدا,لبثت
7,مبتكرا,أنَا أَصْحُو مِن النَوم مبتكرا,مبتكرا
8,لطيار,الأشجار تمثل أعشاشا للطيار,لطيار
9,الالغام,كان الراعي جالس اعلى التل يرعى الالغام,يرعي


In [63]:
# Calculation of the Accuracy

result = sum(df['w_aravec'] == df['labels'])
accuracy = result/len(df)
print("Accuracy: ",accuracy)

Accuracy:  0.625


# Error Detection using Arabert model

In [None]:
model_name = "aubmindlab/bert-base-arabertv02"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)
arabert_model = AutoModel.from_pretrained(model_name)

arabert_prep = ArabertPreprocessor(model_name=model_name)

In [37]:
text = 'عاصفة لبنان هي بيروت'
text = text_cleaning(text)
text_preprocessed = arabert_prep.preprocess(text)
arabert_prep.preprocess(text)

'عاصفه لبنان بيروت'

In [38]:
arabert_input = arabert_tokenizer.encode(text_preprocessed,add_special_tokens=False)
ids_to_tokens = arabert_tokenizer.convert_ids_to_tokens(arabert_input)
print(ids_to_tokens)
print(arabert_input)

['عاص', '##فه', 'لبنان', 'بيروت']
[8424, 6133, 911, 2079]


In [39]:
tensor_input_ids = torch.tensor(arabert_input).unsqueeze(0)
print(tensor_input_ids.shape)
tensor_input_ids

torch.Size([1, 4])


tensor([[8424, 6133,  911, 2079]])

In [40]:
embeddings_array = arabert_model(tensor_input_ids)

In [41]:
embeddings_array[0].shape # batch_size x seq_len x emb_dim

torch.Size([1, 4, 768])

In [42]:
def Extract_embeddings(text,arabert_model):
  """
  This function calculates the similarity between each tokens and the context of the sentence (sentence embedding)
  Args:
    text: The phrase that contains an error semantic word.
    arabert_model : the spacy model needed to clean and vectorize the tokens.
  Returns:
    embeddings : Dictionnary contains the embeddings of each token(word)

  """
  text = text_cleaning(text).strip()
  text_preprocessed = arabert_prep.preprocess(text)
  arabert_input = arabert_tokenizer.encode(text_preprocessed,add_special_tokens=False)
  tensor_input_ids = torch.tensor(arabert_input).unsqueeze(0)
  embeddings_array = arabert_model(tensor_input_ids)
  embeddings_array = embeddings_array[0][0]

  ids_to_tokens = arabert_tokenizer.convert_ids_to_tokens(arabert_input)

  embeddings =dict()
  for i in range(len(ids_to_tokens)):
    if '#' in ids_to_tokens[i]:
      word = ids_to_tokens[i-1]+ids_to_tokens[i].strip('#')
      embeddings[word] = embeddings_array[i]+embeddings_array[i-1]
      if ids_to_tokens[i-1] in embeddings.keys():
        del embeddings[ids_to_tokens[i-1]]
    else:
        embeddings[ids_to_tokens[i]] = embeddings_array[i]
  
  embeddings = {key:value for (key,value) in embeddings.items() if '#' not in key}
  
  return embeddings

In [43]:
text = 'الأشجار تمثل أعشاشا للطيار'
embd = Extract_embeddings(text,arabert_model)
embd.keys()

dict_keys(['الاشجار', 'تمثل', 'اعشاش', 'لطيار'])

In [44]:
def context_embedding_bert(context,embeddings):

  em_vector = embeddings[context[0]].detach().numpy()
  for i in range(1,len(context)):
    em_vector += embeddings[context[i]].detach().numpy()

  return em_vector/len(context)

In [45]:
def AraBert_Detect_wrong_token(text,arabert_model=arabert_model):
  """
  This function calculates the similarity between each tokens and the context of the sentence (sentence embedding)
  Args:
    text: The phrase that contains an error semantic word.
    arabert_model : the arabert model needed to clean and vectorize the tokens.
  Returns:
    similarities_token_context[0][0] : The wrong word that exists in the dictionnary of the similarities between each token and it's context

  """
  embeddings = Extract_embeddings(text,arabert_model)
  similarities_token_context = dict()
  tokens = list(embeddings.keys())
  
  for token in tokens:
    context = [tk for tk in tokens if tk != token]
    context_emb = context_embedding_bert(context,embeddings)
    cosine = cosine_sim(context_emb, embeddings[token].detach().numpy())
    if np.isnan(cosine)==False:
      similarities_token_context[token] = cosine_sim(context_emb, embeddings[token].detach().numpy())
    else:
      similarities_token_context[token] = 0
    

  sorted_similarity_list = sorted(similarities_token_context.items(), key = lambda item: item[1])
  #print(sorted_similarity_list)
  if sorted_similarity_list[0][1]<=0.95 :
    #print("The wrong word in the sentence is : ", sorted_similarity_list[0][0])
    return sorted_similarity_list[0][0]
  else:
    return 'There is no wrong word'

In [46]:
txt = 'أعصاب الأشجار بها أوراق'
wrong_word = AraBert_Detect_wrong_token(txt,arabert_model)
print("The wrong word in the sentence is : ", wrong_word)

The wrong word in the sentence is :  اعصاب


In [47]:
df['w_araBert'] = df['text'].apply(lambda text : AraBert_Detect_wrong_token(text))
df.head()

Unnamed: 0,labels,text,w_aravec,w_araBert
0,عاصفه,عاصفة لبنان هي بيروت,There is no wrong word,عاصفه
1,الناكر,أستيقظ في الصباح الناكر,الناكر,استيقظ
2,اعصاب,أعصاب الأشجار بها أوراق,There is no wrong word,اعصاب
3,الوقفه,كتب الطبيب الوقفة الطبية,There is no wrong word,كتب
4,مبتكرا,أنَا أَصْحُو مِن النَوم مبتكرا,There is no wrong word,النوم


In [48]:
# Calculation of the Accuracy

result = sum(df['w_araBert'] == df['labels'])
accuracy = result/len(df)
print("Accuracy: ",accuracy)

Accuracy:  0.25


# Fill Mask Arabert Model + Aravec

Here, we can use the aravec model to detect semantically incorrect words and then try to use the filler mask to correct this semantic error by taking the **most likely word that can be correct**.

In [None]:
from transformers import AutoModelWithLMHead
modelv01 = AutoModelWithLMHead.from_pretrained('aubmindlab/bert-base-arabertv01')
tokenizerv01 = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv01')
preprocess = ArabertPreprocessor(model_name='aubmindlab/bert-base-arabertv01')

pipeline_fill_mask = pipeline("fill-mask",model=modelv01, tokenizer=tokenizerv01)

In [51]:
text = 'أستيقظ في الصباح  [MASK].'
textv1 = preprocess.preprocess(text)
textv1 = textv1.replace("MASK","[MASK]")
textv1 = textv1.replace(":[",":")
pprint(pipeline_fill_mask(text,))


[{'score': 0.6248967051506042,
  'sequence': 'أستيقظ في الصباح الباكر.',
  'token': 31942,
  'token_str': 'الباكر'},
 {'score': 0.09908975660800934,
  'sequence': 'أستيقظ في الصباح ".',
  'token': 2,
  'token_str': '"'},
 {'score': 0.0931173712015152,
  'sequence': "أستيقظ في الصباح '.",
  'token': 4,
  'token_str': "'"},
 {'score': 0.028514614328742027,
  'sequence': 'أستيقظ في الصباح التالي.',
  'token': 32063,
  'token_str': 'التالي'},
 {'score': 0.0135259497910738,
  'sequence': 'أستيقظ في الصباح صباحا.',
  'token': 22897,
  'token_str': 'صباحا'}]
