# Word Embedding Using FastText

## Data Preparation and Normalization

In [None]:
# Download religious text from course repo
# (the corpora loaded below are read from its exploring-datasets/religious_text folder)
!git clone https://github.com/language-ml/course-nlp-ir-1-text-exploring

In [None]:
# Install required libs
# camel_tools supplies the normalization and dediacritization helpers imported in the next cell
!pip install -Uq camel_tools

In [None]:
# Import all required libs for preprocessing
import re
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_alef_bw
from camel_tools.utils.normalize import normalize_alef_hsb
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils import normalize

import tqdm

from pathlib import Path

import pandas as pd

In [None]:
# Load data
# All three corpora live under the cloned course repository.
religious_dir = "/content/course-nlp-ir-1-text-exploring/exploring-datasets/religious_text"

# Quran: tab-separated file without a header row; column 0 becomes the key, column 1 the text
df_quran = pd.read_csv(f'{religious_dir}/quranic_data/id_text_with_orthographies.txt', sep='\t', header=None)
verse_complete_dict = pd.Series(df_quran[1].tolist(), index=df_quran[0]).to_dict()

# Nahj: same layout as the Quran file (column 0 -> key, column 1 -> text)
df_nahj = pd.read_csv(f'{religious_dir}/nahj-al-balaqa/Nahj Al-Balaqa.txt', sep='\t',header=None)
nahj_complete_dict = pd.Series(df_nahj[1].tolist(), index=df_nahj[0]).to_dict()

# Sahifa: plain text; only lines starting with '(' are kept and every "(digits)" marker is removed.
# The encoding is passed explicitly so decoding does not depend on the platform's default locale
# (assumes the file is UTF-8 - confirm if it is ever replaced).
# Note: despite its name, sahife_complete_dict is a list, not a dict.
sahife_text=Path(f'{religious_dir}/Saheefa/sahife_sajjadieh.txt').read_text(encoding='utf-8').split('\n')
sahife_complete_dict = [re.sub('[(][۰-۹]+[)]','', item) for item in sahife_text if item.startswith('(')]

In [None]:
# Show one raw (not yet normalized) entry from each corpus as a quick sanity check of the load step
print(f"""
          *Quranic Example: {verse_complete_dict['2##186']} 
          *Nahj Example: {nahj_complete_dict['2##186']} 
          *Sahifa Example: {sahife_complete_dict[12]}""")

In [None]:
def normalize_arabic(sentence):
    """Return a normalized, diacritic-free version of an Arabic-script sentence.

    The steps are chained, each one working on the output of the previous one:
    Unicode normalization, alef normalization, alef maksura -> yeh,
    teh marbuta -> heh, and finally removal of diacritics.

    @param sentence the raw sentence (str)
    @return the normalized sentence (str)
    """
    # Unify Unicode representations first so the character-level rules below see canonical forms
    sent_norm = normalize_unicode(sentence)

    # Normalize alef variants to 'ا'.
    # This step must receive sent_norm (not the raw sentence), otherwise the Unicode
    # normalization above is thrown away.
    # normalize_alef_bw is intentionally not applied: it targets Buckwalter-transliterated
    # text, while the input here is Arabic script (its result was also never used before).
    sent_norm = normalize_alef_ar(sent_norm)

    # Normalize alef maksura 'ى' to yeh 'ي'
    sent_norm = normalize_alef_maksura_ar(sent_norm)

    # Normalize teh marbuta 'ة' to heh 'ه'
    sent_norm = normalize_teh_marbuta_ar(sent_norm)

    # Strip diacritics last, once every letter has its normalized form
    return dediac_ar(sent_norm)


def removeSigns(token):
  """Filter predicate: tell whether a token should be kept.

  @param token a single token (string)
  @return False for one of the three listed signs or for any token shorter
          than 3 characters, True otherwise
  """
  is_sign = token in ['ۖ', 'ۚ', 'ۗ']
  too_short = len(token) < 3
  return not (is_sign or too_short)

def itterator(matrix):
  """Apply the removeSigns filter to every row of a list of token lists.

  The rows are replaced inside `matrix` itself, and that same object is returned.

  @param matrix a list whose items are lists of tokens
  @return `matrix`, with each row reduced to the tokens removeSigns accepts
  """
  for idx, row in enumerate(matrix):
    matrix[idx] = [tok for tok in row if removeSigns(tok)]
  return matrix

In [None]:
# Data Normalization
# Every text goes through normalize_arabic; the two dictionaries keep their original keys
# and the Sahifa list keeps its order.
verse_complete_dict_nrmlz = dict((k, normalize_arabic(v)) for k, v in tqdm.tqdm(verse_complete_dict.items()))
nahj_complete_dict_nrmlz = dict((k, normalize_arabic(v)) for k, v in tqdm.tqdm(nahj_complete_dict.items()))
sahife_complete_dict_nrmlz = list(map(normalize_arabic, tqdm.tqdm(sahife_complete_dict)))

In [None]:
# Compare each corpus before and after normalization.
# Both lines of a pair must show the same entry, so the Sahifa pair uses index 12 on both sides.
print(f"""
          *Quranic(pure): {verse_complete_dict['2##186']}
          *Quranic(Processed): {verse_complete_dict_nrmlz['2##186']}
      
          *Nahj(pure): {nahj_complete_dict['2##186']}
          *Nahj(Processed): {nahj_complete_dict_nrmlz['2##186']} 

          *Sahifa(pure): {sahife_complete_dict[12]}
          *Sahifa(Processed): {sahife_complete_dict_nrmlz[12]} """)

Data Tokenization

In [None]:
# Data Tokenization
# Split every normalized sentence on whitespace
quranic_tokenized = [sents.split() for sents in tqdm.tqdm(verse_complete_dict_nrmlz.values())]
nahj_tokenized = [sents.split() for sents in tqdm.tqdm(nahj_complete_dict_nrmlz.values())]
sahife_tokenized = [sents.split() for sents in tqdm.tqdm(sahife_complete_dict_nrmlz)]

# Remove Specific Chars and remove word with len<3
quranic_tokenized = itterator(quranic_tokenized)
nahj_tokenized = itterator(nahj_tokenized)
# Each corpus is filtered from its own token list; passing the Nahj list here would make
# sahife_tokenized the very same object as nahj_tokenized.
sahife_tokenized = itterator(sahife_tokenized)

In [None]:
# Show the filtered token list of the sentence at position 43 in each corpus
print(f"""
          Tokenized:
            *Quranic Example: {quranic_tokenized[43]}
            *Nahj Example: {nahj_tokenized[43]}
            *Sahifa Example: {sahife_tokenized[43]}""")

## FastText Embedding
Install required libs and download required data

In [None]:
# NOTE(review): no cell visible in this notebook imports `fasttext` (the vectors are loaded with gensim below) - confirm this install is still needed
!pip install fasttext

In [None]:
# download and unzip pretrained arabic model

# Directory in which the load cell below looks for cc.ar.300.vec
vectorModelDir = '/content/'
# The download/unzip commands are kept commented out; run them once if cc.ar.300.vec is not already in /content/
# ! wget -P /content/ https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz
# ! gunzip /content/cc.ar.300.vec.gz

Import required packages

In [None]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

from sklearn.metrics.pairwise import cosine_similarity

import numpy as np 

Load the pretrained Arabic vectors (downloaded from the fastText data storage)

In [None]:
# Load the text-format (word2vec layout) vectors directly from vectorModelDir.
# gensim's `datapath` helper is meant for files inside gensim's own bundled test-data folder,
# so the path is passed to the loader as-is instead of going through it.
model = KeyedVectors.load_word2vec_format(f'{vectorModelDir}cc.ar.300.vec', binary=False)

In [None]:

def W2V4corpus(token, models, verbose=True):
  """
  get vector for each token from pretrained arabic model
  if a token doesn't exist in the model we assign 0 to its vector

  The input corpus is left untouched; a new nested list is built, so the cell
  can be re-run safely on the same tokenized corpus.

  @param token a list of sentences, each sentence being a list of words
  @param models the vector store to look words up in; it must support
         `word in models` and `models[word]`
  @param verbose if true (default) print every word that is missing from `models`
  @return a new list with the same nesting as `token`, whose items are tuples
          with format (token, vector); the vector is 0 for missing words
  """
  missing = 0
  w2vToken = []
  for sentence in token:
    pairs = []
    for word in sentence:
      if word in models:
        pairs.append((word, models[word]))
      else:
        # list of words that don't exist in the vector store
        if verbose:
          print(word)
        missing += 1
        pairs.append((word, 0))
    w2vToken.append(pairs)
  print(f'number of words that dosent exist in the pretrained fasttext model: {missing}')
  return w2vToken

# Pair every token of the Quran and Nahj corpora with its pretrained vector
quranic_w2v = W2V4corpus(quranic_tokenized,model)
nahj_w2v = W2V4corpus(nahj_tokenized,model)
# Sahifa is left out of the embedding step (call kept commented out)
# sahifeh_w2v = W2V4corpus(sahife_tokenized,model)

In [None]:
# calculate avg of word vectors in sentence
def calculateSentenceVector(wordVector):
  """
  Average the word vectors of one sentence.

  Scalar entries (such as the 0 placeholder stored for words missing from the
  model) are expanded to a constant vector of the same shape as the real
  vectors before averaging, so a mix of placeholders and vectors does not
  produce a ragged array.

  @param wordVector a list of word vectors and/or scalar placeholders
  @return the element-wise mean; if no entry fixes a vector shape (empty list
          or only scalars) the plain mean of the input is returned, which is
          NaN for an empty list
  """
  vectors = [np.asarray(v) for v in wordVector]
  reference = next((v for v in vectors if v.ndim > 0), None)
  if reference is None:
    # Nothing tells us the vector dimension here, so fall back to the plain mean
    return np.mean(np.array(wordVector), axis=0)
  # Keep the dtype of the real vectors when expanding placeholders
  stacked = np.stack([
      np.full(reference.shape, v, dtype=reference.dtype) if v.ndim == 0 else v
      for v in vectors
  ])
  return np.mean(stacked, axis=0)

# convert doc to vector
def doc2vec(corpus):
  """
  Turn every sentence of a corpus into one averaged vector.

  @param corpus a list of sentences, each a list of (token, vector) tuples
  @return a list with one calculateSentenceVector result per sentence, in order
  """
  return [
      calculateSentenceVector([pair[1] for pair in sentence])
      for sentence in corpus
  ]

In [None]:
# Each *_nrmlze list starts with the normalized sentences of its corpus in slot 0
verse_complete_dict_nrmlze = [list(verse_complete_dict_nrmlz.values())]

nahj_complete_dict_nrmlze = [list(nahj_complete_dict_nrmlz.values())]

# Sahifa is not built here (kept commented out)
# sahife_complete_dict_nrmlze=[]
# sahife_complete_dict_nrmlze.append(list(sahife_complete_dict_nrmlz.values()))

In [None]:
# Put the sentence vectors in slot 1, next to the sentences in slot 0.
# The lists are rebuilt instead of appended to, so re-running this cell replaces
# the vectors rather than stacking another copy at the end.
verse_complete_dict_nrmlze = [verse_complete_dict_nrmlze[0], doc2vec(quranic_w2v)]
nahj_complete_dict_nrmlze = [nahj_complete_dict_nrmlze[0], doc2vec(nahj_w2v)]
# sahife_complete_dict_nrmlze.append()

In [None]:
# Example query: a hard-coded vector of 300 values used to test the similarity search
query = [-0.0417    ,  0.01735   , -0.02275   ,  0.0172    ,  0.18415   ,
        0.002525  , -0.0532    ,  0.10465   ,  0.003075  , -0.12132499,
        0.046725  , -0.15392499,  0.0511    ,  0.036025  , -0.0806    ,
        0.0206    , -0.014075  ,  0.0328    ,  0.0699    , -0.0173    ,
       -0.05935   , -0.0393    , -0.011925  , -0.042025  ,  0.007625  ,
        0.028275  ,  0.017     , -0.017     ,  0.0067    , -0.003225  ,
       -0.04735   ,  0.0046    ,  0.0737    , -0.044025  , -0.1193    ,
        0.01255   ,  0.008     , -0.011275  ,  0.065875  , -0.00385   ,
       -0.0565    , -0.004     , -0.04055   ,  0.041325  ,  0.072025  ,
       -0.053225  , -0.009     ,  0.006375  , -0.03895   ,  0.00415   ,
        0.010825  ,  0.051175  ,  0.027925  ,  0.0647    , -0.0468    ,
        0.00615   , -0.05085   ,  0.06835   , -0.046975  ,  0.008875  ,
       -0.076475  , -0.0143    ,  0.12365   , -0.007675  , -0.04905   ,
       -0.060775  , -0.071025  ,  0.018975  , -0.09615   ,  0.006625  ,
        0.00425   ,  0.016325  ,  0.02305   ,  0.102375  ,  0.0041    ,
        0.035475  ,  0.033675  , -0.034625  , -0.019225  , -0.0075    ,
        0.0188    ,  0.01985   ,  0.01845   ,  0.09330001,  0.04525   ,
        0.063225  , -0.0042    ,  0.030125  , -0.00615   ,  0.0305    ,
       -0.048275  , -0.037725  , -0.15822499, -0.057425  ,  0.032125  ,
        0.034575  , -0.012625  ,  0.0177    , -0.043325  , -0.000375  ,
       -0.0091    , -0.016325  ,  0.051625  , -0.01885   , -0.028675  ,
       -0.0035    ,  0.100725  ,  0.01365   ,  0.0144    , -0.09635   ,
        0.024525  ,  0.0395    , -0.0642    , -0.033275  ,  0.025525  ,
       -0.0359    ,  0.039425  ,  0.057125  , -0.012325  , -0.00725   ,
       -0.03285   ,  0.0121    , -0.054225  ,  0.028075  , -0.047875  ,
       -0.032     , -0.022275  ,  0.01855   , -0.0538    , -0.011225  ,
        0.032875  ,  0.05465   , -0.06935   , -0.028425  ,  0.061475  ,
        0.1463    , -0.004275  , -0.012375  , -0.029775  , -0.032975  ,
        0.001125  ,  0.015525  , -0.002075  , -0.05595   ,  0.061825  ,
        0.034425  ,  0.07555   ,  0.002325  ,  0.0076    , -0.0675    ,
       -0.0176    , -0.007025  ,  0.0332    ,  0.006175  ,  0.03495   ,
        0.04405   , -0.018925  ,  0.06587499,  0.01975   , -0.07065   ,
        0.01415   , -0.05730001, -0.18477501,  0.044975  , -0.04485   ,
       -0.013275  ,  0.004525  , -0.024275  , -0.044125  ,  0.14465   ,
        0.0218    ,  0.096975  , -0.028325  , -0.0477    , -0.033025  ,
        0.022375  , -0.09285   , -0.020825  , -0.024925  ,  0.01135   ,
        0.007175  , -0.00945   ,  0.0219    , -0.05575   ,  0.011525  ,
       -0.004325  ,  0.002     ,  0.0245    ,  0.03255   ,  0.1248    ,
       -0.01445   ,  0.002075  , -0.047775  ,  0.024975  ,  0.0366    ,
        0.005375  ,  0.01735   ,  0.0441    , -0.056575  , -0.07214999,
        0.069775  ,  0.041775  ,  0.02095   ,  0.005     ,  0.01585   ,
        0.1201    ,  0.0442    ,  0.03355   , -0.0055    ,  0.0032    ,
       -0.014375  ,  0.062875  ,  0.12917499,  0.014275  , -0.008225  ,
        0.043525  , -0.01255   , -0.05225   , -0.02575   ,  0.022625  ,
        0.01995   ,  0.13445   , -0.028725  ,  0.040375  ,  0.082675  ,
       -0.00955   ,  0.02225   ,  0.033675  ,  0.014525  , -0.065625  ,
        0.098525  , -0.0844    , -0.0007    ,  0.0607    , -0.0512    ,
       -0.02285   , -0.00135   ,  0.077825  , -0.012025  , -0.0123    ,
       -0.00545   , -0.09477501,  0.03585   ,  0.016825  , -0.162875  ,
        0.0461    ,  0.0363    , -0.079075  ,  0.012575  , -0.07435   ,
        0.035275  ,  0.01915   , -0.024175  , -0.07385   ,  0.03185   ,
        0.003925  ,  0.025075  ,  0.000625  ,  0.012675  ,  0.020825  ,
       -0.01565   ,  0.0155    , -0.04005   ,  0.0621    , -0.13335   ,
       -0.007175  , -0.019975  ,  0.032875  , -0.0837    ,  0.027225  ,
       -0.001075  , -0.07555   , -0.054675  ,  0.0209    , -0.07645   ,
       -0.09375   , -0.001375  , -0.07345   , -0.03495   , -0.061125  ,
        0.037675  , -0.10332499,  0.001775  ,  0.02775   ,  0.023175  ,
        0.03305   ,  0.044     ,  0.0097    , -0.017775  , -0.007875  ,
       -0.01      ,  0.0012    , -0.031425  , -0.01365   ,  0.01945   ,
        0.0583    ,  0.11995   ,  0.003475  ,  0.003075  , -0.0917    ]

In [None]:
def calculateSimilarity(qurey, vecList):
  """
  calculate similarity between the input query and all vectors in vecList using cosine similarity

  @param qurey the input query vector (1-D sequence of numbers)
  @param vecList a list whose slot 1 holds the vectors to compare against
         (only vecList[1] is read)
  @return a list with one (1, 1) array per entry of vecList[1], in the same order;
          the value is NaN for an entry that cannot be compared with the query
  """
  expected_shape = np.shape(qurey)
  nan_score = np.array([[np.nan]])
  temp = []
  # tqdm shows progress instead of printing a counter line for every vector
  for vec in tqdm.tqdm(vecList[1]):
    candidate = np.asarray(vec, dtype=float)
    # An empty sentence averages to a NaN scalar rather than a vector; such entries
    # cannot be compared with the query, so they get a NaN score instead of raising.
    if candidate.shape != expected_shape or not np.isfinite(candidate).all():
      temp.append(nan_score.copy())
      continue
    temp.append(cosine_similarity([qurey], [vec]))
  return temp

def mostSimilar(queryVector, quran=True, nahj=False, sahifa=False):
  """
  return similarity matrix

  @param queryVector the query vector to compare with every sentence vector
  @param quran if true search in quranic verse to find similar verse
  @param nahj if true search in nahj ol balaghe verse to find similar verse
  @param sahifa if true search in sahifa verse to find similar verse
  @return a list holding one list of similarities per selected corpus,
          in the order quran, nahj, sahifa
  @raise NotImplementedError if sahifa is requested but its sentence vectors
         (sahife_complete_dict_nrmlze) have not been built
  """
  similarVector=[]
  if quran:
    similarVector.append(calculateSimilarity(queryVector, verse_complete_dict_nrmlze))
  if nahj:
    similarVector.append(calculateSimilarity(queryVector, nahj_complete_dict_nrmlze))
  if sahifa:
    # Search the Sahifa corpus itself; silently answering with Nahj results would be wrong.
    try:
      sahifa_corpus = sahife_complete_dict_nrmlze
    except NameError:
      raise NotImplementedError(
          "sahife_complete_dict_nrmlze is not defined: build the Sahifa sentence vectors first"
      ) from None
    similarVector.append(calculateSimilarity(queryVector, sahifa_corpus))
  return similarVector

In [None]:
# Similarity between the input query and every sentence of the selected corpora (here: Quran only).
# The result is a list holding one list of similarities per selected corpus.
similarity = mostSimilar(query, quran=True)

In [None]:
# Sort the Quran verses by similarity to the query and select the top 10.
# similarity[0] holds one score per verse (Quran was the only corpus requested), so the
# scores are flattened and paired with the verse key and normalized text before ranking.
# Highest similarity comes first; NaN scores, if any, are pushed to the end.
sortedSimilarity = sorted(
    zip(
        verse_complete_dict_nrmlz.keys(),
        verse_complete_dict_nrmlze[0],
        np.asarray(similarity[0], dtype=float).ravel(),
    ),
    key=lambda item: -np.inf if np.isnan(item[2]) else item[2],
    reverse=True,
)
sortedSimilarity[:10]