## These are the similarities and some other tests for the main corpus (diacritized)

### In this corpus the Hamzah (ء) is missing from most of the words (it had already been removed in the original corpus)

In [None]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bidi.algorithm import get_display
import arabic_reshaper


In [None]:
# Load the trained word2vec embeddings from disk.
# NOTE(review): the other cells read vectors through `model.wv`, which suggests
# the file holds a full Word2Vec model rather than bare KeyedVectors — confirm.
MODEL_PATH = "main_corpus_diacritized_word2vec_no_stopwords.model"
model = KeyedVectors.load(MODEL_PATH)

## Similarity Tests
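Similarity tests measure how closely related two words are in the vector space of the word embedding model. The score is the cosine similarity of the two word vectors, so it ranges from -1 to 1: values close to 1 indicate highly similar words, values near 0 indicate unrelated words, and negative values indicate vectors that point in opposite directions.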

In [10]:
# Diacritized word pairs to compare, one (first_word, second_word) tuple per line.
word_pairs_diacritized = [
    ('مَلِكُ', 'مَلِكَةُ'),
    ('رَجُلٌ', 'امْراَةً'),
    ('مَدينَةُ', 'بَلَدُ'),
    ('سَيّارَةٌ', 'شاحِنَةٌ'),
    ('اَخْذَ', 'أَعْطَى'),
    ('سَلْبُ', 'نَهْبُ'),
    ('خَرَجَ', 'اكُلَ'),
    ('مَكْتَبُ', 'مَطْعَمُ'),
    ('اسْتَقْبَلَ', 'اسْتَفْهَمَ'),
    ('تَبادُلَ', 'تَناظَرَ'),
    ('تَقارُبٌ', 'تَباعُدُ'),
    ('خُروجُ', 'بُطونَ'),
    ('اُسْتُلْهِمَ', 'اسْتَبْشَرَ'),
    ('هَرَبَ', 'أَعْرَبَ'),
    ('الحْ', 'اَصْبَحَ'),
    ('نَدَّدَ', 'هَدَّدَ'),
    ('مَرَحَ', 'سَرَحْ'),
    ('بَطاطا', 'بابَ'),
    ('قَرا', 'كُتُبَ'),
    ('تُفّاحَةً', 'شاحِنَةٌ'),
]

# Print the cosine similarity of every pair. A word that is missing from the
# model's vocabulary makes the lookup raise KeyError; report that pair and keep
# going so one out-of-vocabulary word does not abort the remaining pairs.
for w1, w2 in word_pairs_diacritized:
    try:
        similarity = model.wv.similarity(w1, w2)
    except KeyError as e:
        print(f"similarity between {w1} and {w2} could not be computed: {e}")
        continue
    print(f"similarity between {w1} and {w2} is: {similarity}")


similarity between مَلِكُ and مَلِكَةُ is: 0.38107478618621826
similarity between رَجُلٌ and امْراَةً is: 0.5956306457519531
similarity between مَدينَةُ and بَلَدُ is: 0.47277987003326416
similarity between سَيّارَةٌ and شاحِنَةٌ is: 0.7797516584396362
similarity between اَخْذَ and أَعْطَى is: 0.5882586240768433
similarity between سَلْبُ and نَهْبُ is: 0.26046836376190186
similarity between خَرَجَ and اكُلَ is: 0.3807080090045929
similarity between مَكْتَبُ and مَطْعَمُ is: 0.2825270891189575
similarity between اسْتَقْبَلَ and اسْتَفْهَمَ is: 0.10714126378297806
similarity between تَبادُلَ and تَناظَرَ is: 0.14738820493221283
similarity between تَقارُبٌ and تَباعُدُ is: 0.351602703332901
similarity between خُروجُ and بُطونَ is: 0.25128287076950073
similarity between اُسْتُلْهِمَ and اسْتَبْشَرَ is: 0.3010907769203186
similarity between هَرَبَ and أَعْرَبَ is: 0.22660547494888306
similarity between الحْ and اَصْبَحَ is: 0.31345146894454956
similarity between نَدَّدَ and هَدَّدَ is: 0.74

## Analogy Tests
Analogy tests evaluate how well the word embedding model captures relationships between word pairs. Given a pair (a, b) and a third word c, the model is asked for the word that relates to c in the same way that b relates to a. The reported score is the cosine similarity between the returned word and the combined query vector: a higher score means a closer match in the vector space, but it does not by itself guarantee that the returned word is the correct answer.


In [12]:
def analogy_test(model, word_a, word_b, word_c):
    """Answer the analogy ``word_a : word_b :: word_c : ?`` with the model.

    Queries ``model.wv.most_similar`` with ``word_b`` and ``word_c`` as the
    positive terms and ``word_a`` as the negative term.

    Returns:
        The first entry of the ``most_similar`` result on success, or the
        ``KeyError`` message as a string when the lookup raises ``KeyError``.
    """
    try:
        ranked = model.wv.most_similar(
            positive=[word_b, word_c],
            negative=[word_a],
        )
    except KeyError as missing:
        return str(missing)
    return ranked[0]

# Analogy triples (a, b, c), each read as "a : b :: c : ?".
# The gloss above each triple names the expected answer last.
analogies = [
    # king : queen :: man : woman
    ('مَلِكُ', 'مَلِكَةُ', 'رَجُلٌ'),
    # man : men :: woman : women
    ('رَجُلٌ', 'رِجالٌ', 'امْراَةً'),
    # Paris : France :: Rome : Italy
    ('باريسْ', 'فَرَنْسا', 'روما'),
    # walk : walked :: run : ran
    ('يَمْشي', 'مَشَى', 'يَجْري'),
    # Riyadh : Saudi Arabia :: Abu Dhabi : UAE
    ('الرّياضُ', 'السُّعوديَّةُ', 'أَبوظَبْي'),
    # small : smallest :: big : biggest
    ('صَغيرٌ', 'أَصْغَرُ', 'كَبيرٌ'),
]



# Run every analogy and print the model's best answer with its score.
for a, b, c in analogies:
    result = analogy_test(model, a, b, c)
    if isinstance(result, str):
        # analogy_test returns the KeyError message (a str) when a word is not
        # in the vocabulary; indexing it would print the first two characters
        # of that message as if they were the answer and the score.
        print(f"{a} : {b} :: {c} : no result ({result})")
        continue
    predicted_word, score = result
    print(f"{a} : {b} :: {c} : {predicted_word} (score: {score})")


مَلِكُ : مَلِكَةُ :: رَجُلٌ : شابٌّ (score: 0.5492724776268005)
رَجُلٌ : رِجالٌ :: امْراَةً : بِزَوْجاتٍ (score: 0.6441929340362549)
باريسْ : فَرَنْسا :: روما : ايطاليا (score: 0.7297506332397461)
يَمْشي : مَشَى :: يَجْري : جَرَى (score: 0.636982262134552)
الرّياضُ : السُّعوديَّةُ :: أَبوظَبْي : دُبَيّْ (score: 0.6729374527931213)
صَغيرٌ : أَصْغَرُ :: كَبيرٌ : أَكْبَرَ (score: 0.6578704118728638)


## Odd One Out Test

To further evaluate the performance of the word2vec model, I conducted "odd one out" tests. These tests involve identifying the word that doesn't match the context of a given set of words. This helps to assess how well the model understands the semantic relationships and distinctions between words. Here are some examples: 


In [24]:
# Ask the model which of the four words fits least with the others.
# Approximate glosses: king, queen, man, car.
words = ['مَلِكُ', 'مَلِكَةُ', 'رَجُلٌ', 'سَيّارَةٌ']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['مَلِكُ', 'مَلِكَةُ', 'رَجُلٌ', 'سَيّارَةٌ'] is: سَيّارَةٌ


In [25]:
# Ask the model which of the four words fits least with the others.
# NOTE(review): presumably three love-related words and one hate-related
# word (the second entry) — confirm the intended outlier.
words = ['أَحَبُّ', 'كَرِهٌ', 'عِشْقَ', 'حُبُّ']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['أَحَبُّ', 'كَرِهٌ', 'عِشْقَ', 'حُبُّ'] is: كَرِهٌ


In [26]:
# Ask the model which of the four words fits least with the others.
# Approximate glosses: doctor, nurse, hospital, pharmacist.
# NOTE(review): presumably the hospital (a place among professions) is the
# intended outlier — confirm.
words = ['طَبيبٌ', 'مُمَرِّضٌ', 'مُسْتَشْفَى', 'صَّيْدَليّْ']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['طَبيبٌ', 'مُمَرِّضٌ', 'مُسْتَشْفَى', 'صَّيْدَليّْ'] is: مُمَرِّضٌ


In [27]:
# Ask the model which of the four words fits least with the others.
# NOTE(review): all four words look like derivations of the same root
# (writing/books), so there may be no single intended outlier — confirm.
words = ['كِتابُ', 'كُتِبتْ', 'يَكْتُبُ', 'كُتُبَ']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['كِتابُ', 'كُتِبتْ', 'يَكْتُبُ', 'كُتُبَ'] is: كِتابُ


## Word Vector Arithmetic

To evaluate the semantic relationships captured by the word2vec model, I performed word vector arithmetic. This involves using the model to solve analogies by adding and subtracting word vectors. The resulting vector is then compared to the vectors in the model to find the closest match. This test demonstrates the model's ability to understand and manipulate word relationships in a meaningful way.


In [29]:
# Helper for the word vector arithmetic examples.
def perform_analogy(pos1, neg, pos2, model):
    """Look up the single best match for ``pos1 - neg + pos2``.

    Delegates to ``model.wv.most_similar`` with ``pos1`` and ``pos2`` as the
    positive terms, ``neg`` as the negative term and ``topn=1``, and returns
    that call's result unchanged.
    """
    return model.wv.most_similar(
        positive=[pos1, pos2],
        negative=[neg],
        topn=1,
    )


In [31]:
# Arithmetic triples (pos1, neg, pos2), each evaluated as pos1 - neg + pos2.
examples = [
    ('مَلِكُ', 'رَجُلٌ', 'امْراَةً'),
    ('باريسْ', 'فَرَنْسا', 'ايطاليا'),
    ('طَبيبٌ', 'رَجُلٌ', 'امْراَةً'),
    ('مُديرُ', 'رَجُلٌ', 'امْراَةً'),
    ('الرّياضُ', 'السُّعوديَّةُ', 'مِصْرُ'),
]

# Evaluate each triple and print the top-ranked word together with its score.
for pos1, neg, pos2 in examples:
    result = perform_analogy(pos1, neg, pos2, model)
    top_word, top_score = result[0]
    print(f"'{pos1}' - '{neg}' + '{pos2}' = {top_word}, score: {top_score}")

'مَلِكُ' - 'رَجُلٌ' + 'امْراَةً' = مَلَكِها, score: 0.7091062068939209
'باريسْ' - 'فَرَنْسا' + 'ايطاليا' = باريسَ, score: 0.7254602313041687
'طَبيبٌ' - 'رَجُلٌ' + 'امْراَةً' = مُمَرِّضَةٍ, score: 0.671596348285675
'مُديرُ' - 'رَجُلٌ' + 'امْراَةً' = مُديرَةُ, score: 0.7435939908027649
'الرّياضُ' - 'السُّعوديَّةُ' + 'مِصْرُ' = بسيونى, score: 0.6481846570968628
