In [48]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import seaborn as sns

In [59]:
# Load word2vec model
# model = KeyedVectors.load("/home/safeer.alyubary/diacritized-Arabic-corpuss/data_files/models/word2vec_model.model")
model = KeyedVectors.load("/home/safeer.alyubary/Semantic-Coherence/data_files/models/main_corpus_word2vec.model")


## Similarity Tests
Similarity tests measure how closely related two words are in the vector space of the word embedding model. The similarity score ranges from 0 to 1, where a score closer to 1 indicates higher similarity between the words.

In [60]:
word_pairs = [
    ('ملك', 'ملكة'),        # ('king', 'queen')
    ('رجل', 'امراة'),       # ('man', 'woman')
    ('مدينة', 'بلد'),       # ('city', 'country')
    ('سيارة', 'شاحنة'),     # ('car', 'truck')
    ('اخذ', 'اعطى'),        # ('took', 'gave')
    ('سلب', 'نهب'),         # ('looted', 'plundered')
    ('بطاطا', 'باب'),       # ('potato', 'door')
    ('قرا', 'كتب'),         # ('read', 'wrote')
    ('خرج', 'اكل'),         # ('went out', 'ate')
    ('مكتب', 'مطعم'),       # ('office', 'restaurant')
    ('استقبل', 'استفهم'),   # ('received', 'inquired')
    ('تبادل', 'تناظر'),     # ('exchanged', 'corresponded / were analogous')
    ('تقارب', 'تباعد'),     # ('converged', 'diverged')
    ('خروج', 'بطون'),       # ('exit', 'bellies')
    ('استلهم', 'استبشر'),   # ('inspired', 'rejoiced / felt hopeful')
    ('هرب', 'اعرب'),        # ('escaped', 'expressed')
    ('الح', 'اصبح'),        # ('insisted', 'became')
    ('ندد', 'هدد'),         # ('condemned', 'threatened')
    ('مرح', 'سرح'),         # ('had fun', 'roamed / wandered')
    ('تفاحة', 'شاحنة'),     # ('apple', 'truck')
]

for w1, w2 in word_pairs:
    similarity = model.wv.similarity(w1, w2)
    print(f"similarity between {w1} and {w2} is: {similarity}")


similarity between ملك and ملكة is: 0.635229229927063
similarity between رجل and امراة is: 0.7393512725830078
similarity between مدينة and بلد is: 0.5166240334510803
similarity between سيارة and شاحنة is: 0.8326432704925537
similarity between اخذ and اعطى is: 0.5262339115142822
similarity between سلب and نهب is: 0.4805051386356354
similarity between بطاطا and باب is: 0.21413539350032806
similarity between قرا and كتب is: 0.5189897418022156
similarity between خرج and اكل is: 0.30785199999809265
similarity between مكتب and مطعم is: 0.35948678851127625
similarity between استقبل and استفهم is: 0.2515539526939392
similarity between تبادل and تناظر is: 0.26939356327056885
similarity between تقارب and تباعد is: 0.5875691175460815
similarity between خروج and بطون is: 0.13542751967906952
similarity between استلهم and استبشر is: 0.3984569013118744
similarity between هرب and اعرب is: 0.29490986466407776
similarity between الح and اصبح is: 0.18107405304908752
similarity between ندد and هدد is: 0.6

## Analogy Tests
Analogy tests evaluate how well the word embedding model captures relationships between word pairs. This test involves finding a word that is related to a given word in the same way another pair of words is related. The scores indicate how well the model predicts the relationships, with higher scores showing better performance.


In [61]:
def analogy_test(model, word_a, word_b, word_c):
    try:
        result = model.wv.most_similar(positive=[word_b, word_c], negative=[word_a])
        return result[0]
    except KeyError as e:
        return str(e)

# List of analogies to test
analogies = [
    ('ملك', 'ملكة', 'رجل'),  # king : queen :: man : woman
    ('رجل', 'رجال', 'امراة'),  # man : men :: woman : women
    ('باريس', 'فرنسا', 'روما'),  # Paris : France :: Rome : Italy
    ('يمشي', 'مشى', 'يجري'),  # walk : walked :: run : ran
    ('الرياض', 'السعودية', 'ابوظبي'), # Riyadh: KSA :: Abu Dhabi
    ('صغير', 'اصغر', 'كبير')  # small : smallest :: big : biggest

]

for a, b, c in analogies:
    result = analogy_test(model, a, b, c)
    print(f"{a} : {b} :: {c} : {result[0]} (score: {result[1]})")


ملك : ملكة :: رجل : امراة (score: 0.7206618785858154)
رجل : رجال :: امراة : نساء (score: 0.7702681422233582)
باريس : فرنسا :: روما : ايطاليا (score: 0.8238568305969238)
يمشي : مشى :: يجري : اقتصر (score: 0.6323454976081848)
الرياض : السعودية :: ابوظبي : الاماراتية (score: 0.7888111472129822)
صغير : اصغر :: كبير : اكبر (score: 0.8239786624908447)


## Odd One Out Test

To further evaluate the performance of the word2vec model, I conducted "odd one out" tests. These tests involve identifying the word that doesn't match the context of a given set of words. This helps to assess how well the model understands the semantic relationships and distinctions between words. Here are some examples: 


In [62]:
words = ['ملك', 'ملكة', 'رجل', 'سيارة']
# ["king", "queen", "man", "car"]
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['ملك', 'ملكة', 'رجل', 'سيارة'] is: سيارة


In [63]:
words = ['أحب', 'كره', 'عشق', 'حب']
['love (verb)', 'hate', 'adore', 'love (noun)']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['أحب', 'كره', 'عشق', 'حب'] is: كره


In [64]:
words = ['طبيب', 'ممرض', 'مستشفى', 'صيدلي']
['doctor', 'nurse', 'hospital', 'pharmacist']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['طبيب', 'ممرض', 'مستشفى', 'صيدلي'] is: ممرض


In [65]:
words = ['مصر', 'فرنسا', 'ألمانيا', 'القاهرة']
# ['Egypt', 'France', 'Germany', 'Cairo']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['مصر', 'فرنسا', 'ألمانيا', 'القاهرة'] is: فرنسا


In [66]:
words = ['الاثنين', 'الثلاثاء', 'الأربعاء', 'يناير']
# ['Monday', 'Tuesday', 'Wednesday', 'January']
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['الاثنين', 'الثلاثاء', 'الأربعاء', 'يناير'] is: الاثنين


## Word Vector Arithmetic

To evaluate the semantic relationships captured by the word2vec model, I performed word vector arithmetic. This involves using the model to solve analogies by adding and subtracting word vectors. The resulting vector is then compared to the vectors in the model to find the closest match. This test demonstrates the model's ability to understand and manipulate word relationships in a meaningful way.


In [67]:
def perform_analogy(pos1, neg, pos2, model):
    result = model.wv.most_similar(positive=[pos1, pos2], negative=[neg], topn=1)
    return result

In [69]:
examples = [
    ('ملك', 'رجل', 'امراة'),      # ('king', 'man', 'woman')
    ('باريس', 'فرنسا', 'ايطاليا'), # ('Paris', 'France', 'Italy')
    ('طبيب', 'رجل', 'امراة'),     # ('doctor', 'man', 'woman')
    ('مدير', 'رجل', 'امراة'),     # ('manager/director', 'man', 'woman')
    ('الرياض', 'السعودية', 'مصر')  # ('Riyadh', 'Saudi Arabia', 'Egypt')
]


for pos1, neg, pos2 in examples:
    result = perform_analogy(pos1, neg, pos2, model)
    print(f"'{pos1}' - '{neg}' + '{pos2}' = {result[0][0]}, score: {result[0][1]}")

'ملك' - 'رجل' + 'امراة' = مملكة, score: 0.7344213724136353
'باريس' - 'فرنسا' + 'ايطاليا' = تورينو, score: 0.8501540422439575
'طبيب' - 'رجل' + 'امراة' = طبيبة, score: 0.7830656170845032
'مدير' - 'رجل' + 'امراة' = مديرة, score: 0.7995114326477051
'الرياض' - 'السعودية' + 'مصر' = القاهرة, score: 0.8165403604507446
