In [3]:
import os

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load word2vec model.
# The location defaults to the new-corpus model below and can be overridden by
# setting the WORD2VEC_MODEL_PATH environment variable, so the notebook is not
# tied to a single machine's directory layout.
# A previous run used this model instead:
#   /home/safeer.alyubary/diacritized-Arabic-corpuss/data_files/models/word2vec_model.model
MODEL_PATH = os.environ.get(
    "WORD2VEC_MODEL_PATH",
    "/home/safeer.alyubary/Semantic-Coherence/New-Corpus/models/new_corp_word2vec.model",
)
# NOTE(review): later cells access `model.wv`, so the saved file is presumably a
# full Word2Vec model rather than bare KeyedVectors — confirm.
model = KeyedVectors.load(MODEL_PATH)




## Similarity Tests
Similarity tests measure how closely related two words are in the vector space of the word embedding model. The score is the cosine similarity of the two word vectors, which ranges from -1 to 1; a score closer to 1 indicates higher similarity between the words.

In [44]:
# Arabic word pairs to compare; the English gloss of each pair is given in the
# trailing comment.
word_pairs = [
    ('ملك', 'ملكة'),        # ('king', 'queen')
    ('رجل', 'امرأة'),       # ('man', 'woman')
    ('مدينة', 'بلد'),       # ('city', 'country')
    ('سيارة', 'شاحنة'),     # ('car', 'truck')
    ('أخذ', 'أعطى'),        # ('took', 'gave')
    ('سلب', 'نهب'),         # ('looted', 'plundered')
    ('بطاطا', 'باب'),       # ('potato', 'door')
    ('قرأ', 'كتب'),         # ('read', 'wrote')
    ('خرج', 'أكل'),         # ('went out', 'ate')
    ('مكتب', 'مطعم'),       # ('office', 'restaurant')
    ('استقبل', 'استفهم'),   # ('received', 'inquired')
    ('تبادل', 'تناظر'),     # ('exchanged', 'corresponded / were analogous')
    ('تقارب', 'تباعد'),     # ('converged', 'diverged')
    ('خروج', 'بطون'),       # ('exit', 'bellies')
    ('استلهم', 'استبشر'),   # ('inspired', 'rejoiced / felt hopeful')
    ('هرب', 'أعرب'),        # ('escaped', 'expressed')
    ('ألح', 'أصبح'),        # ('insisted', 'became')
    ('ندد', 'هدد'),         # ('condemned', 'threatened')
    ('مرح', 'سرح'),         # ('had fun', 'roamed / wandered')
    ('تفاحة', 'شاحنة'),     # ('apple', 'truck')
]

# Print the model's similarity score for every pair.
for w1, w2 in word_pairs:
    try:
        similarity = model.wv.similarity(w1, w2)
    except KeyError as e:
        # A word is missing from the model vocabulary: report it and keep going
        # so one unknown word does not abort the remaining pairs.
        print(f"similarity between {w1} and {w2} is unavailable: {e}")
        continue
    print(f"similarity between {w1} and {w2} is: {similarity}")


similarity between ملك and ملكة is: 0.6190299987792969
similarity between رجل and امرأة is: 0.646419107913971
similarity between مدينة and بلد is: 0.5709558129310608
similarity between سيارة and شاحنة is: 0.8251516222953796
similarity between أخذ and أعطى is: 0.592511773109436
similarity between سلب and نهب is: 0.6218847036361694
similarity between بطاطا and باب is: 0.21397875249385834
similarity between قرأ and كتب is: 0.6785322427749634
similarity between خرج and أكل is: 0.2967958152294159
similarity between مكتب and مطعم is: 0.3980117440223694
similarity between استقبل and استفهم is: 0.16805686056613922
similarity between تبادل and تناظر is: 0.289451003074646
similarity between تقارب and تباعد is: 0.5851466059684753
similarity between خروج and بطون is: 0.22319623827934265
similarity between استلهم and استبشر is: 0.2918385863304138
similarity between هرب and أعرب is: 0.30460622906684875
similarity between ألح and أصبح is: 0.31934577226638794
similarity between ندد and هدد is: 0.72031

## Analogy Tests
Analogy tests evaluate how well the word embedding model captures relationships between word pairs. Given a reference pair (a, b) and a third word c, the model is asked for the word d that relates to c in the same way b relates to a (a : b :: c : d). The reported score is the similarity between the returned word and the query vector formed from b + c − a; higher scores indicate a closer match and therefore better performance.


In [45]:
def analogy_test(model, word_a, word_b, word_c):
    """Solve the analogy ``word_a : word_b :: word_c : ?`` with the embedding model.

    The query passes ``word_b`` and ``word_c`` as positive terms and ``word_a``
    as the negative term to ``model.wv.most_similar``.

    Args:
        model: Object exposing ``wv.most_similar(positive=..., negative=..., topn=...)``.
        word_a: First word of the reference pair (subtracted from the query).
        word_b: Second word of the reference pair (added to the query).
        word_c: Word for which the analogous counterpart is requested.

    Returns:
        A two-element tuple. On success it is the best ``(word, score)`` match.
        If the lookup raises ``KeyError`` (e.g. a word is not in the vocabulary)
        it is ``(error_message, nan)``, so callers can always index ``[0]`` and
        ``[1]`` without slicing characters out of an error string.
    """
    try:
        # Only the best match is returned, so a single candidate is enough.
        result = model.wv.most_similar(
            positive=[word_b, word_c], negative=[word_a], topn=1
        )
    except KeyError as e:
        return str(e), float("nan")
    return result[0]

# List of analogies to test
# Analogy triples (a, b, c), each read as "a is to b as c is to ?".
# The trailing comment gives the English gloss and the expected answer.
analogies = [
    ('ملك', 'ملكة', 'رجل'),            # king : queen :: man : woman
    ('رجل', 'رجال', 'امرأة'),          # man : men :: woman : women
    ('باريس', 'فرنسا', 'روما'),        # Paris : France :: Rome : Italy
    ('يمشي', 'مشى', 'يجري'),           # walk : walked :: run : ran
    ('الرياض', 'السعودية', 'أبوظبي'),  # Riyadh : KSA :: Abu Dhabi : UAE
    ('صغير', 'اصغر', 'كبير'),          # small : smallest :: big : biggest
]

# Show the model's top answer and its score for every triple.
for triple in analogies:
    a, b, c = triple
    result = analogy_test(model, *triple)
    print(f"{a} : {b} :: {c} : {result[0]} (score: {result[1]})")


ملك : ملكة :: رجل : امرأة (score: 0.7246387600898743)
رجل : رجال :: امرأة : نساء (score: 0.7070936560630798)
باريس : فرنسا :: روما : إيطاليا (score: 0.9166126251220703)
يمشي : مشى :: يجري : جرى (score: 0.8233355283737183)
الرياض : السعودية :: أبوظبي : الإمارات (score: 0.8265878558158875)
صغير : اصغر :: كبير : اكبر (score: 0.6937293410301208)


## Odd One Out Test

To further evaluate the performance of the word2vec model, I conducted "odd one out" tests. These tests involve identifying the word that doesn't match the context of a given set of words. This helps to assess how well the model understands the semantic relationships and distinctions between words. Here are some examples: 


In [46]:
# Candidate set for the odd-one-out query, with English glosses.
words = [
    'ملك',    # king
    'ملكة',   # queen
    'رجل',    # man
    'سيارة',  # car
]
# Ask the model which word does not match the others.
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['ملك', 'ملكة', 'رجل', 'سيارة'] is: سيارة


In [47]:
# Candidate set for the odd-one-out query.
words = ['أحب', 'كره', 'عشق', 'حب']
# ['love (verb)', 'hate', 'adore', 'love (noun)']
# Ask the model which word does not match the others.
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['أحب', 'كره', 'عشق', 'حب'] is: كره


In [48]:
# Candidate set for the odd-one-out query.
words = ['طبيب', 'ممرض', 'مستشفى', 'صيدلي']
# ['doctor', 'nurse', 'hospital', 'pharmacist']
# Ask the model which word does not match the others.
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['طبيب', 'ممرض', 'مستشفى', 'صيدلي'] is: مستشفى


In [49]:
# Candidate set for the odd-one-out query, with English glosses.
words = [
    'مصر',      # Egypt
    'فرنسا',    # France
    'ألمانيا',  # Germany
    'القاهرة',  # Cairo
]
# Ask the model which word does not match the others.
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['مصر', 'فرنسا', 'ألمانيا', 'القاهرة'] is: القاهرة


In [50]:
# Candidate set for the odd-one-out query, with English glosses.
words = [
    'الاثنين',    # Monday
    'الثلاثاء',   # Tuesday
    'الأربعاء',   # Wednesday
    'يناير',      # January
]
# Ask the model which word does not match the others.
odd_word = model.wv.doesnt_match(words)
print(f"The odd word out in {words} is: {odd_word}")

The odd word out in ['الاثنين', 'الثلاثاء', 'الأربعاء', 'يناير'] is: يناير


## Word Vector Arithmetic

To evaluate the semantic relationships captured by the word2vec model, I performed word vector arithmetic. This involves using the model to solve analogies by adding and subtracting word vectors. The resulting vector is then compared to the vectors in the model to find the closest match. This test demonstrates the model's ability to understand and manipulate word relationships in a meaningful way.


In [51]:
def perform_analogy(pos1, neg, pos2, model):
    """Evaluate the vector arithmetic ``pos1 - neg + pos2`` against the model.

    Args:
        pos1: First word added to the query.
        neg: Word subtracted from the query.
        pos2: Second word added to the query.
        model: Object exposing ``wv.most_similar``.

    Returns:
        Whatever ``model.wv.most_similar`` returns for ``topn=1`` (the caller in
        this notebook reads it as a one-element list of ``(word, score)``).
    """
    vectors = model.wv
    added = [pos1, pos2]
    subtracted = [neg]
    return vectors.most_similar(positive=added, negative=subtracted, topn=1)

In [52]:
# Each entry is (pos1, neg, pos2) and is evaluated as pos1 - neg + pos2.
examples = [
    ('ملك', 'رجل', 'امرأة'),          # king - man + woman
    ('باريس', 'فرنسا', 'إيطاليا'),    # Paris - France + Italy
    ('طبيب', 'رجل', 'امرأة'),         # doctor - man + woman
    ('مدير', 'رجل', 'امرأة'),         # manager/director - man + woman
    ('الرياض', 'السعودية', 'مصر'),    # Riyadh - Saudi Arabia + Egypt
]

# Print the single best match and its score for every expression.
for triple in examples:
    pos1, neg, pos2 = triple
    result = perform_analogy(*triple, model)
    print(f"'{pos1}' - '{neg}' + '{pos2}' = {result[0][0]}, score: {result[0][1]}")

'ملك' - 'رجل' + 'امرأة' = ملكة, score: 0.7220497727394104
'باريس' - 'فرنسا' + 'إيطاليا' = روما, score: 0.9102364778518677
'طبيب' - 'رجل' + 'امرأة' = طبيبة, score: 0.8451983332633972
'مدير' - 'رجل' + 'امرأة' = مديرة, score: 0.7479687333106995
'الرياض' - 'السعودية' + 'مصر' = القاهرة, score: 0.8994781970977783
