# __Word Analogy Evaluations of Word Representation Task__

## # Evaluation of __Arabic Wikipedia__ Word Embedding Models (With Bots):

In [1]:
import warnings
import pandas as pd
from pathlib import Path
from embeddings_utils import *
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Evaluate every Arabic Wikipedia (with bots) embedding model on the analogy
# dataset at several top-k cut-offs, printing progress and writing one result
# row per (top_k, model, file) to 'arwiki_wem_results_bots.csv'.

# Model files are discovered on disk by their naming convention.
w2v_models = [str(x) for x in Path('Word-Embeddings/Word2Vec-Models/ar/').glob('arwiki_20230101_w2v_*_bots.model')]
glove_models = [str(x) for x in Path('Word-Embeddings/GloVe-Models/ar/').glob('arwiki*_bots_glove.vectors')]
fasttext_models = [str(x) for x in Path('Word-Embeddings/fastText-Models/ar/').glob('arwiki_20230101_fasttext_*_bots.model')]

models = w2v_models + fasttext_models + glove_models

files = [str(x) for x in Path('.').glob('ASAD/Arab_States_Analogy_Dataset_All.txt')]

print("## EVALUATION OF ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH BOTS):")

top_ks = [1, 5, 10]
result_columns = ['Model', 'Params', 'File', 'Accuracy']
dataframes = []

# Parse each analogy file once; every evaluation below works on a fresh copy
# so the file is not re-read for each (top_k, model) pair.
analogy_sets = {}
for file in files:
    parsed_set = pd.read_csv(file, sep=" ", header=None)
    parsed_set.columns = ["example1", "example2", "query", "answer"]
    analogy_sets[file] = parsed_set

for top_k in top_ks:
    for model_path in models:
        # Path(...).name is separator-agnostic, unlike splitting on '/'.
        model_name = Path(model_path).name
        print(f"   @ MODEL: {model_name}")

        # NOTE(review): the model is loaded again for every top_k because the
        # top_k loop is outermost; swapping the loops would load each model
        # once but would change the order of the log and of the CSV rows.
        if 'glove' in model_name:
            model = load_model(model_path, 'glove')
            # GloVe models expose key_to_index directly.
            # NOTE(review): the -1 presumably discounts a special token - confirm.
            vocab_size = len(model.key_to_index) - 1
            analogy_kwargs = {'model_format': 'glove'}
        else:
            model = load_model(model_path)
            vocab_size = len(model.wv.key_to_index)
            analogy_kwargs = {}

        model_params = f'<vocab_size={vocab_size:,d}, vector_size={model.vector_size}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            analogy_test_set = analogy_sets[file].copy()

            analogy_test_set['pred_answer'] = analogy_test_set.apply(
                get_analogy_by_row, model=model, top_k=top_k, axis=1, **analogy_kwargs
            )

            # A row is a hit when the expected answer is contained in the prediction.
            # NOTE(review): assumes get_analogy_by_row returns a collection of
            # words; with a plain string this would be a substring test.
            analogy_test_set['is_accurate'] = [
                int(answer in predicted)
                for answer, predicted in zip(analogy_test_set['answer'], analogy_test_set['pred_answer'])
            ]
            accuracy = analogy_test_set['is_accurate'].sum() / len(analogy_test_set) * 100
            rounded_accuracy = round(accuracy, 2)

            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {rounded_accuracy}%')

            dataframes.append([model_name, model_params, file, rounded_accuracy])

# Explicit columns keep the CSV header present even when nothing was evaluated.
dataframes = pd.DataFrame(dataframes, columns=result_columns)
dataframes.to_csv('arwiki_wem_results_bots.csv', index=False)

## EVALUATION OF ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH BOTS):
   @ MODEL: arwiki_20230101_w2v_skipgram_bots.model
   @ PARAMS: <vocab_size=2,122,484, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 53.82%
   @ MODEL: arwiki_20230101_w2v_cbow_bots.model
   @ PARAMS: <vocab_size=2,122,484, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 53.88%
   @ MODEL: arwiki_20230101_fasttext_cbow_bots.model
   @ PARAMS: <vocab_size=2,122,484, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 21.97%
   @ MODEL: arwiki_20230101_fasttext_skipgram_bots.model
   @ PARAMS: <vocab_size=2,122,484, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 39.67%
   @ MODEL: arwiki_20230101_bots_glove.vectors
   @ PARAMS: <vocab_size=2,122,484, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURA

## # Evaluation of __Arabic Wikipedia__ Word Embedding Models (With No Bots):

In [2]:
import warnings
import pandas as pd
from pathlib import Path
from embeddings_utils import *
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Evaluate every Arabic Wikipedia (no bots) embedding model on the analogy
# dataset at several top-k cut-offs, printing progress and writing one result
# row per (top_k, model, file) to 'arwiki_wem_results_nobots.csv'.

# Model files are discovered on disk by their naming convention.
w2v_models = [str(x) for x in Path('Word-Embeddings/Word2Vec-Models/ar/').glob('arwiki_20230101_w2v_*_nobots.model')]
glove_models = [str(x) for x in Path('Word-Embeddings/GloVe-Models/ar/').glob('arwiki*_nobots_glove.vectors')]
fasttext_models = [str(x) for x in Path('Word-Embeddings/fastText-Models/ar/').glob('arwiki_20230101_fasttext_*_nobots.model')]

models = w2v_models + fasttext_models + glove_models

files = [str(x) for x in Path('.').glob('ASAD/Arab_States_Analogy_Dataset_All.txt')]

print("## EVALUATION OF ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH NO BOTS):")

top_ks = [1, 5, 10]
result_columns = ['Model', 'Params', 'File', 'Accuracy']
dataframes = []

# Parse each analogy file once; every evaluation below works on a fresh copy
# so the file is not re-read for each (top_k, model) pair.
analogy_sets = {}
for file in files:
    parsed_set = pd.read_csv(file, sep=" ", header=None)
    parsed_set.columns = ["example1", "example2", "query", "answer"]
    analogy_sets[file] = parsed_set

for top_k in top_ks:
    for model_path in models:
        # Path(...).name is separator-agnostic, unlike splitting on '/'.
        model_name = Path(model_path).name
        print(f"   @ MODEL: {model_name}")

        # NOTE(review): the model is loaded again for every top_k because the
        # top_k loop is outermost; swapping the loops would load each model
        # once but would change the order of the log and of the CSV rows.
        if 'glove' in model_name:
            model = load_model(model_path, 'glove')
            # GloVe models expose key_to_index directly.
            # NOTE(review): the -1 presumably discounts a special token - confirm.
            vocab_size = len(model.key_to_index) - 1
            analogy_kwargs = {'model_format': 'glove'}
        else:
            model = load_model(model_path)
            vocab_size = len(model.wv.key_to_index)
            analogy_kwargs = {}

        model_params = f'<vocab_size={vocab_size:,d}, vector_size={model.vector_size}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            analogy_test_set = analogy_sets[file].copy()

            analogy_test_set['pred_answer'] = analogy_test_set.apply(
                get_analogy_by_row, model=model, top_k=top_k, axis=1, **analogy_kwargs
            )

            # A row is a hit when the expected answer is contained in the prediction.
            # NOTE(review): assumes get_analogy_by_row returns a collection of
            # words; with a plain string this would be a substring test.
            analogy_test_set['is_accurate'] = [
                int(answer in predicted)
                for answer, predicted in zip(analogy_test_set['answer'], analogy_test_set['pred_answer'])
            ]
            accuracy = analogy_test_set['is_accurate'].sum() / len(analogy_test_set) * 100
            rounded_accuracy = round(accuracy, 2)

            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {rounded_accuracy}%')

            dataframes.append([model_name, model_params, file, rounded_accuracy])

# Explicit columns keep the CSV header present even when nothing was evaluated.
dataframes = pd.DataFrame(dataframes, columns=result_columns)
dataframes.to_csv('arwiki_wem_results_nobots.csv', index=False)

## EVALUATION OF ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH NO BOTS):
   @ MODEL: arwiki_20230101_w2v_skipgram_nobots.model
   @ PARAMS: <vocab_size=2,102,375, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 54.47%
   @ MODEL: arwiki_20230101_w2v_cbow_nobots.model
   @ PARAMS: <vocab_size=2,102,375, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 53.22%
   @ MODEL: arwiki_20230101_fasttext_skipgram_nobots.model
   @ PARAMS: <vocab_size=2,102,375, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 39.87%
   @ MODEL: arwiki_20230101_fasttext_cbow_nobots.model
   @ PARAMS: <vocab_size=2,102,375, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 22.76%
   @ MODEL: arwiki_20230101_nobots_glove.vectors
   @ PARAMS: <vocab_size=2,102,375, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
  

## # Evaluation of __Egyptian Arabic Wikipedia__ Word Embedding Models (*only* With Bots):
We dropped the Egyptian Arabic Wikipedia from the evaluation _with no bots_ because it has an insignificant number of bot-generated articles (only 15).

In [3]:
import warnings
import pandas as pd
from pathlib import Path
from embeddings_utils import *
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Evaluate every Egyptian Arabic Wikipedia embedding model on the analogy
# dataset at several top-k cut-offs, printing progress and writing one result
# row per (top_k, model, file) to 'arzwiki_wem_results_bots.csv'.

# Model files are discovered on disk by their naming convention.
w2v_models = [str(x) for x in Path('Word-Embeddings/Word2Vec-Models/arz/').glob('arzwiki_20230101_w2v_*.model')]
glove_models = [str(x) for x in Path('Word-Embeddings/GloVe-Models/arz/').glob('arzwiki_*_glove.vectors')]
fasttext_models = [str(x) for x in Path('Word-Embeddings/fastText-Models/arz/').glob('arzwiki_20230101_fasttext_*.model')]

models = w2v_models + fasttext_models + glove_models

files = [str(x) for x in Path('.').glob('ASAD/Arab_States_Analogy_Dataset_All.txt')]

print("## EVALUATION OF EGYPTIAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (ONLY WITH BOTS):")

top_ks = [1, 5, 10]
result_columns = ['Model', 'Params', 'File', 'Accuracy']
dataframes = []

# Parse each analogy file once; every evaluation below works on a fresh copy
# so the file is not re-read for each (top_k, model) pair.
analogy_sets = {}
for file in files:
    parsed_set = pd.read_csv(file, sep=" ", header=None)
    parsed_set.columns = ["example1", "example2", "query", "answer"]
    analogy_sets[file] = parsed_set

for top_k in top_ks:
    for model_path in models:
        # Path(...).name is separator-agnostic, unlike splitting on '/'.
        model_name = Path(model_path).name
        print(f"   @ MODEL: {model_name}")

        # NOTE(review): the model is loaded again for every top_k because the
        # top_k loop is outermost; swapping the loops would load each model
        # once but would change the order of the log and of the CSV rows.
        if 'glove' in model_name:
            model = load_model(model_path, 'glove')
            # GloVe models expose key_to_index directly.
            # NOTE(review): the -1 presumably discounts a special token - confirm.
            vocab_size = len(model.key_to_index) - 1
            analogy_kwargs = {'model_format': 'glove'}
        else:
            model = load_model(model_path)
            vocab_size = len(model.wv.key_to_index)
            analogy_kwargs = {}

        model_params = f'<vocab_size={vocab_size:,d}, vector_size={model.vector_size}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            analogy_test_set = analogy_sets[file].copy()

            analogy_test_set['pred_answer'] = analogy_test_set.apply(
                get_analogy_by_row, model=model, top_k=top_k, axis=1, **analogy_kwargs
            )

            # A row is a hit when the expected answer is contained in the prediction.
            # NOTE(review): assumes get_analogy_by_row returns a collection of
            # words; with a plain string this would be a substring test.
            analogy_test_set['is_accurate'] = [
                int(answer in predicted)
                for answer, predicted in zip(analogy_test_set['answer'], analogy_test_set['pred_answer'])
            ]
            accuracy = analogy_test_set['is_accurate'].sum() / len(analogy_test_set) * 100
            rounded_accuracy = round(accuracy, 2)

            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {rounded_accuracy}%')

            dataframes.append([model_name, model_params, file, rounded_accuracy])

# Explicit columns keep the CSV header present even when nothing was evaluated.
dataframes = pd.DataFrame(dataframes, columns=result_columns)
dataframes.to_csv('arzwiki_wem_results_bots.csv', index=False)

## EVALUATION OF EGYPTIAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (ONLY WITH BOTS):
   @ MODEL: arzwiki_20230101_w2v_cbow.model
   @ PARAMS: <vocab_size=487,466, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 13.88%
   @ MODEL: arzwiki_20230101_w2v_skipgram.model
   @ PARAMS: <vocab_size=487,466, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 5.0%
   @ MODEL: arzwiki_20230101_fasttext_skipgram.model
   @ PARAMS: <vocab_size=487,466, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 10.13%
   @ MODEL: arzwiki_20230101_fasttext_cbow.model
   @ PARAMS: <vocab_size=487,466, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 11.64%
   @ MODEL: arzwiki_20230101_glove.vectors
   @ PARAMS: <vocab_size=487,466, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 0.53%
   @ MOD

## # Evaluation of __Moroccan Arabic Wikipedia__ Word Embedding Models (With Bots):

In [4]:
import warnings
import pandas as pd
from pathlib import Path
from embeddings_utils import *
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Evaluate every Moroccan Arabic Wikipedia (with bots) embedding model on the
# analogy dataset at several top-k cut-offs, printing progress and writing one
# result row per (top_k, model, file) to 'arywiki_wem_results_bots.csv'.

# Model files are discovered on disk by their naming convention.
w2v_models = [str(x) for x in Path('Word-Embeddings/Word2Vec-Models/ary/').glob('arywiki_20230101_w2v_*_bots.model')]
glove_models = [str(x) for x in Path('Word-Embeddings/GloVe-Models/ary/').glob('arywiki*_bots_glove.vectors')]
fasttext_models = [str(x) for x in Path('Word-Embeddings/fastText-Models/ary/').glob('arywiki_20230101_fasttext_*_bots.model')]

models = w2v_models + fasttext_models + glove_models

files = [str(x) for x in Path('.').glob('ASAD/Arab_States_Analogy_Dataset_All.txt')]

print("## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH BOTS):")

top_ks = [1, 5, 10]
result_columns = ['Model', 'Params', 'File', 'Accuracy']
dataframes = []

# Parse each analogy file once; every evaluation below works on a fresh copy
# so the file is not re-read for each (top_k, model) pair.
analogy_sets = {}
for file in files:
    parsed_set = pd.read_csv(file, sep=" ", header=None)
    parsed_set.columns = ["example1", "example2", "query", "answer"]
    analogy_sets[file] = parsed_set

for top_k in top_ks:
    for model_path in models:
        # Path(...).name is separator-agnostic, unlike splitting on '/'.
        model_name = Path(model_path).name
        print(f"   @ MODEL: {model_name}")

        # NOTE(review): the model is loaded again for every top_k because the
        # top_k loop is outermost; swapping the loops would load each model
        # once but would change the order of the log and of the CSV rows.
        if 'glove' in model_name:
            model = load_model(model_path, 'glove')
            # GloVe models expose key_to_index directly.
            # NOTE(review): the -1 presumably discounts a special token - confirm.
            vocab_size = len(model.key_to_index) - 1
            analogy_kwargs = {'model_format': 'glove'}
        else:
            model = load_model(model_path)
            vocab_size = len(model.wv.key_to_index)
            analogy_kwargs = {}

        model_params = f'<vocab_size={vocab_size:,d}, vector_size={model.vector_size}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            analogy_test_set = analogy_sets[file].copy()

            analogy_test_set['pred_answer'] = analogy_test_set.apply(
                get_analogy_by_row, model=model, top_k=top_k, axis=1, **analogy_kwargs
            )

            # A row is a hit when the expected answer is contained in the prediction.
            # NOTE(review): assumes get_analogy_by_row returns a collection of
            # words; with a plain string this would be a substring test.
            analogy_test_set['is_accurate'] = [
                int(answer in predicted)
                for answer, predicted in zip(analogy_test_set['answer'], analogy_test_set['pred_answer'])
            ]
            accuracy = analogy_test_set['is_accurate'].sum() / len(analogy_test_set) * 100
            rounded_accuracy = round(accuracy, 2)

            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {rounded_accuracy}%')

            dataframes.append([model_name, model_params, file, rounded_accuracy])

# Explicit columns keep the CSV header present even when nothing was evaluated.
dataframes = pd.DataFrame(dataframes, columns=result_columns)
dataframes.to_csv('arywiki_wem_results_bots.csv', index=False)

## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH BOTS):
   @ MODEL: arywiki_20230101_w2v_cbow_bots.model
   @ PARAMS: <vocab_size=72,467, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 1.91%
   @ MODEL: arywiki_20230101_w2v_skipgram_bots.model
   @ PARAMS: <vocab_size=72,467, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 2.11%
   @ MODEL: arywiki_20230101_fasttext_cbow_bots.model
   @ PARAMS: <vocab_size=72,467, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 1.71%
   @ MODEL: arywiki_20230101_fasttext_skipgram_bots.model
   @ PARAMS: <vocab_size=72,467, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 3.68%
   @ MODEL: arywiki_20230101_bots_glove.vectors
   @ PARAMS: <vocab_size=72,467, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 0

## # Evaluation of __Moroccan Arabic Wikipedia__ Word Embedding Models (With No Bots):

In [5]:
import warnings
import pandas as pd
from pathlib import Path
from embeddings_utils import *
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Evaluate every Moroccan Arabic Wikipedia (no bots) embedding model on the
# analogy dataset at several top-k cut-offs, printing progress and writing one
# result row per (top_k, model, file) to 'arywiki_wem_results_nobots.csv'.

# Model files are discovered on disk by their naming convention.
w2v_models = [str(x) for x in Path('Word-Embeddings/Word2Vec-Models/ary/').glob('arywiki_20230101_w2v_*_nobots.model')]
glove_models = [str(x) for x in Path('Word-Embeddings/GloVe-Models/ary/').glob('arywiki*_nobots_glove.vectors')]
fasttext_models = [str(x) for x in Path('Word-Embeddings/fastText-Models/ary/').glob('arywiki_20230101_fasttext_*_nobots.model')]

models = w2v_models + fasttext_models + glove_models

files = [str(x) for x in Path('.').glob('ASAD/Arab_States_Analogy_Dataset_All.txt')]

print("## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH NO BOTS):")

top_ks = [1, 5, 10]
result_columns = ['Model', 'Params', 'File', 'Accuracy']
dataframes = []

# Parse each analogy file once; every evaluation below works on a fresh copy
# so the file is not re-read for each (top_k, model) pair.
analogy_sets = {}
for file in files:
    parsed_set = pd.read_csv(file, sep=" ", header=None)
    parsed_set.columns = ["example1", "example2", "query", "answer"]
    analogy_sets[file] = parsed_set

for top_k in top_ks:
    for model_path in models:
        # Path(...).name is separator-agnostic, unlike splitting on '/'.
        model_name = Path(model_path).name
        print(f"   @ MODEL: {model_name}")

        # NOTE(review): the model is loaded again for every top_k because the
        # top_k loop is outermost; swapping the loops would load each model
        # once but would change the order of the log and of the CSV rows.
        if 'glove' in model_name:
            model = load_model(model_path, 'glove')
            # GloVe models expose key_to_index directly.
            # NOTE(review): the -1 presumably discounts a special token - confirm.
            vocab_size = len(model.key_to_index) - 1
            analogy_kwargs = {'model_format': 'glove'}
        else:
            model = load_model(model_path)
            vocab_size = len(model.wv.key_to_index)
            analogy_kwargs = {}

        model_params = f'<vocab_size={vocab_size:,d}, vector_size={model.vector_size}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            analogy_test_set = analogy_sets[file].copy()

            analogy_test_set['pred_answer'] = analogy_test_set.apply(
                get_analogy_by_row, model=model, top_k=top_k, axis=1, **analogy_kwargs
            )

            # A row is a hit when the expected answer is contained in the prediction.
            # NOTE(review): assumes get_analogy_by_row returns a collection of
            # words; with a plain string this would be a substring test.
            analogy_test_set['is_accurate'] = [
                int(answer in predicted)
                for answer, predicted in zip(analogy_test_set['answer'], analogy_test_set['pred_answer'])
            ]
            accuracy = analogy_test_set['is_accurate'].sum() / len(analogy_test_set) * 100
            rounded_accuracy = round(accuracy, 2)

            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {rounded_accuracy}%')

            dataframes.append([model_name, model_params, file, rounded_accuracy])

# Explicit columns keep the CSV header present even when nothing was evaluated.
dataframes = pd.DataFrame(dataframes, columns=result_columns)
dataframes.to_csv('arywiki_wem_results_nobots.csv', index=False)

## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA WORD EMBEDDING MODELS (WITH NO BOTS):
   @ MODEL: arywiki_20230101_w2v_skipgram_nobots.model
   @ PARAMS: <vocab_size=71,704, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 2.11%
   @ MODEL: arywiki_20230101_w2v_cbow_nobots.model
   @ PARAMS: <vocab_size=71,704, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 1.84%
   @ MODEL: arywiki_20230101_fasttext_skipgram_nobots.model
   @ PARAMS: <vocab_size=71,704, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 3.62%
   @ MODEL: arywiki_20230101_fasttext_cbow_nobots.model
   @ PARAMS: <vocab_size=71,704, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       > ACCURACY: 1.97%
   @ MODEL: arywiki_20230101_nobots_glove.vectors
   @ PARAMS: <vocab_size=71,704, vector_size=300, top_k=1>
     * FILE: Arab_States_Analogy_Dataset_All.txt
       