# __Fill-Mask Evaluations of Language Modeling Task__

## # Evaluation of __Arabic Wikipedia__ Masked Language Model (With Bots):

In [1]:
import pandas as pd
from mlm_utils import *
from pathlib import Path
import os, logging, warnings
from transformers import pipeline
from transformers import logging as hflogging
from huggingface_hub.utils import disable_progress_bars

# Keep the cell output limited to the report printed below: turn off hub
# progress bars, restrict transformers logging to errors, disable stdlib log
# records at WARNING level and below, and ignore User/Future warnings.
disable_progress_bars()
hflogging.set_verbosity_error()
logging.disable(logging.WARNING)
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Hub identifiers ("<owner>/<model>") of the models evaluated by this cell.
models = ['SaiedAlshahrani/arwiki_20230101_roberta_mlm_bots']

# Masked test sets, matched relative to the current working directory.
files = [str(x) for x in Path('.').glob('MASD/Masked_Arab_States_Dataset_All.csv')]

print("## EVALUATION OF ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH BOTS):")

top_ks=[10, 50, 100]
dataframes = []  # one [model, params, file, accuracy] row per evaluation run

for top_k in top_ks:
    for model in models:
        model_name = model.split('/')[-1]
        print(f"   @ MODEL: {model_name}")

        # Look the configuration up once and reuse it for both reported fields.
        model_config = get_model_config(model)
        model_params = f'<vocab_size={model_config["vocab_size"]:,d}, vector_size={model_config["max_position_embeddings"]}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            masked_test_set = pd.read_csv(file)
            # mask_filler is provided by mlm_utils; presumably it returns the
            # model's top_k candidates for the prompt -- TODO confirm there.
            masked_test_set['pred_masked_tokens'] = masked_test_set.apply(lambda row: mask_filler(row['mlm_prompt'], model, top_k), axis=1)
            # A prompt counts as a hit when its gold token is among the predictions.
            masked_test_set['is_accurate'] = masked_test_set.apply(lambda row: 1 if row.masked_token in row.pred_masked_tokens else 0, axis=1)
            accuracy = round((masked_test_set['is_accurate'].sum()/masked_test_set.shape[0])*100, 2)

            # Path.name is separator-agnostic, unlike splitting the string on '/'.
            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {accuracy}%')

            dataframes.append([model_name, model_params, file, accuracy])

# Naming the columns at construction keeps the CSV header even with no rows.
dataframes = pd.DataFrame(dataframes, columns=['Model', 'Params', 'File', 'Accuracy'])
dataframes.to_csv('arwiki_mlm_results_bots.csv', index=False)

## EVALUATION OF ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH BOTS):
   @ MODEL: arwiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=10>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 43.12%
   @ MODEL: arwiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=50>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 45.0%
   @ MODEL: arwiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=100>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 50.62%


## # Evaluation of __Arabic Wikipedia__ Masked Language Model (With No Bots):

In [2]:
import pandas as pd
from mlm_utils import *
from pathlib import Path
import os, logging, warnings
from transformers import pipeline
from transformers import logging as hflogging
from huggingface_hub.utils import disable_progress_bars

# Keep the cell output limited to the report printed below: turn off hub
# progress bars, restrict transformers logging to errors, disable stdlib log
# records at WARNING level and below, and ignore User/Future warnings.
disable_progress_bars()
hflogging.set_verbosity_error()
logging.disable(logging.WARNING)
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Hub identifiers ("<owner>/<model>") of the models evaluated by this cell.
models = ['SaiedAlshahrani/arwiki_20230101_roberta_mlm_nobots']

# Masked test sets, matched relative to the current working directory.
files = [str(x) for x in Path('.').glob('MASD/Masked_Arab_States_Dataset_All.csv')]

print("## EVALUATION OF ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH NO BOTS):")

top_ks=[10, 50, 100]
dataframes = []  # one [model, params, file, accuracy] row per evaluation run

for top_k in top_ks:
    for model in models:
        model_name = model.split('/')[-1]
        print(f"   @ MODEL: {model_name}")

        # Look the configuration up once and reuse it for both reported fields.
        model_config = get_model_config(model)
        model_params = f'<vocab_size={model_config["vocab_size"]:,d}, vector_size={model_config["max_position_embeddings"]}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            masked_test_set = pd.read_csv(file)
            # mask_filler is provided by mlm_utils; presumably it returns the
            # model's top_k candidates for the prompt -- TODO confirm there.
            masked_test_set['pred_masked_tokens'] = masked_test_set.apply(lambda row: mask_filler(row['mlm_prompt'], model, top_k), axis=1)
            # A prompt counts as a hit when its gold token is among the predictions.
            masked_test_set['is_accurate'] = masked_test_set.apply(lambda row: 1 if row.masked_token in row.pred_masked_tokens else 0, axis=1)
            accuracy = round((masked_test_set['is_accurate'].sum()/masked_test_set.shape[0])*100, 2)

            # Path.name is separator-agnostic, unlike splitting the string on '/'.
            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {accuracy}%')

            dataframes.append([model_name, model_params, file, accuracy])

# Naming the columns at construction keeps the CSV header even with no rows.
dataframes = pd.DataFrame(dataframes, columns=['Model', 'Params', 'File', 'Accuracy'])
dataframes.to_csv('arwiki_mlm_results_nobots.csv', index=False)

## EVALUATION OF ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH NO BOTS):
   @ MODEL: arwiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=10>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 45.62%
   @ MODEL: arwiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=50>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 51.25%
   @ MODEL: arwiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=100>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 53.12%


## # Evaluation of __Egyptian Arabic Wikipedia__ Masked Language Model (_only_ With Bots):
We excluded the Egyptian Arabic Wikipedia from the _no-bots_ evaluation because it contains an insignificant number of bot-generated articles (only 15).

In [3]:
import pandas as pd
from mlm_utils import *
from pathlib import Path
import os, logging, warnings
from transformers import pipeline
from transformers import logging as hflogging
from huggingface_hub.utils import disable_progress_bars

# Keep the cell output limited to the report printed below: turn off hub
# progress bars, restrict transformers logging to errors, disable stdlib log
# records at WARNING level and below, and ignore User/Future warnings.
disable_progress_bars()
hflogging.set_verbosity_error()
logging.disable(logging.WARNING)
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Hub identifiers ("<owner>/<model>") of the models evaluated by this cell.
models = ['SaiedAlshahrani/arzwiki_20230101_roberta_mlm']

# Masked test sets, matched relative to the current working directory.
files = [str(x) for x in Path('.').glob('MASD/Masked_Arab_States_Dataset_All.csv')]

print("## EVALUATION OF EGYPTIAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (ONLY WITH BOTS):")

top_ks=[10, 50, 100]
dataframes = []  # one [model, params, file, accuracy] row per evaluation run

for top_k in top_ks:
    for model in models:
        model_name = model.split('/')[-1]
        print(f"   @ MODEL: {model_name}")

        # Look the configuration up once and reuse it for both reported fields.
        model_config = get_model_config(model)
        model_params = f'<vocab_size={model_config["vocab_size"]:,d}, vector_size={model_config["max_position_embeddings"]}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            masked_test_set = pd.read_csv(file)
            # mask_filler is provided by mlm_utils; presumably it returns the
            # model's top_k candidates for the prompt -- TODO confirm there.
            masked_test_set['pred_masked_tokens'] = masked_test_set.apply(lambda row: mask_filler(row['mlm_prompt'], model, top_k), axis=1)
            # A prompt counts as a hit when its gold token is among the predictions.
            masked_test_set['is_accurate'] = masked_test_set.apply(lambda row: 1 if row.masked_token in row.pred_masked_tokens else 0, axis=1)
            accuracy = round((masked_test_set['is_accurate'].sum()/masked_test_set.shape[0])*100, 2)

            # Path.name is separator-agnostic, unlike splitting the string on '/'.
            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {accuracy}%')

            dataframes.append([model_name, model_params, file, accuracy])

# Naming the columns at construction keeps the CSV header even with no rows.
dataframes = pd.DataFrame(dataframes, columns=['Model', 'Params', 'File', 'Accuracy'])
dataframes.to_csv('arzwiki_mlm_results_bots.csv', index=False)

## EVALUATION OF EGYPTIAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (ONLY WITH BOTS):
   @ MODEL: arzwiki_20230101_roberta_mlm
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=10>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 8.12%
   @ MODEL: arzwiki_20230101_roberta_mlm
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=50>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 25.62%
   @ MODEL: arzwiki_20230101_roberta_mlm
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=100>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 35.0%


## # Evaluation of __Moroccan Arabic Wikipedia__ Masked Language Model (With Bots):

In [4]:
import pandas as pd
from mlm_utils import *
from pathlib import Path
import os, logging, warnings
from transformers import pipeline
from transformers import logging as hflogging
from huggingface_hub.utils import disable_progress_bars

# Keep the cell output limited to the report printed below: turn off hub
# progress bars, restrict transformers logging to errors, disable stdlib log
# records at WARNING level and below, and ignore User/Future warnings.
disable_progress_bars()
hflogging.set_verbosity_error()
logging.disable(logging.WARNING)
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Hub identifiers ("<owner>/<model>") of the models evaluated by this cell.
models = ['SaiedAlshahrani/arywiki_20230101_roberta_mlm_bots']

# Masked test sets, matched relative to the current working directory.
files = [str(x) for x in Path('.').glob('MASD/Masked_Arab_States_Dataset_All.csv')]

print("## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH BOTS):")

top_ks=[10, 50, 100]
dataframes = []  # one [model, params, file, accuracy] row per evaluation run

for top_k in top_ks:
    for model in models:
        model_name = model.split('/')[-1]
        print(f"   @ MODEL: {model_name}")

        # Look the configuration up once and reuse it for both reported fields.
        model_config = get_model_config(model)
        model_params = f'<vocab_size={model_config["vocab_size"]:,d}, vector_size={model_config["max_position_embeddings"]}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            masked_test_set = pd.read_csv(file)
            # mask_filler is provided by mlm_utils; presumably it returns the
            # model's top_k candidates for the prompt -- TODO confirm there.
            masked_test_set['pred_masked_tokens'] = masked_test_set.apply(lambda row: mask_filler(row['mlm_prompt'], model, top_k), axis=1)
            # A prompt counts as a hit when its gold token is among the predictions.
            masked_test_set['is_accurate'] = masked_test_set.apply(lambda row: 1 if row.masked_token in row.pred_masked_tokens else 0, axis=1)
            accuracy = round((masked_test_set['is_accurate'].sum()/masked_test_set.shape[0])*100, 2)

            # Path.name is separator-agnostic, unlike splitting the string on '/'.
            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {accuracy}%')

            dataframes.append([model_name, model_params, file, accuracy])

# Naming the columns at construction keeps the CSV header even with no rows.
dataframes = pd.DataFrame(dataframes, columns=['Model', 'Params', 'File', 'Accuracy'])
dataframes.to_csv('arywiki_mlm_results_bots.csv', index=False)

## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH BOTS):
   @ MODEL: arywiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=10>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.0%
   @ MODEL: arywiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=50>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.0%
   @ MODEL: arywiki_20230101_roberta_mlm_bots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=100>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.62%


## # Evaluation of __Moroccan Arabic Wikipedia__ Masked Language Model (With No Bots):

In [5]:
import pandas as pd
from mlm_utils import *
from pathlib import Path
import os, logging, warnings
from transformers import pipeline
from transformers import logging as hflogging
from huggingface_hub.utils import disable_progress_bars

# Keep the cell output limited to the report printed below: turn off hub
# progress bars, restrict transformers logging to errors, disable stdlib log
# records at WARNING level and below, and ignore User/Future warnings.
disable_progress_bars()
hflogging.set_verbosity_error()
logging.disable(logging.WARNING)
os.environ['TOKENIZERS_PARALLELISM'] = 'False'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Hub identifiers ("<owner>/<model>") of the models evaluated by this cell.
models = ['SaiedAlshahrani/arywiki_20230101_roberta_mlm_nobots']

# Masked test sets, matched relative to the current working directory.
files = [str(x) for x in Path('.').glob('MASD/Masked_Arab_States_Dataset_All.csv')]

print("## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH NO BOTS):")

top_ks=[10, 50, 100]
dataframes = []  # one [model, params, file, accuracy] row per evaluation run

for top_k in top_ks:
    for model in models:
        model_name = model.split('/')[-1]
        print(f"   @ MODEL: {model_name}")

        # Look the configuration up once and reuse it for both reported fields.
        model_config = get_model_config(model)
        model_params = f'<vocab_size={model_config["vocab_size"]:,d}, vector_size={model_config["max_position_embeddings"]}, top_k={top_k}>'
        print(f"   @ PARAMS: {model_params}")

        for file in files:
            masked_test_set = pd.read_csv(file)
            # mask_filler is provided by mlm_utils; presumably it returns the
            # model's top_k candidates for the prompt -- TODO confirm there.
            masked_test_set['pred_masked_tokens'] = masked_test_set.apply(lambda row: mask_filler(row['mlm_prompt'], model, top_k), axis=1)
            # A prompt counts as a hit when its gold token is among the predictions.
            masked_test_set['is_accurate'] = masked_test_set.apply(lambda row: 1 if row.masked_token in row.pred_masked_tokens else 0, axis=1)
            accuracy = round((masked_test_set['is_accurate'].sum()/masked_test_set.shape[0])*100, 2)

            # Path.name is separator-agnostic, unlike splitting the string on '/'.
            print(f"     * FILE: {Path(file).name}")
            print(f'       > ACCURACY: {accuracy}%')

            dataframes.append([model_name, model_params, file, accuracy])

# Naming the columns at construction keeps the CSV header even with no rows.
dataframes = pd.DataFrame(dataframes, columns=['Model', 'Params', 'File', 'Accuracy'])
dataframes.to_csv('arywiki_mlm_results_nobots.csv', index=False)

## EVALUATION OF MOROCCAN ARABIC WIKIPEDIA MASKED LANGUAGE MODEL (WITH NO BOTS):
   @ MODEL: arywiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=10>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.0%
   @ MODEL: arywiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=50>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.0%
   @ MODEL: arywiki_20230101_roberta_mlm_nobots
   @ PARAMS: <vocab_size=52,000, vector_size=514, top_k=100>
     * FILE: Masked_Arab_States_Dataset_All.csv
       > ACCURACY: 0.62%
