In [None]:
!pip install -U honest

In [1]:
from transformers import AutoTokenizer, AutoModel, AutoModelWithLMHead, AutoModelForMaskedLM
from transformers import pipeline
from honest import honest

In [2]:
def _fill_templates(nlp_fill, masked_sentences, mask_token):
    """Return, for every masked sentence, the list of stripped predicted tokens.

    Each sentence has its '[M]' placeholder replaced by ``mask_token`` before
    being passed to ``nlp_fill``; the 'token_str' of every returned prediction
    is stripped of surrounding whitespace.
    """
    return [
        [prediction['token_str'].strip()
         for prediction in nlp_fill(sentence.replace('[M]', mask_token))]
        for sentence in masked_sentences
    ]


def fill_in_and_evaluate_honest(lang, tokenizer, model, top_k=1):
    """Fill the HONEST templates for ``lang`` and print per-gender scores.

    Args:
        lang: Language code forwarded to ``honest.HonestEvaluator``.
        tokenizer: Tokenizer handed to the fill-mask pipeline; its
            ``mask_token`` replaces the '[M]' placeholder in each template.
        model: Masked-language model handed to the fill-mask pipeline.
        top_k: Number of completions requested per template (default 1,
            matching the previous hard-coded value).

    Raises:
        ValueError: If ``top_k`` is smaller than 1.

    Returns:
        None. The female and male scores are printed.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    evaluator = honest.HonestEvaluator(lang)
    masked_templates = evaluator.templates()

    # Define nlp_fill pipeline
    nlp_fill = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=top_k)

    genders = ('female', 'male')

    # Fill templates (please check if the filled words contain any special character).
    # Templates are split by their 'mf' field; all fills are produced before any
    # scoring so the order of operations matches the original cell.
    filled_templates = {
        gender: _fill_templates(
            nlp_fill,
            [sentence for sentence, value in masked_templates.items() if value['mf'] == gender],
            tokenizer.mask_token,
        )
        for gender in genders
    }

    scores = {gender: evaluator.honest(filled_templates[gender]) for gender in genders}

    for gender in genders:
        print(f"{gender} honest score: ", scores[gender])

In [3]:
# Load the RoBERTa masked-language model and its matching tokenizer.
# NOTE(review): "roberta-base" appears to be an English checkpoint, so scores
# obtained on non-English templates should be read with caution — confirm
# that reusing it for the other languages is intended.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

fill_in_and_evaluate_honest(lang="en", tokenizer=tokenizer, model=model)

  data = pd.read_csv(f'https://raw.githubusercontent.com/MilaNLProc/honest/main/resources/{self.language}_template.tsv', index_col=0, sep='\t').T.to_dict('dict')


female honest score:  0.057803468208092484
male honest score:  0.013227513227513227


In [4]:
# Spanish ("es") templates, using the tokenizer and model loaded earlier.
fill_in_and_evaluate_honest(
    lang="es",
    tokenizer=tokenizer,
    model=model,
)

  data = pd.read_csv(f'https://raw.githubusercontent.com/MilaNLProc/honest/main/resources/{self.language}_template.tsv', index_col=0, sep='\t').T.to_dict('dict')


female honest score:  0.02564102564102564
male honest score:  0.08717948717948718


In [5]:
# French ("fr") templates, using the tokenizer and model loaded earlier.
fill_in_and_evaluate_honest(
    lang="fr",
    tokenizer=tokenizer,
    model=model,
)

  data = pd.read_csv(f'https://raw.githubusercontent.com/MilaNLProc/honest/main/resources/{self.language}_template.tsv', index_col=0, sep='\t').T.to_dict('dict')


female honest score:  0.02284263959390863
male honest score:  0.04871794871794872


In [6]:
# Italian ("it") templates, using the tokenizer and model loaded earlier.
fill_in_and_evaluate_honest(
    lang="it",
    tokenizer=tokenizer,
    model=model,
)

female honest score:  0.024691358024691357
male honest score:  0.0


In [7]:
# Portuguese ("pt") templates, using the tokenizer and model loaded earlier.
fill_in_and_evaluate_honest(
    lang="pt",
    tokenizer=tokenizer,
    model=model,
)

  data = pd.read_csv(f'https://raw.githubusercontent.com/MilaNLProc/honest/main/resources/{self.language}_template.tsv', index_col=0, sep='\t').T.to_dict('dict')


female honest score:  0.0
male honest score:  0.004761904761904762


In [8]:
# Romanian ("ro") templates, using the tokenizer and model loaded earlier.
fill_in_and_evaluate_honest(
    lang="ro",
    tokenizer=tokenizer,
    model=model,
)

  data = pd.read_csv(f'https://raw.githubusercontent.com/MilaNLProc/honest/main/resources/{self.language}_template.tsv', index_col=0, sep='\t').T.to_dict('dict')


female honest score:  0.0
male honest score:  0.0
