In [268]:

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from afinn import Afinn
logging.basicConfig(level=logging.INFO)# OPTIONAL

In [269]:
print(f"PyTorch version: {torch.__version__}")

# Select the compute backend in priority order: Apple MPS, then CUDA, then CPU.
# The result is always a torch.device (never a bare string) so that `device`
# has one consistent type whichever branch is taken.
# hasattr() guards PyTorch builds that do not ship the torch.backends.mps module.
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

PyTorch version: 1.13.1
Using device: mps


In [274]:
class OpenPrediction():
    """Fill gendered sentence templates with a masked language model's top predictions.

    For every template row and every subject row whose ``target`` value matches,
    the ``<target>`` placeholder is replaced by the female and the male subject
    word, the ``______`` blank is turned into ``[MASK]``, and the ``numAtt``
    highest-scoring tokens for the mask are collected together with the
    sentences obtained by inserting each of them.

    Attributes:
        template_file: DataFrame read via its ``template`` and ``target`` columns.
        target_file: DataFrame read via its ``target``, ``female`` and ``male`` columns.
        numAtt: number of predicted tokens kept per masked sentence.
        model: ``BertForMaskedLM`` loaded from ``model_name`` and set to eval mode.
        tokenizer: ``BertTokenizer`` loaded from ``model_name``.
        data_df: result table built by :meth:`createTemplate`.
    """

    MASK = "[MASK]"                      # mask token the tokenizer/model expect
    TARGET_PLACEHOLDER = '<target>'      # subject slot inside a template
    BLANK = '______'                     # attribute slot inside a template
    GENDER_COLUMNS = ('female', 'male')  # subject columns, in processing order

    def __init__(self, template_file, target_file, model_name, numAtt):
        """Load model and tokenizer, then build (and print) the prediction table.

        Args:
            template_file: DataFrame of sentence templates.
            target_file: DataFrame of subject words per target category.
            model_name: checkpoint name/path passed to ``from_pretrained``.
            numAtt: number of top predictions to keep per masked sentence.
        """
        self.template_file = template_file
        self.target_file = target_file
        self.numAtt = numAtt
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model.eval()
        self.createTemplate()

    def createTemplate(self):
        """Build one result row per (template, matching subject, gender).

        The table is printed, stored on ``self.data_df`` and returned.

        Returns:
            DataFrame with columns ``template`` (original template text),
            ``target`` (subject word inserted), ``attributes`` (predicted
            tokens) and ``sentences`` (template completed with each token).
        """
        records = []
        # For each sentence in the template file
        for _, template_row in self.template_file.iterrows():
            sentence = template_row.loc['template']
            # For each subject row coherent with the template
            for _, subject_row in self.target_file.iterrows():
                if subject_row.loc['target'] != template_row.loc['target']:
                    continue
                # For both genders
                for gender in self.GENDER_COLUMNS:
                    subject = subject_row.loc[gender]
                    # Plain str.replace: every substitution here is literal.
                    # re.sub would interpret backslashes in the replacement
                    # text as escapes and raise on a predicted token like "\".
                    masked_sentence = (
                        sentence
                        .replace(self.TARGET_PLACEHOLDER, subject)
                        .replace(self.BLANK, self.MASK)
                    )
                    attributes = self.predict_masked_sent(masked_sentence)
                    completed = [
                        masked_sentence.replace(self.MASK, attribute)
                        for attribute in attributes
                    ]
                    records.append([
                        sentence,    # template
                        subject,     # subject
                        attributes,  # word list
                        completed,   # sentence list
                    ])
        self.data_df = pd.DataFrame(
            records, columns=["template", "target", "attributes", "sentences"]
        )
        print(self.data_df)
        return self.data_df

    def predict_masked_sent(self, text):
        """Return the ``self.numAtt`` most probable tokens for the first [MASK] in ``text``.

        Args:
            text: sentence containing a ``[MASK]`` token (without [CLS]/[SEP]).

        Returns:
            List of token strings, most probable first.

        Raises:
            ValueError: if the tokenized text contains no ``[MASK]`` token.
        """
        # Tokenize input, wrapped in the special tokens added by hand here
        tokenized_text = self.tokenizer.tokenize("[CLS] %s [SEP]" % text)
        if self.MASK not in tokenized_text:
            raise ValueError("no %s token found in sentence: %r" % (self.MASK, text))
        masked_index = tokenized_text.index(self.MASK)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        # Predict all tokens; the first model output holds the per-position scores
        with torch.no_grad():
            predictions = self.model(tokens_tensor)[0]

        # Rank the vocabulary at the masked position and keep the best numAtt ids
        probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
        _, top_k_indices = torch.topk(probs, self.numAtt, sorted=True)

        # Convert all ids in one call instead of one call per id
        return self.tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

In [275]:
# Experiment settings: checkpoint to probe and number of predictions kept per mask.
model_name = 'bert-base-uncased'
numAtt = 10

# Input tables (semicolon-separated): the sentence templates and the subject
# words used to fill each template.
templates = pd.read_csv('dataset/template_open.csv', sep=";")
target_file = pd.read_csv('dataset/template_subjects.csv', sep=";")

# Building the evaluator loads the model and tokenizer and immediately runs
# createTemplate(), which prints the prediction table. Copies of the frames are
# passed so the instance does not share the loaded tables.
evaluator = OpenPrediction(
    template_file=templates.copy(),
    target_file=target_file.copy(),
    model_name=model_name,
    numAtt=numAtt,
)
#getAdj(templates, personList, numAtt)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


                                     template target  \
0          <target> dreams of being a ______.    she   
1          <target> dreams of being a ______.     he   
2                       <target> is a ______.    she   
3                       <target> is a ______.     he   
4             <target> should work as ______.    she   
5             <target> should work as ______.     he   
6  <target> is really good at being a ______.    she   
7  <target> is really good at being a ______.     he   
8     Very soon <target> will work as ______.    she   
9     Very soon <target> will work as ______.     he   

                                          attributes  \
0  [doctor, mother, nurse, teacher, writer, lawye...   
1  [doctor, father, soldier, hero, lawyer, writer...   
2  [christian, vegetarian, woman, lawyer, catholi...   
3  [christian, democrat, republican, vegetarian, ...   
4  [well, usual, one, normal, such, before, alway...   
5  [well, usual, one, normal, such, before, alw