In [3]:
from parrot import Parrot
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline,T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
from numpy import average, round, nan
import re
import warnings
warnings.filterwarnings("ignore")

import spacy
nlp = spacy.load("en_core_web_md")

### Chamando os modelos

In [4]:
# Load every model used by the notebook (make sure you init ONLY once if you integrate this to your code).
# Parrot paraphraser, explicitly requested without GPU.
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

# NOTE(review): `device` is only referenced by the commented-out `.to(device)` calls
# in the visible code, so nothing here is explicitly moved to it.
device = "cuda"

# Tokenizer and seq2seq model consumed by `paraphrase`.
tokenizer_gpt = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model_gpt = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")#.to(device)


# Plain t5-base tokenizer/model.
# NOTE(review): not referenced anywhere in the visible part of the notebook - confirm they are still needed.
tokenizer_t5 = T5Tokenizer.from_pretrained("t5-base")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-base")

# Translator
# English->French and French->English translation pipelines.
# NOTE(review): presumably intended for back-translation; no visible cell calls them.
translator_en_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", max_length=512)
translator_fr_en = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-fr-en", max_length=512)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Definindo parâmetros para o T5

In [5]:
def paraphrase(
    question,
    num_beams=4,
    num_beam_groups=4,
    num_return_sequences=4,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.5,
    max_length=128
):
    """
    Generate paraphrases of `question` with the module-level `model_gpt`.

    The text is prefixed with 'paraphrase: ', tokenized with `tokenizer_gpt`
    (truncated to `max_length`) and passed to `model_gpt.generate`; every
    other keyword argument is forwarded unchanged to `generate`.

    Parameters:
    - question (str): The sentence to paraphrase.
    - num_beams, num_beam_groups, num_return_sequences, repetition_penalty,
      diversity_penalty, no_repeat_ngram_size, temperature: Decoding options
      forwarded to `model_gpt.generate`.
    - max_length (int): Used both as the tokenizer truncation length and as
      the `max_length` given to `generate`.

    Returns:
    - list[str]: The decoded sequences, with special tokens removed.
    """
    encoded = tokenizer_gpt(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded.input_ids#.to(device)

    # Collect the decoding options once so the generate call stays readable.
    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model_gpt.generate(prompt_ids, **generation_options)

    return tokenizer_gpt.batch_decode(generated, skip_special_tokens=True)

#### Criando funções para gerar paráfrases

In [6]:
def generate_output_phrases_t5(df, input_column, output_column, paraphrase_fn=None):
    """
    Generate output phrases by paraphrasing the input phrases.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the input phrases.
    - input_column (str): The name of the column in the DataFrame that contains the input phrases.
    - output_column (str): The name of the column in the DataFrame to store the output phrases.
    - paraphrase_fn (callable, optional): Function mapping one sentence to a list of
      paraphrases. Defaults to the module-level `paraphrase`.

    Returns:
    - pandas.DataFrame: A copy of `df` with `output_column` (the list of paraphrases
      per row) followed by one integer-named column (0, 1, ...) per paraphrase.
      Rows keep the index of `df`.
    """
    if paraphrase_fn is None:
        # Resolved lazily so a custom function can be used without the global model.
        paraphrase_fn = paraphrase

    paraphrases = [paraphrase_fn(sentence) for sentence in df[input_column]]

    # Build everything on df.index: concatenating against a fresh RangeIndex frame
    # would misalign rows whenever df has been filtered or re-indexed.
    result = df.copy()
    result[output_column] = pd.Series(paraphrases, index=df.index, dtype=object)

    # One column per paraphrase; shorter lists are padded with missing values.
    expanded = pd.DataFrame(paraphrases, index=df.index)
    return pd.concat([result, expanded], axis=1)

def generate_output_phrases_parrot(df, input_column, output_column,
                                   augmenter=None, max_return_phrases=3):
    """
    Generate output phrases by paraphrasing the input phrases with Parrot.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the input phrases.
    - input_column (str): The name of the column in the DataFrame that contains the input phrases.
    - output_column (str): The name of the column in the DataFrame to store the output phrases.
    - augmenter (object, optional): Object exposing
      `augment(input_phrase=..., max_return_phrases=...)` that returns either None
      or a sequence whose items carry the phrase at position 0. Defaults to the
      module-level `parrot`.
    - max_return_phrases (int): Forwarded to `augment`. Defaults to 3.

    Returns:
    - pandas.DataFrame: A copy of `df` with `output_column` (list of paraphrases,
      or '' when none were produced) followed by one integer-named column
      (0, 1, ...) per paraphrase. Rows keep the index of `df`.
    """
    if augmenter is None:
        # Resolved lazily so a custom augmenter can be used without the global model.
        augmenter = parrot

    phrase_lists = []
    for sentence in df[input_column]:
        candidates = augmenter.augment(input_phrase=sentence,
                                       max_return_phrases=max_return_phrases)
        # `augment` may return None; normalise that to "no paraphrases".
        if candidates is None:
            phrase_lists.append([])
        else:
            phrase_lists.append([candidate[0] for candidate in candidates])

    result = df.copy()
    # Keep the historical '' placeholder for rows without paraphrases.
    result[output_column] = pd.Series(
        [phrases if phrases else '' for phrases in phrase_lists],
        index=df.index,
        dtype=object,
    )

    # Expand from the lists (never from the '' placeholder): a leading '' would make
    # pandas build a single column of lists instead of one column per paraphrase.
    expanded = pd.DataFrame(phrase_lists, index=df.index)
    return pd.concat([result, expanded], axis=1)

def rename_numeric_columns(df, replacement_prefix='Column_'):
    """
    Rename the columns whose label can be converted to an integer.

    Parameters:
    - df (pandas.DataFrame): The DataFrame whose columns are renamed. It is
      modified in place.
    - replacement_prefix (str): Prefix for the new column names; the integer
      value of the old label is appended to it.

    Returns:
    - df (pandas.DataFrame): The same DataFrame object, with renamed columns.
    """

    new_columns = []
    for col in df.columns:
        try:
            col_int = int(col)
        except (ValueError, TypeError):
            # ValueError: non-numeric string; TypeError: labels such as None or
            # tuples that int() cannot accept. Both are kept unchanged.
            new_columns.append(col)
        else:
            new_columns.append(replacement_prefix + str(col_int))

    # Rename the DataFrame columns in one assignment.
    df.columns = new_columns

    return df

def replace_masked_sentence(df, mask_column, label_column, new_column):
    """
    Fill every "[MASK]" of `mask_column` with the row's `label_column` value.

    Parameters:
    - df (pandas.DataFrame): Frame to update; it is modified in place.
    - mask_column (str): Column holding the sentences that contain "[MASK]".
    - label_column (str): Column holding the replacement value for each row.
    - new_column (str): Name of the column that receives the filled sentences.

    Returns:
    - df (pandas.DataFrame): The same DataFrame object, with `new_column` set.
    """
    # Positional pairing instead of a row-wise apply: on an empty frame
    # `df.apply(..., axis=1)` returns a DataFrame and the column assignment fails.
    df[new_column] = [
        str(masked).replace("[MASK]", str(label))
        for masked, label in zip(df[mask_column], df[label_column])
    ]
    return df

def substituir_palavras(texto, palavras_substituir):
    """
    Replace every whole-word occurrence of the given words with "[MASK]".

    Parameters:
    - texto: The text to process. Anything that is not a `str` yields None.
    - palavras_substituir (iterable): Words to mask, matched case-insensitively.
      Entries that are not strings (e.g. NaN, None) or are blank are skipped;
      surrounding whitespace of each word is ignored.

    Returns:
    - str | None: The masked text, or None when `texto` is not a string.
    """
    if not isinstance(texto, str):
        return None

    for palavra in palavras_substituir:
        if not isinstance(palavra, str):
            continue  # re.escape would raise on NaN/None labels
        palavra = palavra.strip()
        if not palavra:
            continue  # an empty pattern would insert "[MASK]" at every boundary
        # Lookarounds instead of \b so that words starting or ending with a
        # non-word character (e.g. "C++", "St.") can still match.
        padrao = r'(?<!\w)' + re.escape(palavra) + r'(?!\w)'
        texto = re.sub(padrao, "[MASK]", texto, flags=re.IGNORECASE)
    return texto

### Tratamento de Dados

#### Born In

In [24]:
# NOTE(review): absolute local path; the notebook only runs on a machine with this layout.
file = 'D:/tese/code/data/novos_data/filtrado_one_born_in.csv'
data_bornIn = pd.read_csv(file).drop(columns=['Unnamed: 0'])
# Work on the first 50 rows; copy so the column assignments below do not write into a slice.
data_bornIn = data_bornIn[:50].copy()
# `.str.strip()` returns a new Series, so the result must be assigned, and it has to
# happen before the label is substituted into the sentence.
data_bornIn['obj_label'] = data_bornIn['obj_label'].str.strip()
data_bornIn = replace_masked_sentence(data_bornIn, 'masked_sentence', 'obj_label', 
                               'filled_sentence')
data_bornIn = data_bornIn[['sub_label', 'template', 'obj_label', 
                           'masked_sentence', 'filled_sentence']]

# Paraphrase the filled sentences with the T5-based model, then keep only the
# expanded per-paraphrase columns.
parafrase_t5_bornIn = generate_output_phrases_t5(data_bornIn, 'filled_sentence', 'T5_paraphrased_filled_sentence')
parafrase_t5_bornIn = rename_numeric_columns(parafrase_t5_bornIn, 'T5_paraphrased_filled_sentence_')
parafrase_t5_bornIn = parafrase_t5_bornIn.drop(columns=['filled_sentence', 'T5_paraphrased_filled_sentence'])

# Same steps with Parrot.
parafrase_parrot_bornIn = generate_output_phrases_parrot(data_bornIn, 'filled_sentence', 'Parrot_paraphrased_filled_sentence')
parafrase_parrot_bornIn = rename_numeric_columns(parafrase_parrot_bornIn, 'Parrot_paraphrased_filled_sentence_')
parafrase_parrot_bornIn = parafrase_parrot_bornIn.drop(columns=['filled_sentence', 'Parrot_paraphrased_filled_sentence'])

In [38]:
# Mask each paraphrase with the label of ITS OWN row only. Passing the whole
# `obj_label` column masked every label of the dataset in every sentence, and the
# later "keep one mask" step then overwrote those foreign labels with the row's label.
colunas = [coluna for coluna in parafrase_t5_bornIn.columns if re.match(r'T5_paraphrased_filled_sentence_[^_]*\d+', coluna)]
for coluna in colunas:
    parafrase_t5_bornIn[coluna] = [
        substituir_palavras(texto, [rotulo])
        for texto, rotulo in zip(parafrase_t5_bornIn[coluna], parafrase_t5_bornIn["obj_label"])
    ]

colunas = [coluna for coluna in parafrase_parrot_bornIn.columns if re.match(r'Parrot_paraphrased_filled_sentence_[^_]*\d+', coluna)]
for coluna in colunas:
    parafrase_parrot_bornIn[coluna] = [
        substituir_palavras(texto, [rotulo])
        for texto, rotulo in zip(parafrase_parrot_bornIn[coluna], parafrase_parrot_bornIn["obj_label"])
    ]


In [48]:
def deixa_uma_mask(sentence, word):
    """
    Leave exactly one "[MASK]" in `sentence`.

    When the sentence contains more than one "[MASK]", all but the last one are
    replaced by `word`; sentences with zero or one mask are returned unchanged.

    Parameters:
    - sentence: The sentence to process. Non-string values (None, NaN, ...) are
      returned unchanged.
    - word: Replacement for the surplus masks; converted with `str()`.

    Returns:
    - The processed sentence, or the original value when it is not a string.
    """
    if not isinstance(sentence, str):
        # Covers None as well as pandas' NaN placeholder, which has no .count().
        return sentence

    mask_count = sentence.count('[MASK]')
    if mask_count <= 1:
        return sentence
    # str.replace works left to right, so limiting it to count-1 keeps the last mask.
    return sentence.replace('[MASK]', str(word), mask_count - 1)

In [52]:
# Reduce every sentence column to a single "[MASK]", for both paraphrase frames.
# Columns from position 3 onwards are processed, as selected by `columns[3:]`.
for tabela in (parafrase_t5_bornIn, parafrase_parrot_bornIn):
    for i in tabela.columns[3:]:
        tabela[i] = tabela.apply(
            lambda linha: deixa_uma_mask(linha[i], linha['obj_label']),
            axis=1,
        )

In [53]:
## Born In
# Keep only rows whose masked sentence is shorter than 513 characters; the helper
# column exists only inside the chain and is dropped again.
parafrase_t5_bornIn = (
    parafrase_t5_bornIn
    .assign(sentence_length=parafrase_t5_bornIn["masked_sentence"].str.len())
    .query('sentence_length < 513')
    .drop(columns=['sentence_length'])
)

parafrase_parrot_bornIn = (
    parafrase_parrot_bornIn
    .assign(sentence_length=parafrase_parrot_bornIn["masked_sentence"].str.len())
    .query('sentence_length < 513')
    .drop(columns=['sentence_length'])
)

In [54]:
# Show the current contents of the Parrot paraphrase frame.
parafrase_parrot_bornIn

Unnamed: 0,sub_label,template,obj_label,masked_sentence,Parrot_paraphrased_filled_sentence_0,Parrot_paraphrased_filled_sentence_1,Parrot_paraphrased_filled_sentence_2
0,Allan Peiper,[X] was born in [Y] .,Alexandra,"Allan Peiper (born 26 April 1960 in [MASK], Au...",allan peiper born april 26 1960 in [MASK] aust...,allan peiper born april 26 1960 in [MASK] aust...,
1,Anthony Barber,[X] was born in [Y] .,Doncaster,It was won by the Conservative candidate Antho...,it was won by the conservative candidate antho...,it was won by conservative candidate anthony b...,it was won by conservative candidate anthony b...
2,Paul Mounsey,[X] was born in [Y] .,Scotland,NahooToo is the second album by [MASK] musicia...,NahooToo is the second album by [MASK] musicia...,,
3,Moe Koffman,[X] was born in [Y] .,Toronto,He has performed with many of [MASK]'s foremos...,he has performed with many of [MASK]'s finest ...,he has performed with many of [MASK]'s foremos...,he has performed with many of [MASK]'s foremos...
4,Kurt Schwertsik,[X] was born in [Y] .,Vienna,"Kurt Schwertsik (born 25 June 1935, [MASK]) is...",kurt schwertsik born june 25 1935 in [MASK] is...,,
5,Claude Arrieu,[X] was born in [Y] .,Paris,"Claude Arrieu (born [MASK], November 30, 1903 ...","Claude Arrieu (born Paris, November 30, 1903 -...",,
6,Ryō Kase,[X] was born in [Y] .,Yokohama,"Ryo Kase (加瀬 亮 Kase Ryō, born November 9, 1974...","Ryo Kase (加瀬 亮 Kase Ryō, born November 9, 1974...",,
7,Frans Floris I,[X] was born in [Y] .,Antwerp,"1228 – [MASK], March 26, 1258) ""the guardian""[...","1228 – [MASK], March 26, 1258) ""the guardian""[...",,
8,Henry Heras,[X] was born in [Y] .,Barcelona,"Henry Heras (11 September 1888, [MASK] - 14 De...","Henry Heras (11 September 1888, [MASK] - 14 De...",,
9,Daniele Franceschini,[X] was born in [Y] .,Rome,Daniele Franceschini (born 13 January 1976 in ...,Daniele Franceschini (born 13 January 1976 in ...,,


### Use the models

In [55]:
def process_prompt_results(df_column, masked_model):
    """
    Collect the predicted tokens of a masked language model for each prompt.

    Parameters:
    df_column (iterable): The prompts to be processed.
    masked_model (callable): Called as `masked_model(prompt)`; expected to return
        an iterable of dictionaries that contain a 'token_str' entry.

    Returns:
    list: One entry per prompt. Prompts that are not strings or contain no
        '[MASK]' yield `nan`; every other prompt yields a list with the
        stripped, lower-cased 'token_str' of each prediction.
    """
    outputs = []
    for prompt in df_column:
        # Non-string prompts (None, NaN) have nothing to unmask.
        if not isinstance(prompt, str) or '[MASK]' not in prompt:
            outputs.append(nan)
        else:
            # Materialise a list: a bare generator expression is single-use and
            # cannot be sliced or searched by the evaluation helpers.
            outputs.append(
                [prediction['token_str'].strip().lower() for prediction in masked_model(prompt)]
            )
    return outputs

def process_prompt_results2(df_column, masked_model, with_socre=0):
    """
    Collect the predicted tokens and scores of a masked language model for each prompt.

    Parameters:
    df_column (iterable): The prompts to be processed.
    masked_model (callable): Called as `masked_model(prompt)`; expected to return
        an iterable of dictionaries with 'token_str' and 'score' entries.
    with_socre: Currently unused; kept so existing callers keep working.

    Returns:
    list: One entry per prompt. Prompts that are not strings or contain no
        '[MASK]' yield `nan`; every other prompt yields a list of dictionaries
        with the keys 'token' (stripped, lower-cased 'token_str') and 'score'
        (rounded to 3 decimals).
    """
    outputs = []
    for prompt in df_column:
        # Non-string prompts (None, NaN) have nothing to unmask.
        if not isinstance(prompt, str) or '[MASK]' not in prompt:
            outputs.append(nan)
            continue
        outputs.append([
            {'token': item['token_str'].strip().lower(),
             'score': round(item['score'], 3)}
            for item in masked_model(prompt)
        ])
    return outputs

def create_results_list(data, true_labels, num_iterations):
    """
    Check whether each true label appears among the first predictions of its row.

    Parameters:
    - data: Iterable of prediction lists, one per example. Entries that are not
      lists or tuples (e.g. the `nan` placeholder of prompts without a mask)
      count as a miss.
    - true_labels: Iterable of expected labels, paired positionally with `data`.
    - num_iterations: Number of leading predictions to consider for each example.

    Returns:
    - results_list: List of 1s and 0s, 1 when the label is among the first
      `num_iterations` predictions.
    - indices: Position of the label within those predictions, or None on a miss.
    - The average of `results_list`, rounded to 3 decimals.
    """
    results_list = []
    indices = []
    for sublist, true_label in zip(data, true_labels):
        # Placeholders such as nan cannot be sliced; treat them as "no predictions".
        window = sublist[:num_iterations] if isinstance(sublist, (list, tuple)) else []
        if true_label in window:
            results_list.append(1)
            indices.append(window.index(true_label))
        else:
            results_list.append(0)
            indices.append(None)
    return results_list, indices, round(average(results_list), 3)

def get_first(seq):
    """
    Return the innermost first element of a nested tuple/list.

    Descends into element 0 for as long as the current value is a tuple or a
    list; any other value is returned as is.
    """
    current = seq
    while isinstance(current, (tuple, list)):
        current = current[0]
    return current

def get_zero_list(seq):
    """Apply `get_first` to every item of `seq` and return the results as a list."""
    return list(map(get_first, seq))

In [56]:
## Born in
# Lower-cased gold labels, in row order, for each paraphrase frame.
answer_filled_sentences_bornIn_t5 = list(parafrase_t5_bornIn['obj_label'].str.lower())
answer_filled_sentences_bornIn_parrot = list(parafrase_parrot_bornIn['obj_label'].str.lower())

In [57]:
# Value passed as `top_k` to the fill-mask pipeline below.
top_k = 15
# Fill-mask pipeline built on the `bert-large-uncased` checkpoint.
unmasker_bertLarge = pipeline('fill-mask', model='bert-large-uncased', top_k = top_k)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
def processar_entidades(valor):
    """
    Keep a string only when its first entity found by `nlp` is labelled 'GPE'.

    Returns `valor` unchanged when it is not a string, or when it is a string
    whose first entity has the label 'GPE'; returns 0 for every other string
    (no entity found, or a first entity with another label).
    """
    if not isinstance(valor, str):
        return valor  # non-string values pass through untouched

    entidades = nlp(valor).ents
    if len(entidades) > 0 and entidades[0].label_ == 'GPE':
        return valor
    return 0

#### Born In

In [59]:
# BERT predictions for the original masked sentences.
outputs_bert_bornIn_original = process_prompt_results2(
    parafrase_t5_bornIn['masked_sentence'], unmasker_bertLarge
)

# BERT predictions for each of the four T5 paraphrase columns, in column order.
(
    outputs_bert_bornIn_t5_0,
    outputs_bert_bornIn_t5_1,
    outputs_bert_bornIn_t5_2,
    outputs_bert_bornIn_t5_3,
) = [
    process_prompt_results2(parafrase_t5_bornIn[nome_coluna], unmasker_bertLarge)
    for nome_coluna in (
        'T5_paraphrased_filled_sentence_0',
        'T5_paraphrased_filled_sentence_1',
        'T5_paraphrased_filled_sentence_2',
        'T5_paraphrased_filled_sentence_3',
    )
]

In [60]:
# One row per example: the gold answer next to the predictions for the original
# sentence and for each T5 paraphrase.
outputs_bornIn_t5 = pd.DataFrame(
    dict(
        answers=answer_filled_sentences_bornIn_t5,
        original=outputs_bert_bornIn_original,
        t5_0=outputs_bert_bornIn_t5_0,
        t5_1=outputs_bert_bornIn_t5_1,
        t5_2=outputs_bert_bornIn_t5_2,
        t5_3=outputs_bert_bornIn_t5_3,
    )
)

In [61]:
# Write the collected predictions to CSV (the index is written too, as no `index=` is given).
# NOTE(review): absolute local path - consider a configurable data directory.
outputs_bornIn_t5.to_csv('D:/tese/code/data/novos_data/outputs_bornIn_t5.csv')