In [3]:
# Include the necessary libraries
import torch
import warnings
import yaml
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [4]:
# suppress warnings
warnings.filterwarnings("ignore")

In [5]:
tokenizer = AutoTokenizer.from_pretrained("./FineTunedParrotParaphraser")
model = AutoModelForSeq2SeqLM.from_pretrained("./FineTunedParrotParaphraser")

In [8]:
# read a yaml file
def read_yaml_file(file_path):
    with open(file_path, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

In [9]:
# clean phrases
def clean_phrases(examples):
    phrases = examples.split('\n')
    # remove empty strings
    phrases = [phrase for phrase in phrases if phrase != '']
    # remove dash and space
    phrases = [phrase[2:] for phrase in phrases]
    return phrases

In [21]:
task_prefix = "paraphrase: "

# create paraphrased folder
if not os.path.exists('../data/nlu/paraphrased'):
    os.makedirs('../data/nlu/paraphrased')
    
# create paraphrased folder
if not os.path.exists('../data/responses/paraphrased'):
    os.makedirs('../data/responses/paraphrased')

# iterate over the nlu files
for file in os.listdir('../data/nlu/needs_paraphrasing'):
    # if file is a yaml file
    if file.endswith('.yml'):
        # print the file name
        print(f'Looking at {file}')
        # read yaml file
        data = read_yaml_file('../data/nlu/needs_paraphrasing/' + file)
        # file name
        file_name = file.split('.')[0]
        if not os.path.exists(f'../data/nlu/paraphrased/{file_name}'):
            os.makedirs(f'../data/nlu/paraphrased/{file_name}')
        # for each intent in nlu
        for row in data['nlu']:
            intent = row['intent']
            examples = row['examples']
            # get phrases
            phrases = clean_phrases(examples)
            initial_len = len(phrases)
            final_phrases = []
            for phrase in phrases:
                phrase = tokenizer([task_prefix + phrase], return_tensors="pt", padding=True)
                para_phrases = model.generate(
                    phrase['input_ids'],
                    do_sample=False, 
                    max_length=1026, 
                    num_beams = 16,
                    num_beam_groups = 4,
                    diversity_penalty = 4.0,
                    early_stopping=True,
                    num_return_sequences=4
                    )
                # final_phrases.append(phrase)
                para_phrases = tokenizer.batch_decode(para_phrases, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for para_phrase in para_phrases:
                    # if score is bigger than 10 append the paraphrased phrase to the list
                    final_phrases.append(para_phrase)
            final_len = len(final_phrases)
            # print the number of paraphrased phrases
            print(f"Number of paraphrased phrases for {intent}: {final_len - initial_len}")
            # capitalize the first letter of the phrases
            final_phrases = [response[0].capitalize() + response[1:] for response in final_phrases]
            final_phrases = ['- ' + phrase for phrase in final_phrases]
            updated_phrases = '\n'.join(final_phrases)
            # write to data.txt
            with open(f"../data/nlu/paraphrased/{file_name}/{intent}.txt", 'w') as f:
                f.write(updated_phrases)
                
                
# # iterate over the responses files
# for file in os.listdir('../data/responses/needs_paraphrasing'):
#     # if file is a yaml file
#     if file.endswith('.yml'):
#         # print the file name
#         print(f'Looking at {file}')
#         # read yaml file
#         data = read_yaml_file('../data/responses/needs_paraphrasing/' + file)
#         # file name
#         file_name = file.split('.')[0]
#         if not os.path.exists(f'../data/responses/paraphrased/{file_name}'):
#             os.makedirs(f'../data/responses/paraphrased/{file_name}')
#         # for each intent in nlu
#         for row in data['responses']:
#             final_responses = []
#             initial_len = len(data['responses'][row])
#             for sentence in data['responses'][row]:
#                 sentence = sentence['text']
#                 final_responses.append(sentence)
#                 para_phrases = model.augment(input_phrase=sentence)
#                 for para_phrase in para_phrases:
#                     # if score is bigger than 10 append the paraphrased phrase to the list
#                     if para_phrase[1] > 10:
#                         # append the paraphrased phrase to the list
#                         final_responses.append(para_phrase[0])
#             final_len = len(final_responses)
#             # print the number of paraphrased phrases
#             print(f"Number of paraphrased phrases for {row}: {final_len - initial_len}")
#             # capitalize the first letter of the phrases
#             final_responses = [response[0].capitalize() + response[1:] for response in final_responses]
#             # add the - text: part that is needed
#             final_responses = ['- text: ' + response for response in final_responses]
#             updated_responses = '\n'.join(final_responses)
#             # write to data.txt
#             with open(f"../data/responses/paraphrased/{file_name}/{row}.txt", 'w') as f:
#                 f.write(updated_responses)
        

Looking at pebbles_degredations.yml
Number of paraphrased phrases for inform_pebbles_with_cracks: 18
Number of paraphrased phrases for inform_pebbles_with_sulfation: 18
Number of paraphrased phrases for inform_pebbles_with_cavities: 30
Number of paraphrased phrases for inform_pebbles_with_crystal_decomposition: 27
Number of paraphrased phrases for inform_pebbles_with_disintegration: 15
Number of paraphrased phrases for inform_pebbles_with_infection: 69


In [22]:
# delete the needs_paraphrasing files
for file in os.listdir('../data/nlu/needs_paraphrasing'):
    os.remove(f'../data/nlu/needs_paraphrasing/{file}')
    
for file in os.listdir('../data/responses/needs_paraphrasing'):
    os.remove(f'../data/responses/needs_paraphrasing/{file}')