# Test LLM with a text2speech output

In [1]:
from parrot import Parrot
import pandas as pd
import random
import torch
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline, T5ForConditionalGeneration, T5Tokenizer

In [2]:
# Prompt prefix that is prepended to every sentence before tokenization.
task_prefix = "paraphrase: "

# Tokenizer is loaded from the "prithivida/parrot_paraphraser_on_T5" identifier,
# while the seq2seq weights are read from the local ./FineTunedParrotParaphraser
# directory.
tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")

# Load the model and move it to the GPU in a single expression; binding the
# result means no model repr is echoed as cell output.
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(
    "./FineTunedParrotParaphraser"
).to('cuda')

In [3]:
# Read the descriptions CSV and keep only the 'general_informations' column,
# converted to a plain Python list.
general_descriptions = (
    pd.read_csv('../data/general_informations.csv')['general_informations']
    .to_list()
)

In [9]:
# Split the second description on '.' and keep only the fragments that are
# longer than 20 characters (drops empty and very short pieces).
sentences = [
    fragment
    for fragment in general_descriptions[1].split('.')
    if len(fragment) > 20
]

In [10]:
# Show the filtered sentence list as this cell's output.
sentences

['This building was a ancillary building to building one',
 ' It was used for religious purposes',
 ' A lot findings from the Geometric to Hellenistic times came to light from this building',
 ' It was dated to the late Classical to early Hellenistic times',
 ' During its excavation, a lot of pottery, figurines, lighting vessels and several animal bones were found',
 ' This area is important among other things because of the presence of many marble sculptures and also inscriptions']

In [13]:
def paraphase_text(text):
    """Generate paraphrases of ``text`` with the module-level paraphrase model.

    The input is prefixed with ``task_prefix``, tokenized with ``tokenizer``
    and passed to ``paraphrase_model.generate`` using grouped beam search
    (32 beams split into 4 groups, diversity penalty 2.0), requesting 4
    returned sequences.

    Args:
        text: The sentence to paraphrase.

    Returns:
        A list of unique paraphrase strings. Duplicates are removed while
        keeping the order in which the model returned the sequences, so the
        result is the same from run to run.
    """
    inputs = tokenizer([task_prefix + text], return_tensors="pt", padding=True)
    # Follow whatever device the model lives on instead of hard-coding 'cuda',
    # so the inputs and the weights can never end up on different devices.
    inputs = inputs.to(paraphrase_model.device)
    preds = paraphrase_model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs['attention_mask'],
        do_sample=False,
        max_length=256,
        num_beams=32,
        num_beam_groups=4,
        diversity_penalty=2.0,
        early_stopping=True,
        num_return_sequences=4,
    )
    generated_phrases = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    # Remove duplicates. dict.fromkeys keeps first-seen order, whereas a set
    # would shuffle the phrases arbitrarily and differently between runs.
    return list(dict.fromkeys(generated_phrases))

In [14]:
# Print each source sentence between separator rules, followed by every
# paraphrase generated for it. The inner and outer loops use distinct names so
# the paraphrase loop no longer overwrites the sentence being iterated.
for input_phrase in sentences:
    print("-"*100)
    print("Input_phrase: ", input_phrase)
    print("-"*100)
    phrases = paraphase_text(input_phrase)
    for paraphrase in phrases:
        print(paraphrase)

----------------------------------------------------------------------------------------------------
Input_phrase:  This building was a ancillary building to building one
----------------------------------------------------------------------------------------------------
This building was an ancillary building to building one
This building was an ancillary building to the building one
This building was an auxiliary building to building one
This building was an auxiliary building to the building one
----------------------------------------------------------------------------------------------------
Input_phrase:   It was used for religious purposes
----------------------------------------------------------------------------------------------------
It was used to religious purposes
It was used for religious purposes
It was used in religious purposes
It was used for religious reasons
----------------------------------------------------------------------------------------------------
Input