# Loading the model  

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
import os

# Hub identifier of the pretrained paraphrasing checkpoint and the local
# directories used to cache it between runs.
model_name = "tuner007/pegasus_paraphrase"
model_dir = "pegasus_model"
tokenizer_dir = "pegasus_tokenizer"

# Model: reuse the local copy when it exists; otherwise download it once and
# save it so later runs (and a fresh "Restart & Run All") work without the
# directory having been prepared by hand.
if os.path.isdir(model_dir):
    print("Model from dir")
    model = PegasusForConditionalGeneration.from_pretrained(model_dir)
else:
    print(f"Downloading model {model_name}")
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    model.save_pretrained(model_dir)

# Tokenizer: same load-or-download-and-cache strategy as the model.
if os.path.isdir(tokenizer_dir):
    print("Tokenizer from dir")
    tokenizer = PegasusTokenizerFast.from_pretrained(tokenizer_dir)
else:
    print(f"Downloading tokenizer {model_name}")
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
    tokenizer.save_pretrained(tokenizer_dir)




Model from dir
Tokenizer from dir


# Helper function: generate paraphrases of one sentence

In [3]:
def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
    """Generate paraphrases of a single sentence.

    Args:
        model: Object exposing ``generate(**kwargs)``; receives the encoded
            input plus the beam-search settings below.
        tokenizer: Callable that encodes a list of texts and also provides
            ``batch_decode`` to turn generated sequences back into text.
        sentence: The text to paraphrase.
        num_return_sequences: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.

    Returns:
        Whatever ``tokenizer.batch_decode`` yields for the generated
        sequences, decoded with special tokens stripped.
    """
    # Encode the sentence as a one-item batch.
    encoded = tokenizer(
        [sentence],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    # Search settings are kept apart from the encoded input for readability.
    search_settings = {
        "num_beams": num_beams,
        "num_return_sequences": num_return_sequences,
    }
    generated = model.generate(**encoded, **search_settings)

    # Convert the generated sequences back to plain text.
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Main: augment the CSV with paraphrased sentences

In [4]:

import pandas as pd

# Input CSV: the first column is read as the sentence and the second column as
# its label. The output CSV holds every original row followed by its
# paraphrases, each paraphrase carrying the label of the row it came from.
csv_file_path = "testCSV.csv"
output_csv_path = "outputFile.csv"
df = pd.read_csv(csv_file_path)

paraphrased_sentences = []
labels = []

for sentence, label in zip(df.iloc[:, 0], df.iloc[:, 1]):
    # Always keep the original sentence and label.
    paraphrased_sentences.append(sentence)
    labels.append(label)

    # pandas reads an empty cell as NaN (a float), which cannot be tokenized
    # as text; keep the row but do not try to paraphrase it, instead of
    # aborting the whole run on one bad record.
    if pd.isna(sentence):
        continue

    paraphrased = get_paraphrased_sentences(
        model, tokenizer, str(sentence), num_beams=10, num_return_sequences=5
    )

    # Append the paraphrases, repeating the label once per paraphrase.
    paraphrased_sentences.extend(paraphrased)
    labels.extend([label] * len(paraphrased))

# Create a new DataFrame with the original and paraphrased sentences and labels
output_df = pd.DataFrame({"Sentence": paraphrased_sentences, "Label": labels})

output_df.to_csv(output_csv_path, index=False)
