## Load original dataset

In [20]:
from dotenv import load_dotenv
import os
from pprint import pprint

import pandas
import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from openai import OpenAI
from tqdm import tqdm

# Load spaCy for POS tagging
nlp = spacy.load("en_core_web_sm")

tqdm.pandas()

selected_subset = "test"
dataset = load_dataset("openai/gsm8k", "main")[selected_subset]
modified_dataset_name = "GSM8k-NoOp-Plus"

hf_username = "LFrancis"
repo_id = f"{hf_username}/{modified_dataset_name}"
load_dotenv("../.env")
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url="http://134.76.18.30:8085/v1"
)
model = "meta-llama/Llama-3.1-8B-Instruct"
df = pd.DataFrame(dataset)
df

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."
...,...,...
1314,John had a son James when he was 19. James is...,Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...
1315,There are some oranges in a basket. Ana spends...,There are 60 minutes in an hour. Ana peels an ...
1316,Mark's car breaks down and he needs to get a n...,The discount on the radiator was 400*.8=$<<400...
1317,"Farmer Brown has 20 animals on his farm, all e...",Let C be the number of chickens.\nThere are 20...


In [3]:
dataset_dict = DatasetDict()
dataset_dict["main"] = Dataset.from_pandas(df)
example_question = "How can one differentiate a set and a dictionary in python?"
dataset_dict

DatasetDict({
    main: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [4]:
def upload():
    for subset in dataset_dict.keys():
        print("Uploading", subset)
        dataset_dict[subset].push_to_hub(repo_id, subset)

## Paraphrase Type: Naive Addition

In [5]:
def add_sentence_to_question(question: str) -> str:
    return question + " " + "Sebastian goes to buy icecream."


print("Rows mentioning 'Sebastian':", len(df[df['question'].str.contains("Sebastian")]))
print("Rows mentioning 'icecream':", len(df[df['question'].str.contains("icecream")]))
add_sentence_to_question(example_question)

Rows mentioning 'Sebastian': 0
Rows mentioning 'icecream': 0


'How can one differentiate a set and a dictionary in python? Sebastian goes to buy icecream.'

In [6]:
def add_naive_sentence(row):
    row["question"] = add_sentence_to_question(row["question"])
    return row

df_naive = pd.DataFrame(dataset_dict["main"])
df_naive = df_naive.progress_apply(add_naive_sentence, axis=1)
dataset_naive = Dataset.from_pandas(df_naive)
dataset_dict["main_naive"] = dataset_naive
pprint(list(dataset_dict.keys()))

100%|██████████| 1319/1319 [00:00<00:00, 26182.02it/s]

['main', 'main_naive']





In [7]:
upload()

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading main_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/327 [00:00<?, ?B/s]

## Paraphrase Type: Addition

In [15]:
preprompt = "Give me one additional sentence to this question that has no direct effect on the question. The sentence should be about a similar topic. It should not be a question. Example additional sentences might focus on unnecessary details, add unnecessary information about other fields of study. Only output exactly the additional sentence and nothing else, the output will be copy/pasted as is. It is extremely important that the sentence does not effect the answer to the question."

addition_prompt = lambda question: str(preprompt + f"\nQuestion: \n'{question}'\nAdditional sentence:")

def add_additional_information(narrative: str) -> str:
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": addition_prompt(narrative),
            }
        ],
        model=model,
    )
    result = response.choices[0].message.content
    return narrative + " " + result

add_additional_information(example_question)

'How can one differentiate a set and a dictionary in python? In computer science, the concept of data structures is closely related to theoretical computer science, which studies the fundamental properties and behaviors of computation itself.'

In [17]:
def add_additional_sentence(row):
    row["question"] = add_additional_information(row["question"])
    return row


df_additional = pd.DataFrame(dataset_dict["main"])
df_additional = df_additional.progress_apply(add_additional_sentence, axis=1)
dataset_additional = Dataset.from_pandas(df_additional)
dataset_dict["main_addition"] = dataset_additional
pprint(list(dataset_dict.keys()))

100%|██████████| 1319/1319 [07:55<00:00,  2.77it/s]

['main', 'main_naive', 'main_addition']





In [18]:
upload()

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/641 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Paraphrase Type: Lexicon-Changes


In [21]:
from concurrent.futures import ThreadPoolExecutor

def fetch_alternative(blanked, original_word):
    prompt = f"Output exactly one word that fits where the placeholder '<BLANK>' is placed and has similar meaning to '{original_word}'. No other output besides the word.\nText: '{''.join(blanked)}'"
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=5,
        top_logprobs=5,
        logprobs=True,
        temperature=0.0,
    )
    choice = response.choices[0]
    if choice.logprobs.content[0].logprob >= -1:
        return choice.message.content.lower().strip().replace(".", "")
    return original_word


def paraphrase_with_second_best_threading(text):
    doc = nlp(text)
    words = [token.text_with_ws for token in doc]
    pos_tags = [token.pos_ for token in doc]

    with ThreadPoolExecutor() as executor:
        futures = []
        for i, pos in enumerate(pos_tags):
            if pos == "ADJ":
                blanked = words.copy()
                blanked[i] = "<BLANK>"
                future = executor.submit(fetch_alternative, blanked, words[i])
                futures.append((i, future))

        for i, future in futures:
            alt = future.result()
            if alt != words[i]:
                words[i] = alt + doc[i].whitespace_

    return "".join(words)


story = "Once upon a time, there was a brave knight who fought dragons."

# Get the paraphrased story
paraphrased_story = paraphrase_with_second_best_threading(story)
print("Original Story:", story)
print("Paraphrased Story:", paraphrased_story)


Original Story: Once upon a time, there was a brave knight who fought dragons.
Paraphrased Story: Once upon a time, there was a hero knight who fought dragons.


In [22]:
def paraphrase_single_words(row):
    row["question"] = paraphrase_with_second_best_threading(row["question"])
    return row


df_lex = pd.DataFrame(dataset_dict["main"])
df_lex = df_lex.progress_apply(paraphrase_single_words, axis=1)
dataset_lex = Dataset.from_pandas(df_lex)
dataset_dict["main_lexicon"] = dataset_lex

dataset_dict.keys()

100%|██████████| 1319/1319 [03:10<00:00,  6.94it/s]


dict_keys(['main', 'main_naive', 'main_addition', 'main_lexicon'])

In [23]:
upload()

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/964 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Paraphrase Type: Syntax-Changes


In [24]:
def rephrase_sentence(sentence):
    # Parse the sentence
    doc = nlp(sentence)
    proper_nouns = {token.text.lower() for token in doc if token.pos_ == "PROPN" or token.ent_type_}

    # Example: Moving adverbs to the beginning
    words = []
    adverbs = []

    # Split ADV from sentence
    for token in doc:
        word = token.text_with_ws.capitalize() if token.text.lower() in proper_nouns else token.text_with_ws.lower()
        if token.pos_ == "ADV":  # Identify adverbs
            adverbs.append(word)
        else:
            words.append(word)

    reordered = "".join(adverbs + words)
    # Capitalize the first word of the sentence
    if reordered:
        reordered = reordered[0].upper() + reordered[1:]

    return reordered

# Test the function
sentence = "Alice quickly ran to the store in New York."
rephrased = rephrase_sentence(sentence)
print("Original:", sentence)
print("Rephrased:", rephrased)

Original: Alice quickly ran to the store in New York.
Rephrased: Quickly Alice ran to the store in New York.


In [25]:
# Example paragraph
paragraph = """Because it was raining, she stayed indoors. The weather had been unpredictable all week.
She decided to read a book, hoping the rain would stop. By evening, the sun came out.
"""

def split_paragraph_into_sentences(paragraph):
    doc = nlp(paragraph)
    sentences = [sent.text for sent in doc.sents]
    return sentences

# Split paragraph into sentences
sentences = split_paragraph_into_sentences(paragraph)

# Print the sentences
for sentence in sentences:
    print("#",sentence)


# Because it was raining, she stayed indoors.
# The weather had been unpredictable all week.

# She decided to read a book, hoping the rain would stop.
# By evening, the sun came out.



In [26]:
def rephrase_narrative(row):
    row["question"] = "".join([rephrase_sentence(sentence) for sentence in split_paragraph_into_sentences(row["question"])])
    return row


df_syn = pd.DataFrame(dataset_dict["main"])
df_syn = df_syn.progress_apply(rephrase_narrative, axis=1)
dataset_syn = Dataset.from_pandas(df_syn)
dataset_dict["main_syntax"] = dataset_syn

dataset_dict.keys()

100%|██████████| 1319/1319 [00:15<00:00, 84.04it/s]


dict_keys(['main', 'main_naive', 'main_addition', 'main_lexicon', 'main_syntax'])

In [31]:
pprint(dataset_dict[("main")][0]["question"])
pprint(dataset_dict[("main_syntax")][0]["question"])

('Janet’s ducks lay 16 eggs per day. She eats three for breakfast every '
 'morning and bakes muffins for her friends every day with four. She sells the '
 "remainder at the farmers' market daily for $2 per fresh duck egg. How much "
 "in dollars does she make every day at the farmers' market?")
('Janet’s ducks lay 16 eggs per day.She eats Three for breakfast Every Morning '
 'and bakes muffins for her friends Every day with Four.Daily she sells the '
 "remainder at the farmers' market for $2 per fresh duck egg.How much in "
 "dollars does she make every day at the farmers' market?")


In [27]:
upload()

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Paraphrase Type: Deletion


not applicable