## Load original dataset

In [2]:
import ast

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, DownloadMode
from openai import OpenAI
from tqdm import tqdm
import spacy

# Load spaCy for POS tagging
nlp = spacy.load("en_core_web_sm")

# Register `progress_apply` on pandas objects; the build loops below use it for row-wise transforms.
tqdm.pandas()

# Original benchmark; its subsets are the inputs for every paraphrase type below.
dataset = load_dataset("TAUR-Lab/MuSR")
modified_dataset_name = "MuSR-NoOp-Plus"

# Target repository on the Hugging Face Hub: "<user>/<dataset name>".
hf_username = "LFrancis"
repo_id = f"{hf_username}/{modified_dataset_name}"

# Chat-completions client pointed at a custom endpoint.
# NOTE(review): api_key and base_url are hard-coded; the key looks like a placeholder —
# confirm, and consider reading both from the environment before sharing the notebook.
client = OpenAI(
    api_key="asdasdasd",
    base_url="http://134.76.18.30:8081/v1"
)
# Working collection that later cells extend with the derived subsets; upload() pushes all of it.
dataset_dict = DatasetDict(dataset)
# Tiny story used to smoke-test each transformation before running it on the full data.
example_story = "Once upon a time, there was a brave knight who fought dragons."
# Last expression of the cell: display the loaded subsets.
dataset

Generating murder_mysteries split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating object_placements split:   0%|          | 0/256 [00:00<?, ? examples/s]

Generating team_allocation split:   0%|          | 0/250 [00:00<?, ? examples/s]

DatasetDict({
    murder_mysteries: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 250
    })
    object_placements: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 256
    })
    team_allocation: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 250
    })
})

In [6]:
def upload():
    """Push every subset currently held in ``dataset_dict`` to ``repo_id``.

    Each subset is published as a split named after its key. The split is
    passed by keyword because the meaning of the second positional argument
    of ``push_to_hub`` is not the same in every ``datasets`` release (it has
    been both the split and the config name); the keyword keeps the outcome
    independent of the installed version.
    """
    for subset in dataset_dict.keys():
        print("Uploading", subset)
        dataset_dict[subset].push_to_hub(repo_id, split=subset)

## Paraphrase Type: Naive Addition

In [7]:
from pprint import pprint

# Define the sentence to append
def naive_sentence(subject: str) -> str:
    """Return the fixed filler sentence with ``subject`` as its grammatical subject."""
    return f"{subject} goes to buy some icecream."


def add_sentence_to_question(question: str, subject: str) -> str:
    """Append the filler sentence about ``subject`` to ``question``.

    Args:
        question: Text to extend; it is returned unchanged as the prefix.
        subject: Name inserted verbatim (no capitalisation) into the filler sentence.

    Returns:
        ``question``, a single space, then the filler sentence.
    """
    return question + " " + naive_sentence(subject)

# Smoke-test the helper on the toy story; the cell displays the extended text.
add_sentence_to_question(example_story, "knight")

'Once upon a time, there was a brave knight who fought dragons. knight goes to buy some icecream.'

In [8]:
def add_subject_to_choices(row):
    """Extend a row's narrative with the filler sentence about its first choice.

    ``row["choices"]`` holds a Python-literal string; it is parsed and its
    first element becomes the subject passed to ``add_sentence_to_question``.
    The row is modified in place and returned.
    """
    parsed_choices = ast.literal_eval(row["choices"])
    first_choice: str = parsed_choices[0]
    extended_narrative = add_sentence_to_question(row["narrative"], first_choice)
    row["narrative"] = extended_narrative
    return row

# Derive a "<subset>_naive" variant of every original subset and register it in dataset_dict.
for subset in dataset.keys():
    df_naive = pd.DataFrame(dataset[subset])
    # axis=1 hands each row to the callback; progress_apply adds the tqdm progress bar.
    df_naive = df_naive.progress_apply(add_subject_to_choices, axis=1)
    dataset_naive = Dataset.from_pandas(df_naive)
    dataset_dict[subset + "_naive"] = dataset_naive
# Show which subsets the collection now holds.
pprint(list(dataset_dict.keys()))

100%|██████████| 250/250 [00:00<00:00, 17769.16it/s]
100%|██████████| 256/256 [00:00<00:00, 32096.07it/s]
100%|██████████| 250/250 [00:00<00:00, 32451.60it/s]

['murder_mysteries',
 'object_placements',
 'team_allocation',
 'murder_mysteries_naive',
 'object_placements_naive',
 'team_allocation_naive']





In [9]:
# Push every subset currently held in dataset_dict (originals included) to the Hub.
upload()

Uploading murder_mysteries


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading object_placements_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

Uploading team_allocation_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

## Paraphrase Type: Addition

In [165]:
# Instruction sent ahead of every story when asking the model for a distractor sentence.
preprompt = "Give me one additional sentence to this story that has no direct effect on the storyline. The sentence should include one of the characters. Example additional sentences might focus on unnecessary details, add unnecessary information about one of the characters past or thoughts. Only output exactly only the additional sentence and nothing else, the output will be copy/pasted as is. It is extremely important that the sentence does not effect the storyline."


def addition_prompt(story: str) -> str:
    """Build the full user prompt: the instruction, the quoted story, and an answer cue."""
    return preprompt + f"\nStoryline: \n'{story}'\nAdditional sentence:"

def add_additional_information(narrative: str) -> str:
    """Ask the chat model for one irrelevant sentence and append it to ``narrative``.

    Args:
        narrative: Story text; it is embedded in the prompt built by ``addition_prompt``.

    Returns:
        ``narrative``, a single space, then the model's sentence with surrounding
        whitespace removed.

    Raises:
        ValueError: If the reply carries no text (``None`` or only whitespace).
            Raising here replaces the opaque ``TypeError`` that concatenating
            ``None`` would produce.
    """
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": addition_prompt(narrative),
            }
        ],
        model="gpt-4o",
    )
    # `content` can be None; treat that the same as an empty reply.
    extra_sentence = (response.choices[0].message.content or "").strip()
    if not extra_sentence:
        raise ValueError("The model returned no additional sentence for this narrative.")
    return narrative + " " + extra_sentence

# Try the helper on the toy story; this sends one request through `client`.
add_additional_information(example_story)

"Once upon a time, there was a brave knight who fought dragons. Henry, the blacksmith's cat, had taken to sleeping in the knight's favorite leather chair whenever he was away from the castle."

In [166]:
def add_subject_to_choices(row):
    """Replace a row's narrative with its model-extended version.

    NOTE(review): this reuses the name of the row callback defined in the
    "Naive Addition" cell and therefore shadows it once this cell has run.
    The row is modified in place and returned.
    """
    extended_narrative = add_additional_information(row["narrative"])
    row["narrative"] = extended_narrative
    return row

# Derive a "<subset>_addition" variant of every original subset and register it in dataset_dict.
for subset in dataset.keys():
    df_additional = pd.DataFrame(dataset[subset])
    # The callback sends one chat-completion request per row, so this loop is slow.
    df_additional = df_additional.progress_apply(add_subject_to_choices, axis=1)
    dataset_additional = Dataset.from_pandas(df_additional)
    dataset_dict[subset + "_addition"] = dataset_additional
# Show which subsets the collection now holds.
pprint(list(dataset_dict.keys()))

100%|██████████| 250/250 [02:51<00:00,  1.46it/s]
100%|██████████| 256/256 [02:52<00:00,  1.48it/s]
100%|██████████| 250/250 [02:32<00:00,  1.64it/s]

['murder_mysteries',
 'object_placements',
 'team_allocation',
 'murder_mysteries_naive',
 'object_placements_naive',
 'team_allocation_naive',
 'murder_mysteries_addition',
 'object_placements_addition',
 'team_allocation_addition']





In [167]:
# Push every subset currently held in dataset_dict (originals included) to the Hub.
upload()

Uploading murder_mysteries


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading object_placements_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

Uploading team_allocation_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

## Paraphrase Type: Lexicon-Changes


In [173]:
from concurrent.futures import ThreadPoolExecutor


def fetch_alternative(blanked, original_word):
    """Ask the model for a one-word substitute for the blanked-out word.

    Args:
        blanked: Text pieces of the passage with the target word replaced by
            the ``<BLANK>`` placeholder; they are concatenated without a separator.
        original_word: The word that was blanked out; named in the prompt and
            used as the fallback return value.

    Returns:
        The model's answer (lower-cased, stripped, full stops removed) when the
        log-probability of its first generated token is at least -1; otherwise
        ``original_word`` exactly as passed in. ``original_word`` is also
        returned when the reply has no text, no usable word, or no log-probabilities.
    """
    prompt = f"Output exactly one word that fits where the placeholder '<BLANK>' is placed and has similar meaning to '{original_word}'. No other output besides the word.\nText: '{''.join(blanked)}'"
    response = client.chat.completions.create(
        model="asdasd",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=5,
        top_logprobs=5,
        logprobs=True,
        temperature=0.0,
    )
    choice = response.choices[0]

    # Both the log-probabilities and the message text can be missing from a reply.
    token_logprobs = getattr(choice.logprobs, "content", None)
    candidate = (choice.message.content or "").lower().strip().replace(".", "").strip()
    if not token_logprobs or not candidate:
        # An empty candidate must not be reported as a replacement: the caller
        # would substitute it and thereby delete the word.
        return original_word

    # Accept the suggestion only if the model was reasonably sure of its first token.
    if token_logprobs[0].logprob >= -1:
        return candidate
    return original_word


def paraphrase_with_second_best_threading(text):
    """Replace the adjectives of ``text`` with model-suggested alternatives.

    Every token tagged ``ADJ`` is blanked out in its own copy of the text and
    sent to ``fetch_alternative`` on a thread pool. All requests are built from
    the unmodified text; substitutions are applied only afterwards.

    A substitution is applied only when the suggestion is non-empty and differs
    from the original word (ignoring case, surrounding whitespace and full
    stops). The replacement keeps the original token's trailing whitespace and
    its leading capital, if it had one.

    Args:
        text: Passage to paraphrase.

    Returns:
        The passage with accepted substitutions applied.
    """
    def _normalise(word):
        # Same normalisation for both sides of the "did it change?" comparison.
        return word.lower().strip().replace(".", "").strip()

    doc = nlp(text)
    words = [token.text_with_ws for token in doc]

    with ThreadPoolExecutor() as executor:
        pending = []
        for i, token in enumerate(doc):
            if token.pos_ != "ADJ":
                continue
            blanked = words.copy()
            # Keep the token's trailing space so the placeholder is not glued
            # to the following word in the prompt text.
            blanked[i] = "<BLANK>" + token.whitespace_
            # Send the bare word, without its trailing whitespace.
            pending.append((i, executor.submit(fetch_alternative, blanked, token.text)))

        for i, future in pending:
            original = doc[i].text
            alt = _normalise(future.result())
            if not alt or alt == _normalise(original):
                # Nothing usable came back, or the model echoed the original.
                continue
            if original[:1].isupper():
                # Suggestions arrive lower-cased; restore a leading capital.
                alt = alt[:1].upper() + alt[1:]
            words[i] = alt + doc[i].whitespace_

    return "".join(words)

# Run the adjective substitution on the toy story and print both versions for comparison.
paraphrased_story = paraphrase_with_second_best_threading(example_story)
print("Original Story:", example_story)
print("Paraphrased Story:", paraphrased_story)

Original Story: Once upon a time, there was a brave knight who fought dragons.
Paraphrased Story: Once upon a time, there was a hero knight who fought dragons.


In [178]:
def paraphrase_single_words(row):
    """Row callback: swap the narrative for its adjective-paraphrased version.

    The row is modified in place and returned.
    """
    paraphrased_narrative = paraphrase_with_second_best_threading(row["narrative"])
    row["narrative"] = paraphrased_narrative
    return row


# Derive a "<subset>_lexicon" variant of every original subset and register it in dataset_dict.
for subset in dataset.keys():
    df_lex = pd.DataFrame(dataset[subset])
    # Each row fans out one model request per adjective in its narrative.
    df_lex = df_lex.progress_apply(paraphrase_single_words, axis=1)
    dataset_lex = Dataset.from_pandas(df_lex)
    dataset_dict[subset + "_lexicon"] = dataset_lex

# Last expression of the cell: display the subset names now held.
dataset_dict.keys()

100%|██████████| 250/250 [28:54<00:00,  6.94s/it]
100%|██████████| 256/256 [28:38<00:00,  6.71s/it]
100%|██████████| 250/250 [12:19<00:00,  2.96s/it]


dict_keys(['murder_mysteries', 'object_placements', 'team_allocation', 'murder_mysteries_naive', 'object_placements_naive', 'team_allocation_naive', 'murder_mysteries_lexicon', 'object_placements_lexicon', 'team_allocation_lexicon'])

In [182]:
# Push every subset currently held in dataset_dict (originals included) to the Hub.
upload()

Uploading murder_mysteries


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


## Paraphrase Type: Syntax-Changes


In [183]:
def rephrase_sentence(sentence):
    """Move the adverbs of a single sentence to its front.

    Tokens tagged ``ADV`` are pulled out in their original order and placed
    before all remaining tokens. Casing is preserved, with two exceptions:

    * the word that used to open the sentence is de-capitalised when an adverb
      is moved in front of it, unless it is a proper noun, part of an entity,
      the pronoun "I", or an all-caps word;
    * the first character of the result is upper-cased.

    Spacing is repaired around moved adverbs: a moved adverb is always followed
    by one space, and when it stood directly before punctuation (or the end of
    the sentence) the space it leaves behind is removed. Adverbs that already
    open the sentence stay where they are.

    This is a purely positional reordering; no attempt is made to keep commas
    or multi-word adverbials grammatical.

    Args:
        sentence: One sentence of text.

    Returns:
        The reordered sentence, or ``""`` for empty input.
    """
    doc = nlp(sentence)

    fronted = []    # (text, trailing whitespace, was_displaced) for every adverb
    remainder = []  # text_with_ws of every other token, in original order
    opener_is_common_word = False

    for index, token in enumerate(doc):
        if token.pos_ == "ADV":
            displaced = bool(remainder)
            if displaced and not token.whitespace_:
                # The adverb was directly followed by punctuation or the sentence
                # end; drop the space that separated it from the previous word.
                remainder[-1] = remainder[-1].rstrip(" ")
            fronted.append((token.text, token.whitespace_, displaced))
            continue

        if index == 0:
            keeps_capital = (
                token.pos_ == "PROPN"
                or bool(token.ent_type_)
                or token.text == "I"
                or (len(token.text) > 1 and token.text.isupper())
            )
            opener_is_common_word = not keeps_capital
        remainder.append(token.text_with_ws)

    if fronted and opener_is_common_word:
        # The former first word now sits mid-sentence.
        opener = remainder[0]
        remainder[0] = opener[:1].lower() + opener[1:]

    lead = []
    last = len(fronted) - 1
    for position, (text, whitespace, displaced) in enumerate(fronted):
        if position < last or displaced:
            # Something different follows this adverb now; separate with a space.
            lead.append(text + " ")
        else:
            # Still followed by its original neighbour; keep the original spacing.
            lead.append(text + whitespace)

    reordered = "".join(lead + remainder)
    # Capitalize the first word of the sentence
    if reordered:
        reordered = reordered[0].upper() + reordered[1:]

    return reordered

# Test the function
# The sample has one adverb ("quickly") and proper nouns, so it exercises both
# the reordering and the capitalisation handling.
sentence = "Alice quickly ran to the store in New York."
rephrased = rephrase_sentence(sentence)
print("Original:", sentence)
print("Rephrased:", rephrased)

Original: Alice quickly ran to the store in New York.
Rephrased: Quickly Alice ran to the store in New York.


In [184]:
import spacy

# Load the SpaCy model
# NOTE(review): the same model is already bound to `nlp` in the first cell; this reload looks redundant.
nlp = spacy.load("en_core_web_sm")

# Function to split a paragraph into sentences
def split_paragraph_into_sentences(paragraph):
    """Return the text of every sentence the spaCy pipeline finds in ``paragraph``.

    The strings come from each sentence span's ``.text`` and are returned in
    document order.
    """
    parsed = nlp(paragraph)
    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text)
    return sentence_texts

# Example paragraph
# Spans two lines on purpose, so the sample also contains line breaks.
paragraph = """Because it was raining, she stayed indoors. The weather had been unpredictable all week.
She decided to read a book, hoping the rain would stop. By evening, the sun came out.
"""

# Split paragraph into sentences
sentences = split_paragraph_into_sentences(paragraph)

# Print the sentences
# The "#" prefix makes the start of each returned sentence visible in the output.
for sentence in sentences:
    print("#",sentence)


# Because it was raining, she stayed indoors.
# The weather had been unpredictable all week.

# She decided to read a book, hoping the rain would stop.
# By evening, the sun came out.



In [185]:
def rephrase_narrative(row):
    """Row callback: rephrase a narrative sentence by sentence.

    The narrative is split with ``split_paragraph_into_sentences``, each
    sentence is passed through ``rephrase_sentence``, and the results are
    reassembled. The sentence strings do not carry the space that followed them
    in the original text, so a single space is inserted between two pieces
    unless the earlier one already ends in whitespace (for example a line
    break). Empty pieces are skipped.

    The row is modified in place and returned.
    """
    pieces = []
    for sentence in split_paragraph_into_sentences(row["narrative"]):
        rephrased = rephrase_sentence(sentence)
        if not rephrased:
            continue
        if pieces and not pieces[-1][-1].isspace():
            # Restore the separator that was lost when the text was split.
            pieces.append(" ")
        pieces.append(rephrased)
    row["narrative"] = "".join(pieces)
    return row


# Derive a "<subset>_syntax" variant of every original subset and register it in dataset_dict.
for subset in dataset.keys():
    df_syn = pd.DataFrame(dataset[subset])
    # Runs locally with spaCy only; no model requests are involved in this transform.
    df_syn = df_syn.progress_apply(rephrase_narrative, axis=1)
    dataset_syn = Dataset.from_pandas(df_syn)
    dataset_dict[subset + "_syntax"] = dataset_syn

# Last expression of the cell: display the subset names now held.
dataset_dict.keys()

100%|██████████| 250/250 [00:58<00:00,  4.29it/s]
100%|██████████| 256/256 [00:42<00:00,  5.96it/s]
100%|██████████| 250/250 [00:27<00:00,  9.18it/s]


dict_keys(['murder_mysteries', 'object_placements', 'team_allocation', 'murder_mysteries_naive', 'object_placements_naive', 'team_allocation_naive', 'murder_mysteries_lexicon', 'object_placements_lexicon', 'team_allocation_lexicon', 'murder_mysteries_syntax', 'object_placements_syntax', 'team_allocation_syntax'])

In [198]:
# Push every subset currently held in dataset_dict (originals included) to the Hub.
upload()

Uploading murder_mysteries


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


## Paraphrase Type: Deletion


not applicable

In [197]:
# Fetch the published dataset again, forcing a fresh download, to inspect what is on the Hub.
# NOTE(review): the repository name is spelled out here instead of reusing `repo_id` — confirm they are meant to match.
load_dataset("LFrancis/MuSR-NoOp-Plus",download_mode="force_redownload")

(…)ysteries_addition-00000-of-00001.parquet:   0%|          | 0.00/844k [00:00<?, ?B/s]

(…)mysteries_lexicon-00000-of-00001.parquet:   0%|          | 0.00/822k [00:00<?, ?B/s]

(…)r_mysteries_naive-00000-of-00001.parquet:   0%|          | 0.00/819k [00:00<?, ?B/s]

Generating murder_mysteries_addition split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating murder_mysteries_lexicon split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating murder_mysteries_naive split:   0%|          | 0/250 [00:00<?, ? examples/s]

DatasetDict({
    murder_mysteries_addition: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 250
    })
    murder_mysteries_lexicon: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 250
    })
    murder_mysteries_naive: Dataset({
        features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice'],
        num_rows: 250
    })
})