In [None]:
# Set up the model connection
import os  # imported explicitly: os.getenv below must not depend on the star import leaking `os`

from dotenv import load_dotenv
from openai import OpenAI

# Star import kept as-is: later cells call names (convert_*, upload) that are not
# defined in this notebook and presumably come from here.
from converter.converter import *

# Read environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    # The endpoint can be overridden with VLLM_BASE_URL; the default is the
    # server address this notebook originally hard-coded.
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8098/v1"),
)
# Model identifier passed to the LLM-backed converters in later cells.
model = "meta-llama/Llama-3.3-70B-Instruct"

## Load original dataset

In [None]:
import pandas as pd
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import spacy

# spaCy pipeline used for POS tagging; handed to several converters in later cells.
nlp = spacy.load("en_core_web_sm")
# Enable tqdm's pandas integration (progress bars for pandas apply-style calls).
tqdm.pandas()

# Target repository for the modified dataset: "<user>/<dataset name>".
hf_username = "LFrancis"
modified_dataset_name = "MuSR-NoOp-Plus"
repo_id = "/".join((hf_username, modified_dataset_name))

# Original benchmark; later cells iterate over its subsets.
dataset = load_dataset("TAUR-Lab/MuSR")

# Separate mapping that collects the original subsets plus every generated variant.
# Keeping it distinct from `dataset` lets later cells add keys to `dataset_dict`
# while looping over `dataset`.
dataset_dict = DatasetDict(dataset)
dataset

## Paraphrase Type: Naive Addition

In [None]:
# Add a "<subset>_naive" entry for every original subset.
# `convert_naive` is defined outside this notebook (presumably in converter.converter).
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_naive"] = convert_naive(
        subset_frame,
        question_column="narrative",
        subject_list_column="choices",
    )
dataset_dict.keys()

In [None]:
# Publish the current contents of dataset_dict (original subsets plus any variants
# added so far) to repo_id. `upload` is not defined in this notebook; presumably
# it comes from the converter star import.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Instruction prompt passed to convert_additional as `custom_preprompt`.
# Written as adjacent string literals (one per sentence) that Python joins into a
# single string; the wording is kept exactly as it was, because the prompt text
# determines what the model generates.
custom_preprompt = (
    "Give me one additional sentence to this story that has no direct effect on the storyline. "
    "The sentence should include one of the characters. "
    "Example additional sentences might focus on unnecessary details, add unnecessary information about one of the characters past or thoughts. "
    "Only output exactly only the additional sentence and nothing else, the output will be copy/pasted as is. "
    "It is extremely important that the sentence does not effect the storyline. "
    "The sentence should try to confuse an inattentive detective."
)

In [None]:
# Add a "<subset>_addition" entry for every original subset.
# `convert_additional` receives the LLM client, model name, spaCy pipeline and
# the custom prompt; it is defined outside this notebook.
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_addition"] = convert_additional(
        subset_frame,
        client,
        model,
        nlp,
        custom_preprompt=custom_preprompt,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Publish the current contents of dataset_dict to repo_id, including the
# "_addition" variants if the cell above has been run.
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [4]:
# Add a "<subset>_lexicon" entry for every original subset.
# Slow cell: in the recorded run, two subsets took about 57 minutes each and the
# third about 23 minutes.
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_lexicon"] = convert_lexicon(
        subset_frame,
        client,
        model,
        nlp,
        question_column="narrative",
    )
dataset_dict.keys()

100%|██████████| 250/250 [57:38<00:00, 13.83s/it] 
100%|██████████| 256/256 [57:27<00:00, 13.47s/it] 
100%|██████████| 250/250 [23:17<00:00,  5.59s/it]


dict_keys(['murder_mysteries', 'object_placements', 'team_allocation', 'murder_mysteries_lexicon', 'object_placements_lexicon', 'team_allocation_lexicon'])

In [5]:
# Publish every entry of dataset_dict to repo_id. In the recorded run this
# reported each key in turn, and the three unchanged original subsets were
# skipped with "No files have been modified since last commit".
upload(dataset_dict, repo_id)

Uploading murder_mysteries


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading object_placements


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading team_allocation


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading murder_mysteries_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading object_placements_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

Uploading team_allocation_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

## Paraphrase Type: Syntax-Changes


In [None]:
# Add a "<subset>_syntax" entry for every original subset.
# `convert_syntax` takes only the spaCy pipeline here (no LLM client or model).
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_syntax"] = convert_syntax(
        subset_frame,
        nlp,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Publish the current contents of dataset_dict to repo_id, including the
# "_syntax" variants if the cell above has been run.
upload(dataset_dict, repo_id)

## Paraphrase Type: Typo

In [None]:
# Add a "<subset>_typo" entry for every original subset.
# `convert_typo` is called with just the DataFrame and the text column name.
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_typo"] = convert_typo(
        subset_frame,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Publish the current contents of dataset_dict to repo_id, including the
# "_typo" variants if the cell above has been run.
upload(dataset_dict, repo_id)

## Paraphrase Type: Scramble

In [None]:
# Add a "<subset>_scramble" entry for every original subset.
# `convert_scramble` takes the spaCy pipeline alongside the DataFrame.
for subset in dataset:
    subset_frame = pd.DataFrame(dataset[subset])
    dataset_dict[f"{subset}_scramble"] = convert_scramble(
        subset_frame,
        nlp,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Publish the current contents of dataset_dict to repo_id, including the
# "_scramble" variants if the cell above has been run.
upload(dataset_dict, repo_id)