In [None]:
# Setup model connection
from dotenv import load_dotenv
import os
from openai import OpenAI

from converter.converter import *

# Read variables from the .env file one directory above the current working
# directory so that VLLM_API_KEY (and the optional overrides below) are visible
# through os.getenv.
load_dotenv("../.env")

# Endpoint and model were previously hard-coded. They can now be overridden with
# the VLLM_BASE_URL / VLLM_MODEL environment variables; when those are unset or
# empty, the original values are used, so existing setups behave as before.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL") or "http://134.76.18.30:8085/v1",
)
model = os.getenv("VLLM_MODEL") or "meta-llama/Llama-3.1-8B-Instruct"

## Load original dataset

In [None]:
import pandas as pd
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import spacy

# Target repository id ("<user>/<dataset name>") that is later passed to `upload`.
hf_username = "LFrancis"
modified_dataset_name = "MuSR-NoOp-Plus"
repo_id = "/".join((hf_username, modified_dataset_name))

example_story = "Once upon a time, there was a brave knight who fought dragons."

# spaCy English pipeline; handed as `nlp` to the lexicon and syntax converters.
nlp = spacy.load("en_core_web_sm")
# Register tqdm's pandas integration.
tqdm.pandas()

# Load MuSR, then start `dataset_dict` as a separate DatasetDict holding the same
# splits, so converted variants can be added to it while `dataset` stays untouched.
dataset = load_dataset("TAUR-Lab/MuSR")
dataset_dict = DatasetDict(dataset)
dataset

## Paraphrase Type: Naive Addition

In [None]:
# Store a "<split>_naive" entry in dataset_dict for every split of the original dataset.
for subset, split in dataset.items():
    split_frame = pd.DataFrame(split)
    dataset_dict[f"{subset}_naive"] = convert_naive(
        split_frame,
        question_column="narrative",
        subject_list_column="choices",
    )
dataset_dict.keys()

In [None]:
# Pass the whole dataset_dict (original splits plus any variants added so far)
# and the target repo_id to `upload`. `upload` is not defined in this notebook;
# presumably it comes from the converter star-import -- TODO confirm what it does.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Instruction passed to convert_additional as `custom_preprompt`.
# Written as adjacent literals, one per sentence; the resulting string is unchanged.
custom_preprompt = (
    "Give me one additional sentence to this story that has no direct effect on the storyline. "
    "The sentence should include one of the characters. "
    "Example additional sentences might focus on unnecessary details, add unnecessary information about one of the characters past or thoughts. "
    "Only output exactly only the additional sentence and nothing else, the output will be copy/pasted as is. "
    "It is extremely important that the sentence does not effect the storyline."
)

In [None]:
# Store a "<split>_addition" entry in dataset_dict for every split of the original
# dataset, passing the model client, model name and custom pre-prompt to the converter.
for subset, split in dataset.items():
    split_frame = pd.DataFrame(split)
    dataset_dict[f"{subset}_addition"] = convert_additional(
        split_frame,
        client,
        model,
        custom_preprompt=custom_preprompt,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Call `upload` again with the full dataset_dict and repo_id; the dict still
# contains the original splits and every variant added to it so far.
# NOTE(review): `upload` is defined outside this notebook -- confirm its behavior.
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [None]:
# Store a "<split>_lexicon" entry in dataset_dict for every split of the original
# dataset; the converter receives the model client, model name and spaCy pipeline.
for subset, split in dataset.items():
    split_frame = pd.DataFrame(split)
    dataset_dict[f"{subset}_lexicon"] = convert_lexicon(
        split_frame,
        client,
        model,
        nlp,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Hand the complete dataset_dict (original splits plus all variants present at
# this point) and repo_id to `upload`.
# NOTE(review): `upload` is not defined in this notebook -- confirm what it does.
upload(dataset_dict, repo_id)

## Paraphrase Type: Syntax-Changes


In [None]:
# Store a "<split>_syntax" entry in dataset_dict for every split of the original
# dataset; this converter takes only the spaCy pipeline, not the model client.
for subset, split in dataset.items():
    split_frame = pd.DataFrame(split)
    dataset_dict[f"{subset}_syntax"] = convert_syntax(
        split_frame,
        nlp,
        question_column="narrative",
    )
dataset_dict.keys()

In [None]:
# Final call to `upload` with the full dataset_dict and repo_id, after the
# "_syntax" variants have been added to the dict.
# NOTE(review): `upload` is defined outside this notebook -- confirm its behavior.
upload(dataset_dict, repo_id)