In [None]:
# Setup model connection
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables (API keys etc.) from the .env file one directory up.
load_dotenv("../.env")

# OpenAI-compatible client for the inference server. The endpoint can be
# overridden through VLLM_BASE_URL so the notebook is not tied to one host;
# the default is the address that was previously hard-coded.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8085/v1"),
)
# Model identifier to request from the server; overridable through VLLM_MODEL.
model = os.getenv("VLLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")

## Load original dataset

In [None]:
def get_subsets(dataset_name: str) -> list:
    """Return the subset (config) names of a Hugging Face dataset.

    The datasets-server ``/splits`` endpoint answers with one record per
    (config, split) pair under the ``"splits"`` key. Callers in this notebook
    use every returned entry as a plain string (dictionary key, string
    concatenation), so the records are collapsed to their distinct ``config``
    names, in first-seen order.

    Args:
        dataset_name: Hub identifier of the dataset, e.g. ``"maveriq/bigbenchhard"``.

    Returns:
        List of distinct config names as strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response does not have the expected structure.
    """
    import requests

    token = os.getenv("HF_TOKEN")
    # Only authenticate when a token is configured; otherwise the header would
    # literally read "Bearer None".
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    response = requests.get(
        "https://datasets-server.huggingface.co/splits",
        params={"dataset": dataset_name},
        headers=headers,
        timeout=30,
    )
    # Surface HTTP errors here instead of failing later on a missing "splits" key.
    response.raise_for_status()
    records = response.json()["splits"]
    # dict.fromkeys removes duplicates (one config has several splits) while
    # preserving the order in which configs were reported.
    return list(dict.fromkeys(record["config"] for record in records))

In [None]:
# Inspect what get_subsets reports for the source dataset (shown as the cell output).
get_subsets("maveriq/bigbenchhard")

In [None]:
from pprint import pprint

import pandas
import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from openai import OpenAI
from tqdm import tqdm

# Load the spaCy English pipeline; `nlp` is used by later cells for POS tagging
# and sentence splitting.
nlp = spacy.load("en_core_web_sm")
# Register `progress_apply` on pandas objects (used by the per-subject loops below).
tqdm.pandas()

# Key used to pick one split out of the loaded dataset.
selected_subset = "train"
dataset_name = "maveriq/bigbenchhard"
modified_dataset_name = "BBH-NoOp-Plus"

# Repository id composed from the user name and the modified dataset's name.
hf_username = "LFrancis"
repo_id = f"{hf_username}/{modified_dataset_name}"

# Collects the original per-subject splits here and the perturbed variants
# ("<subject>_<suffix>") added by later cells.
dataset_dict = DatasetDict()
subjects = get_subsets(dataset_name)

# NOTE(review): the loop below compares each entry of `subjects` against the
# `subject` column and uses it as a dictionary key, so the entries must be
# hashable names — confirm what get_subsets returns.
# NOTE(review): assumes the dataset exposes an "all" config with `subject` and
# `question` columns — TODO confirm for this dataset.
dataset = load_dataset(dataset_name, "all")[selected_subset]
df = pd.DataFrame(dataset)
for subject in subjects:
    # One split per subject, re-indexed from 0.
    subject_df = df[df['subject'] == subject].reset_index(drop=True)
    subject_dataset = Dataset.from_pandas(subject_df)
    dataset_dict[subject] = subject_dataset
# The unfiltered frame is kept under "all".
dataset_dict["all"] = Dataset.from_pandas(df)
# Sample input reused by the demo calls in later cells.
example_question = "How can one differentiate a set and a dictionary in python?"
dataset_dict

In [None]:
def calculate_all(suffix: str):
    """Rebuild the combined ``all_<suffix>`` entry of ``dataset_dict``.

    Every entry whose key ends in ``_<suffix>`` (last ``_``-separated part
    equals ``suffix``) is concatenated and stored under ``"all_" + suffix``.
    The number of rows of the combined dataset and the current keys are printed.

    Args:
        suffix: Perturbation suffix, e.g. ``"naive"``.
    """
    combined_key = "all_" + suffix
    # Drop a previous aggregate first so it is not concatenated into itself.
    if combined_key in dataset_dict:
        del dataset_dict[combined_key]
    matching = [
        dataset_dict[name]
        for name in dataset_dict.keys()
        if name.split("_")[-1] == suffix
    ]
    dataset_dict[combined_key] = concatenate_datasets(matching)
    print(len(dataset_dict[combined_key]))
    pprint(dataset_dict.keys())

## Paraphrase Type: Naive Addition

In [None]:
def add_sentence_to_question(question: str) -> str:
    """Append the fixed, unrelated distractor sentence to ``question``.

    The distractor is separated from the question by a single space.
    """
    distractor = "Sebastian goes to buy icecream."
    return " ".join((question, distractor))


# Count the questions that already contain the distractor's key words.
for keyword in ("Sebastian", "icecream"):
    matching_rows = df[df['question'].str.contains(keyword)]
    print(f"Rows mentioning '{keyword}':", len(matching_rows))
# Show the transformation on the sample question (rendered as the cell output).
add_sentence_to_question(example_question)

In [None]:
def add_naive_sentence(row):
    """Row-wise adapter: replace ``row["question"]`` with its naive-addition variant.

    The row is modified in place and returned, as expected by ``apply(axis=1)``.
    """
    original_question = row["question"]
    row["question"] = add_sentence_to_question(original_question)
    return row

# Build one "<subject>_naive" split per subject by rewriting every question.
for subject in subjects:
    df_naive = pd.DataFrame(dataset_dict[subject]).progress_apply(add_naive_sentence, axis=1)
    dataset_dict[subject + "_naive"] = Dataset.from_pandas(df_naive)
pprint(list(dataset_dict.keys()))

In [None]:
# Combine every "*_naive" split into "all_naive" (prints its row count and the keys).
calculate_all("naive")

In [None]:
# NOTE(review): `upload` is not defined or imported in any cell visible in this
# notebook — confirm it exists in the kernel, otherwise this raises NameError
# on a fresh Restart & Run All.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Instruction sent ahead of every question when asking for a distractor sentence.
preprompt = "Give me one additional sentence to this question that has no direct effect on the question. The sentence should be about a similar topic. It should not be a question. Example additional sentences might focus on unnecessary details, add unnecessary information about other fields of study. Only output exactly the additional sentence and nothing else, the output will be copy/pasted as is. It is extremely important that the sentence does not effect the answer to the question."


def addition_prompt(question):
    """Build the full prompt: the instruction, the quoted question, and the answer cue."""
    return f"{preprompt}\nQuestion: \n'{question}'\nAdditional sentence:"

def add_additional_information(narrative: str, model_name=None) -> str:
    """Append one model-generated, answer-irrelevant sentence to ``narrative``.

    Sends ``addition_prompt(narrative)`` as a single user message through the
    notebook's ``client`` and appends the reply to the original text, separated
    by one space.

    Args:
        narrative: The question text to extend.
        model_name: Model to request. Defaults to the notebook-level ``model``,
            i.e. the model configured alongside ``client``; the previously
            hard-coded ``"gpt-4o"`` does not match that configuration.

    Returns:
        ``narrative`` followed by the generated sentence, or ``narrative``
        unchanged when the reply contains no text.
    """
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": addition_prompt(narrative),
            }
        ],
        model=model_name or model,
    )
    # `content` may be None; replies also tend to carry stray leading/trailing
    # whitespace or newlines that should not end up inside the question.
    result = (response.choices[0].message.content or "").strip()
    if not result:
        return narrative
    return narrative + " " + result

# Try the addition on the sample question (sends one request through `client`).
add_additional_information(example_question)

In [None]:
def add_additional_sentence(row):
    """Row-wise adapter: extend ``row["question"]`` with a generated extra sentence.

    The row is modified in place and returned, as expected by ``apply(axis=1)``.
    """
    original_question = row["question"]
    row["question"] = add_additional_information(original_question)
    return row

# Build one "<subject>_addition" split per subject (one model request per row).
for subject in subjects:
    df_additional = pd.DataFrame(dataset_dict[subject]).progress_apply(add_additional_sentence, axis=1)
    dataset_dict[subject + "_addition"] = Dataset.from_pandas(df_additional)
pprint(list(dataset_dict.keys()))

In [None]:
# Combine every "*_addition" split into "all_addition" (prints its row count and the keys).
calculate_all("addition")

In [None]:
# NOTE(review): `upload` is not defined or imported in any cell visible in this
# notebook — confirm it exists in the kernel, otherwise this raises NameError
# on a fresh Restart & Run All.
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [None]:
from concurrent.futures import ThreadPoolExecutor

def fetch_alternative(blanked, original_word):
    """Ask the model for a one-word replacement for ``original_word``.

    Args:
        blanked: Sequence of text pieces that, joined together, form the text
            with the target word replaced by ``<BLANK>``.
        original_word: The word that was blanked out.

    Returns:
        The lower-cased replacement word when the model produced one and the
        log-probability of its first generated token is at least -1;
        otherwise ``original_word`` unchanged.
    """
    # The caller may pass the word with trailing whitespace; keep that out of
    # the quoted word in the prompt.
    target = original_word.strip()
    prompt = f"Output exactly one word that fits where the placeholder '<BLANK>' is placed and has similar meaning to '{target}'. No other output besides the word.\nText: '{''.join(blanked)}'"
    response = client.chat.completions.create(
        # Use the model configured next to `client` instead of a placeholder name.
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        max_tokens=5,
        top_logprobs=5,
        logprobs=True,
        temperature=0.0,
    )
    choice = response.choices[0]
    token_logprobs = choice.logprobs.content if choice.logprobs else None
    candidate = (choice.message.content or "").lower().replace(".", "").strip()
    # Accept the suggestion only when the model is reasonably confident in it
    # (first-token log-probability >= -1, i.e. probability >= e^-1) and it is
    # non-empty; an empty candidate would silently delete the word.
    if token_logprobs and candidate and token_logprobs[0].logprob >= -1:
        return candidate
    return original_word


def paraphrase_with_second_best_threading(text):
    """Replace the adjectives of ``text`` with model-suggested alternatives.

    The text is tokenized with spaCy; for every token tagged ``ADJ`` a request
    is submitted to ``fetch_alternative`` on a thread pool, with that token
    replaced by ``<BLANK>``. All requests see the original text; replacements
    are applied afterwards.

    Args:
        text: Text to paraphrase.

    Returns:
        The text with each adjective replaced where an alternative was
        returned; original whitespace is preserved.
    """
    doc = nlp(text)
    words = [token.text_with_ws for token in doc]

    with ThreadPoolExecutor() as executor:
        futures = []
        for i, token in enumerate(doc):
            if token.pos_ != "ADJ":
                continue
            blanked = words.copy()
            # Keep the token's trailing whitespace so the placeholder does not
            # fuse with the following word ("<BLANK>knight").
            blanked[i] = "<BLANK>" + token.whitespace_
            futures.append((i, executor.submit(fetch_alternative, blanked, token.text)))

        for i, future in futures:
            original = doc[i].text
            alt = future.result().strip()
            # Suggestions come back lower-cased; restore a leading capital
            # (e.g. a sentence-initial adjective).
            if original[:1].isupper():
                alt = alt[:1].upper() + alt[1:]
            # Skip empty suggestions, which would delete the word.
            if alt and alt != original:
                words[i] = alt + doc[i].whitespace_

    return "".join(words)


# Demo sentence for the adjective-replacement paraphraser.
story = "Once upon a time, there was a brave knight who fought dragons."

# Get the paraphrased story (submits one fetch_alternative call per adjective).
paraphrased_story = paraphrase_with_second_best_threading(story)
print("Original Story:", story)
print("Paraphrased Story:", paraphrased_story)


In [None]:
def paraphrase_single_words(row):
    """Row-wise adapter: replace ``row["question"]`` with its adjective-swapped paraphrase.

    The row is modified in place and returned, as expected by ``apply(axis=1)``.
    """
    source_text = row["question"]
    row["question"] = paraphrase_with_second_best_threading(source_text)
    return row


# Build one "<subject>_lexicon" split per subject by paraphrasing every question.
for subject in subjects:
    df_lex = pd.DataFrame(dataset_dict[subject]).progress_apply(paraphrase_single_words, axis=1)
    dataset_dict[subject + "_lexicon"] = Dataset.from_pandas(df_lex)

dataset_dict.keys()

In [None]:
# Combine every "*_lexicon" split into "all_lexicon" (prints its row count and the keys).
calculate_all("lexicon")

In [None]:
# NOTE(review): `upload` is not defined or imported in any cell visible in this
# notebook — confirm it exists in the kernel, otherwise this raises NameError
# on a fresh Restart & Run All.
upload(dataset_dict, repo_id)

## Paraphrase Type: Syntax-Changes


In [None]:
def rephrase_sentence(sentence):
    """Reorder one sentence by moving all of its adverbs to the front.

    The sentence is parsed with spaCy. Tokens tagged ``ADV`` are collected, in
    order, ahead of the remaining tokens. Casing is normalized: tokens that
    match a proper noun or named-entity token keep that token's original
    spelling, the pronoun "I" is kept, everything else is lower-cased, and the
    first character of the result is upper-cased.

    Args:
        sentence: A single sentence.

    Returns:
        The reordered sentence, or ``""`` for empty input.
    """
    doc = nlp(sentence)
    # Map lower-cased text -> original spelling for proper nouns / entity
    # tokens. Restoring the original spelling (rather than str.capitalize)
    # keeps inner capitals intact, e.g. "NASA" or "McDonald".
    proper_nouns = {}
    for token in doc:
        if token.pos_ == "PROPN" or token.ent_type_:
            proper_nouns.setdefault(token.text.lower(), token.text)

    words = []
    adverbs = []

    for token in doc:
        lowered = token.text.lower()
        if lowered in proper_nouns:
            text = proper_nouns[lowered]
        elif token.text == "I":
            # The first-person pronoun stays capitalized.
            text = token.text
        else:
            text = lowered

        if token.pos_ == "ADV":
            # A moved adverb must end in whitespace, otherwise it fuses with
            # whatever follows it at the front ("Quicklyshe ...").
            adverbs.append(text + (token.whitespace_ or " "))
        else:
            words.append(text + token.whitespace_)

    reordered = "".join(adverbs + words)
    # Capitalize the first character of the sentence.
    if reordered:
        reordered = reordered[0].upper() + reordered[1:]

    return reordered

# Try the function on a sentence containing an adverb and proper nouns.
sentence = "Alice quickly ran to the store in New York."
rephrased = rephrase_sentence(sentence)
print("Original:", sentence)
print("Rephrased:", rephrased)

In [None]:
# Example multi-sentence paragraph (with line breaks) for the sentence-splitting demo.
paragraph = """Because it was raining, she stayed indoors. The weather had been unpredictable all week.
She decided to read a book, hoping the rain would stop. By evening, the sun came out.
"""

def split_paragraph_into_sentences(paragraph):
    """Split ``paragraph`` into sentences using spaCy's sentence boundaries.

    Returns:
        List with the ``text`` of each sentence span, in document order.
    """
    return [span.text for span in nlp(paragraph).sents]

# Split the example paragraph into sentences.
sentences = split_paragraph_into_sentences(paragraph)

# Print one sentence per line, prefixed with "#" to make the boundaries visible.
for sentence in sentences:
    print("#",sentence)


In [None]:
def rephrase_narrative(row):
    """Row-wise adapter: rephrase every sentence of ``row["question"]``.

    The question is split into sentences, each sentence is passed through
    ``rephrase_sentence``, and the results are joined back together. The row is
    modified in place and returned, as expected by ``apply(axis=1)``.
    """
    rephrased = [
        rephrase_sentence(sentence)
        for sentence in split_paragraph_into_sentences(row["question"])
    ]
    # The sentence texts carry no trailing whitespace, so joining on "" fused
    # consecutive sentences ("...indoors.The weather..."). Join on a single
    # space and drop pieces that are empty after stripping.
    row["question"] = " ".join(part.strip() for part in rephrased if part.strip())
    return row


# Build one "<subject>_syntax" split per subject by reordering every question.
for subject in subjects:
    df_syn = pd.DataFrame(dataset_dict[subject]).progress_apply(rephrase_narrative, axis=1)
    dataset_dict[subject + "_syntax"] = Dataset.from_pandas(df_syn)

dataset_dict.keys()

In [None]:
# Combine every "*_syntax" split into "all_syntax" (prints its row count and the keys).
calculate_all("syntax")

In [None]:
# NOTE(review): `upload` is not defined or imported in any cell visible in this
# notebook — confirm it exists in the kernel, otherwise this raises NameError
# on a fresh Restart & Run All.
upload(dataset_dict, repo_id)