In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded before each cell runs (useful while editing the local
# `converter` package alongside this notebook).
%load_ext autoreload
%autoreload 2

In [None]:
# Setup model connection
from dotenv import load_dotenv
import os
from openai import OpenAI

from converter.converter import *

# Load environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")

# OpenAI-compatible client used by the LLM-based converters below.
# The endpoint can be overridden through the VLLM_BASE_URL environment
# variable; when it is unset the original server address is used, so existing
# setups keep working unchanged.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8085/v1"),
)
# Model identifier passed to the converters together with `client`.
model = "meta-llama/Llama-3.1-8B-Instruct"

## Load original dataset

In [None]:
def get_subsets(dataset_name: str) -> list:
    """Return the config (subset) names of a Hugging Face dataset.

    Queries the datasets-server ``/splits`` endpoint and collects the
    ``config`` field of every returned split entry.

    Args:
        dataset_name: Hub identifier of the dataset, e.g. ``"user/dataset"``.

    Returns:
        The config names in the order they first appear in the response,
        without duplicates (the endpoint lists one entry per config/split
        pair, so a config with several splits would otherwise repeat).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response JSON has no ``"splits"`` entry.
    """
    import requests

    # Only send an Authorization header when a token is actually configured;
    # otherwise the literal string "Bearer None" would be sent.
    token = os.getenv("HF_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    API_URL = f"https://datasets-server.huggingface.co/splits?dataset={dataset_name}"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(API_URL, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(entry["config"] for entry in data["splits"]))

In [None]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

# Load spaCy for POS tagging; `nlp` is handed to the lexicon and syntax
# converters in later cells.
nlp = spacy.load("en_core_web_sm")

# Register tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()

# NOTE(review): `selected_split` is not referenced by any cell visible in this
# notebook, and reset_dict() reads the "train" split — confirm whether this
# value is still needed.
selected_split = "test"
# Source dataset on the Hugging Face Hub and the name of the derived dataset.
dataset_name = "maveriq/bigbenchhard"
modified_dataset_name = "BBH-NoOp-Plus"

# Target repository id in "<user>/<dataset>" form; passed to upload() below.
hf_username = "LFrancis"
repo_id = f"{hf_username}/{modified_dataset_name}"

In [None]:
# Config names of the source dataset; every later cell iterates over this list.
subsets = get_subsets(dataset_name)


def reset_dict():
    """Build a fresh DatasetDict holding the original data of every subset.

    For each name in the module-level ``subsets`` list, the "train" split of
    ``dataset_name`` is loaded, round-tripped through a pandas DataFrame and
    stored under the subset name.

    Returns:
        A new ``DatasetDict`` keyed by subset name. The global
        ``dataset_dict`` is not touched; callers assign the result themselves.
    """
    data_dict = DatasetDict()
    for s in subsets:
        # The "train" split is hard-coded here; `selected_split` is not consulted.
        d = load_dataset(dataset_name, s)["train"]
        df = pd.DataFrame(d)
        # Write into the local dict being built. The previous code assigned to
        # the global `dataset_dict`, which fails with NameError on a fresh
        # kernel and otherwise returned an empty DatasetDict.
        data_dict[s] = Dataset.from_pandas(df)
    return data_dict


dataset_dict = reset_dict()
dataset_dict

## Paraphrase Type: Naive Addition

In [None]:
# For every original subset, run convert_naive on its rows (question text in
# the "input" column) and store the result under "<subset>_naive".
for subset in subsets:
    source_df = pd.DataFrame(dataset_dict[subset])
    dataset_dict[f"{subset}_naive"] = convert_naive(source_df, question_column="input")
# Show which keys the dictionary now holds.
pprint(list(dataset_dict))

In [None]:
# Hand the current dictionary (originals plus the variants added above) to
# upload() together with the target repo id.
# NOTE(review): upload() is not defined in this notebook — presumably it comes
# from the `converter.converter` star import; confirm what it pushes.
upload(dataset_dict, repo_id)
# Replace dataset_dict with the result of reset_dict() before the next section.
dataset_dict = reset_dict()

## Paraphrase Type: Addition

In [None]:
# For every original subset, run the LLM-backed convert_additional on its rows
# (question text in the "input" column) and store the result under
# "<subset>_addition".
for subset in subsets:
    source_df = pd.DataFrame(dataset_dict[subset])
    dataset_dict[f"{subset}_addition"] = convert_additional(
        source_df, client, model, question_column="input"
    )
# Show which keys the dictionary now holds.
pprint(list(dataset_dict))

In [None]:
# Hand the current dictionary (originals plus the variants added above) to
# upload() together with the target repo id.
# NOTE(review): upload() is not defined in this notebook — presumably it comes
# from the `converter.converter` star import; confirm what it pushes.
upload(dataset_dict, repo_id)
# Replace dataset_dict with the result of reset_dict() before the next section.
dataset_dict = reset_dict()

## Paraphrase Type: Lexicon-Changes


In [None]:
# For every original subset, run convert_lexicon (given the LLM client, model
# name and spaCy pipeline) on its rows and store the result under
# "<subset>_lexicon".
for subset in subsets:
    source_df = pd.DataFrame(dataset_dict[subset])
    dataset_dict[f"{subset}_lexicon"] = convert_lexicon(
        source_df, client, model, nlp, question_column="input"
    )
# Show which keys the dictionary now holds.
pprint(list(dataset_dict))

In [None]:
# Hand the current dictionary (originals plus the variants added above) to
# upload() together with the target repo id.
# NOTE(review): upload() is not defined in this notebook — presumably it comes
# from the `converter.converter` star import; confirm what it pushes.
upload(dataset_dict, repo_id)
# Replace dataset_dict with the result of reset_dict() before the next section.
dataset_dict = reset_dict()

## Paraphrase Type: Syntax-Changes


In [None]:
# For every original subset, run convert_syntax (given the spaCy pipeline) on
# its rows and store the result under "<subset>_syntax".
for subset in subsets:
    source_df = pd.DataFrame(dataset_dict[subset])
    dataset_dict[f"{subset}_syntax"] = convert_syntax(
        source_df, nlp, question_column="input"
    )
# Show which keys the dictionary now holds.
pprint(list(dataset_dict))

In [None]:
# Hand the current dictionary (originals plus the variants added above) to
# upload() together with the target repo id.
# NOTE(review): upload() is not defined in this notebook — presumably it comes
# from the `converter.converter` star import; confirm what it pushes.
upload(dataset_dict, repo_id)
# Replace dataset_dict with the result of reset_dict() so the notebook ends
# with the dictionary rebuilt from the source dataset.
dataset_dict = reset_dict()