In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Setup model connection
# `os` is imported explicitly: it was previously only in scope as a side effect
# of the star import below, which breaks as soon as that module stops exposing it.
import os

from dotenv import load_dotenv
from openai import OpenAI

# NOTE(review): the star import is presumably what provides the convert_* helpers
# and upload() used by later cells; prefer explicit names once the list is settled.
from converter.converter import *

# Load credentials (VLLM_API_KEY, and HF_TOKEN read later) from the repo-level .env.
load_dotenv("../.env")

# OpenAI-compatible client. The endpoint can be overridden with VLLM_BASE_URL;
# without it the previously hard-coded server address is used unchanged.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8086/v1"),
)
# Model identifier passed to the LLM-backed converters.
model = "meta-llama/Llama-3.3-70B-Instruct"

## Load original dataset

In [4]:
def get_subsets(dataset_name: str) -> list:
    """Return the config (subset) names of a Hugging Face dataset.

    Queries the datasets-server ``/splits`` endpoint and collects the
    ``config`` field of every entry in the response's ``splits`` list.

    Args:
        dataset_name: Hub dataset id, e.g. ``"maveriq/bigbenchhard"``.

    Returns:
        Config names in first-seen order, each listed once even when the
        server reports several splits for the same config.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import os

    import requests

    # Only authenticate when a token is configured; otherwise the header would
    # literally read "Bearer None".
    token = os.getenv("HF_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    response = requests.get(
        "https://datasets-server.huggingface.co/splits",
        params={"dataset": dataset_name},  # let requests URL-encode the id
        headers=headers,
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Surface HTTP errors directly instead of a confusing KeyError on "splits".
    response.raise_for_status()
    data = response.json()

    # The endpoint returns one entry per (config, split); de-duplicate while
    # preserving order so each subset is processed exactly once downstream.
    return list(dict.fromkeys(entry["config"] for entry in data["splits"]))

In [5]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

# Load spaCy for POS tagging
# The pipeline object is handed to the converters that take an `nlp` argument
# (addition, lexicon, syntax, scramble).
nlp = spacy.load("en_core_web_sm")

# Registers tqdm's progress_* methods on pandas objects.
# NOTE(review): presumably used inside the converter helpers — confirm.
tqdm.pandas()

# NOTE(review): `selected_split` is not referenced by any cell visible in this
# notebook; reset_dict() reads the "train" split. Confirm whether it is still needed.
selected_split = "test"
# Source dataset on the Hugging Face Hub.
dataset_name = "maveriq/bigbenchhard"
# Name of the derived dataset; combined with the username below into `repo_id`.
modified_dataset_name = "BBH-NoOp-Plus"

# Hub namespace and the full "<user>/<dataset>" id passed to upload().
hf_username = "LFrancis"
repo_id = f"{hf_username}/{modified_dataset_name}"

In [6]:
# Names of all configs (subsets) of the source dataset.
subsets = get_subsets(dataset_name)


def reset_dict(split: str = "train") -> DatasetDict:
    """Build a fresh ``DatasetDict`` holding only the original subsets.

    Each name in the module-level ``subsets`` list is loaded from
    ``dataset_name``, round-tripped through a pandas DataFrame and stored
    under its subset name.

    Args:
        split: Name of the split to read from every subset. Defaults to
            ``"train"``, the value that was previously hard-coded.

    Returns:
        A new ``DatasetDict`` with one entry per subset and no derived variants.
    """
    data_dict = DatasetDict()
    for s in subsets:
        d = load_dataset(dataset_name, s)[split]
        df = pd.DataFrame(d)
        data_dict[s] = Dataset.from_pandas(df)
    return data_dict


dataset_dict = reset_dict()
dataset_dict

DatasetDict({
    boolean_expressions: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    causal_judgement: Dataset({
        features: ['input', 'target'],
        num_rows: 187
    })
    date_understanding: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    disambiguation_qa: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    dyck_languages: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    formal_fallacies: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    geometric_shapes: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    hyperbaton: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    logical_deduction_five_objects: Dataset({
        features: ['input', 'target'],
        num_rows: 250
    })
    logical_deduction_seven_objects: Dataset({
        features: ['input',

## Paraphrase Type: Naive Addition

In [None]:
# Add a "<subset>_naive" entry for every original subset, produced by
# convert_naive from the subset's rows with the question text in "input".
for subset in subsets:
    variant_key = subset + "_naive"
    source_frame = pd.DataFrame(dataset_dict[subset])
    dataset_dict[variant_key] = convert_naive(source_frame, question_column="input")
# List every key now present (originals plus the new variants).
pprint(list(dataset_dict))

In [None]:
# Hand the dict (originals plus "_naive" variants) to upload() together with the
# target repo id and the original subset names.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import); what exactly it pushes is not visible here — confirm.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals so the next paraphrase type starts clean.
dataset_dict = reset_dict()

## Paraphrase Type: Addition

In [9]:
# Add a "<subset>_addition" entry for every original subset. convert_additional
# receives the subset's rows plus the LLM client, model name and spaCy pipeline.
for subset in subsets:
    variant_key = subset + "_addition"
    source_frame = pd.DataFrame(dataset_dict[subset])
    dataset_dict[variant_key] = convert_additional(
        source_frame,
        client,
        model,
        nlp,
        question_column="input",
    )
# List every key now present (originals plus the new variants).
pprint(list(dataset_dict))

100%|██████████| 250/250 [03:04<00:00,  1.35it/s]
100%|██████████| 187/187 [08:38<00:00,  2.77s/it]  
100%|██████████| 250/250 [03:14<00:00,  1.29it/s]
100%|██████████| 250/250 [03:06<00:00,  1.34it/s]
100%|██████████| 250/250 [03:01<00:00,  1.38it/s]
100%|██████████| 250/250 [03:30<00:00,  1.19it/s]
100%|██████████| 250/250 [02:59<00:00,  1.40it/s]
100%|██████████| 250/250 [02:51<00:00,  1.46it/s]
100%|██████████| 250/250 [03:30<00:00,  1.19it/s]
100%|██████████| 250/250 [03:34<00:00,  1.17it/s]
100%|██████████| 250/250 [03:22<00:00,  1.23it/s]
100%|██████████| 250/250 [03:26<00:00,  1.21it/s]
100%|██████████| 250/250 [03:15<00:00,  1.28it/s]
100%|██████████| 250/250 [02:55<00:00,  1.43it/s]
100%|██████████| 250/250 [02:44<00:00,  1.52it/s]
100%|██████████| 146/146 [02:02<00:00,  1.20it/s]
100%|██████████| 250/250 [03:10<00:00,  1.31it/s]
100%|██████████| 250/250 [03:19<00:00,  1.25it/s]
100%|██████████| 250/250 [03:51<00:00,  1.08it/s]
100%|██████████| 178/178 [02:14<00:00,  1.33it/s

['boolean_expressions',
 'causal_judgement',
 'date_understanding',
 'disambiguation_qa',
 'dyck_languages',
 'formal_fallacies',
 'geometric_shapes',
 'hyperbaton',
 'logical_deduction_five_objects',
 'logical_deduction_seven_objects',
 'logical_deduction_three_objects',
 'movie_recommendation',
 'multistep_arithmetic_two',
 'navigate',
 'object_counting',
 'penguins_in_a_table',
 'reasoning_about_colored_objects',
 'ruin_names',
 'salient_translation_error_detection',
 'snarks',
 'sports_understanding',
 'temporal_sequences',
 'tracking_shuffled_objects_five_objects',
 'tracking_shuffled_objects_seven_objects',
 'tracking_shuffled_objects_three_objects',
 'web_of_lies',
 'word_sorting',
 'boolean_expressions_addition',
 'causal_judgement_addition',
 'date_understanding_addition',
 'disambiguation_qa_addition',
 'dyck_languages_addition',
 'formal_fallacies_addition',
 'geometric_shapes_addition',
 'hyperbaton_addition',
 'logical_deduction_five_objects_addition',
 'logical_deduction_




In [10]:
# Hand the dict (originals plus "_addition" variants) to upload() together with
# the target repo id and the original subset names. The logged output of this
# cell shows one "Uploading <subset>_addition" step per subset.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import) — confirm its contract there.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals so the next paraphrase type starts clean.
dataset_dict = reset_dict()

Uploading boolean_expressions_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading causal_judgement_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading date_understanding_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading disambiguation_qa_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading dyck_languages_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading formal_fallacies_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading geometric_shapes_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading hyperbaton_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading logical_deduction_five_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading logical_deduction_seven_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading logical_deduction_three_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading movie_recommendation_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading multistep_arithmetic_two_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading navigate_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading object_counting_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading penguins_in_a_table_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading reasoning_about_colored_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading ruin_names_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading salient_translation_error_detection_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading snarks_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading sports_understanding_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading temporal_sequences_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading tracking_shuffled_objects_five_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading tracking_shuffled_objects_seven_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading tracking_shuffled_objects_three_objects_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading web_of_lies_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Uploading word_sorting_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

## Paraphrase Type: Lexicon-Changes


In [None]:
# Add a "<subset>_lexicon" entry for every original subset. convert_lexicon
# receives the subset's rows plus the LLM client, model name and spaCy pipeline.
for subset in subsets:
    variant_key = subset + "_lexicon"
    source_frame = pd.DataFrame(dataset_dict[subset])
    dataset_dict[variant_key] = convert_lexicon(
        source_frame,
        client,
        model,
        nlp,
        question_column="input",
    )
# List every key now present (originals plus the new variants).
pprint(list(dataset_dict))

In [None]:
# Hand the dict (originals plus "_lexicon" variants) to upload() together with
# the target repo id and the original subset names.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import); what exactly it pushes is not visible here — confirm.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals so the next paraphrase type starts clean.
dataset_dict = reset_dict()

## Paraphrase Type: Syntax-Changes


In [None]:
# Add a "<subset>_syntax" entry for every original subset. convert_syntax
# receives the subset's rows and the spaCy pipeline (no LLM client here).
for subset in subsets:
    variant_key = subset + "_syntax"
    source_frame = pd.DataFrame(dataset_dict[subset])
    dataset_dict[variant_key] = convert_syntax(source_frame, nlp, question_column="input")
# List every key now present (originals plus the new variants).
pprint(list(dataset_dict))

In [None]:
# Hand the dict (originals plus "_syntax" variants) to upload() together with
# the target repo id and the original subset names.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import); what exactly it pushes is not visible here — confirm.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals so the next paraphrase type starts clean.
dataset_dict = reset_dict()

## Paraphrase Type: Typo

In [None]:
# Add a "<subset>_typo" entry for every original subset, produced by
# convert_typo from the subset's rows with the question text in "input".
for subset in subsets:
    variant_key = subset + "_typo"
    source_frame = pd.DataFrame(dataset_dict[subset])
    dataset_dict[variant_key] = convert_typo(source_frame, question_column="input")
# List every key now present (originals plus the new variants).
pprint(list(dataset_dict))

In [None]:
# Hand the dict (originals plus "_typo" variants) to upload() together with the
# target repo id and the original subset names.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import); what exactly it pushes is not visible here — confirm.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals so the next paraphrase type starts clean.
dataset_dict = reset_dict()

## Paraphrase Type: Scramble

In [None]:
# Add a "<subset>_scramble" entry for every original subset. convert_scramble
# receives the subset's rows and the spaCy pipeline.
for subset in subsets:
    dataset_dict[subset + "_scramble"] = convert_scramble(pd.DataFrame(dataset_dict[subset]), nlp,
                                                          question_column="input")
# List the keys once. The former trailing bare `dataset_dict.keys()` expression
# only repeated this listing as a second cell output, unlike every sibling cell.
pprint(list(dataset_dict.keys()))

In [None]:
# Hand the dict (originals plus "_scramble" variants) to upload() together with
# the target repo id and the original subset names.
# NOTE(review): upload() is not defined in this notebook (presumably from the
# converter star import); what exactly it pushes is not visible here — confirm.
upload(dataset_dict, repo_id, subsets)
# Rebuild the dict from the originals, dropping the uploaded variants from memory.
dataset_dict = reset_dict()