In [4]:
# Set up the model connection
import os

from dotenv import load_dotenv
from openai import OpenAI

from converter.converter import *

# Load environment variables (including VLLM_API_KEY) from the .env file one
# directory above the notebook.
load_dotenv("../.env")

# `os` is imported explicitly above: the original cell only worked if the star
# import from converter.converter happened to leak `os` into the namespace.
# The endpoint can be overridden through VLLM_BASE_URL; when that variable is
# unset the previously hard-coded address is used, so existing runs are unchanged.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8098/v1"),
)

# Model identifier handed to the LLM-backed converters together with `client`.
model = "meta-llama/Llama-3.3-70B-Instruct"

## Load original dataset

In [5]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

# spaCy pipeline for POS tagging; handed to several converters as `nlp`.
nlp = spacy.load("en_core_web_sm")

# Enable tqdm's progress-bar integration for pandas.
tqdm.pandas()

# Source data: one split of the combined ("all") MMLU configuration.
selected_split = "test"
dataset = load_dataset("cais/mmlu", "all")[selected_split]

# Destination repository on the Hugging Face Hub: <user>/<dataset name>.
hf_username = "LFrancis"
modified_dataset_name = "MMLU-NoOp-Plus"
repo_id = f"{hf_username}/{modified_dataset_name}"

# Start the collection with the unmodified data under the key "all";
# each converted variant is added next to it under its own key.
df = pd.DataFrame(dataset)
dataset_dict = DatasetDict({"all": Dataset.from_pandas(df)})
dataset_dict

DatasetDict({
    all: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 14042
    })
})

## Paraphrase Type: Naive Addition

In [6]:
# Naive-addition variant: run the converter on a DataFrame built from the
# base "all" split and store the result under its own key.
dataset_dict["all_naive"] = convert_naive(
    pd.DataFrame(dataset_dict["all"]),
)
# List the variants collected so far.
pprint([*dataset_dict.keys()])

Rows mentioning Sebastian: 0
Rows mentioning 'icecream': 0


100%|██████████| 14042/14042 [00:00<00:00, 37133.60it/s]

['all', 'all_naive']





In [7]:
# Send the current contents of dataset_dict to the Hub repository `repo_id`.
# `upload` is not defined in this notebook (presumably it comes from the
# converter.converter star import - confirm).
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


## Paraphrase Type: Addition

In [8]:
# Addition variant: the converter receives the chat client and model name in
# addition to the spaCy pipeline. The recorded run took over 13 hours for
# 14042 rows, so avoid re-running this cell casually.
dataset_dict["all_addition"] = convert_additional(
    pd.DataFrame(dataset_dict["all"]),
    client,
    model,
    nlp,
)
# List the variants collected so far.
pprint([*dataset_dict.keys()])

100%|██████████| 14042/14042 [13:19:10<00:00,  3.41s/it]        

['all', 'all_naive', 'all_addition']





In [9]:
# Publish dataset_dict to `repo_id` again now that "all_addition" exists.
# NOTE(review): the recorded output shows every key being re-sent, with the
# Hub reporting the earlier ones as unchanged.
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

## Paraphrase Type: Lexicon-Changes


In [10]:
# Lexicon-changes variant: like the addition converter, this one is given the
# chat client, the model name and the spaCy pipeline.
dataset_dict["all_lexicon"] = convert_lexicon(
    pd.DataFrame(dataset_dict["all"]),
    client,
    model,
    nlp,
)
# Display the keys of all variants collected so far.
dataset_dict.keys()

100%|██████████| 14042/14042 [1:00:05<00:00,  3.89it/s]


dict_keys(['all', 'all_naive', 'all_addition', 'all_lexicon'])

In [11]:
# Publish dataset_dict to `repo_id` after adding "all_lexicon".
# NOTE(review): per the recorded output, all keys are sent on each call and
# the previously published ones are skipped as unmodified.
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

## Paraphrase Type: Syntax-Changes


In [12]:
# Syntax-changes variant: needs only the spaCy pipeline, no model calls.
dataset_dict["all_syntax"] = convert_syntax(
    pd.DataFrame(dataset_dict["all"]),
    nlp,
)
# Display the keys of all variants collected so far.
dataset_dict.keys()

100%|██████████| 14042/14042 [02:37<00:00, 89.14it/s] 


dict_keys(['all', 'all_naive', 'all_addition', 'all_lexicon', 'all_syntax'])

In [13]:
# Publish dataset_dict to `repo_id` after adding "all_syntax".
# NOTE(review): the recorded output shows earlier variants being re-sent and
# reported as unchanged by the Hub.
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

## Paraphrase Type: Typo

In [16]:
# Typo variant: the converter takes only the DataFrame of the base split.
dataset_dict["all_typo"] = convert_typo(
    pd.DataFrame(dataset_dict["all"]),
)
# Display the keys of all variants collected so far.
dataset_dict.keys()

100%|██████████| 14042/14042 [00:01<00:00, 12616.28it/s]


dict_keys(['all', 'all_naive', 'all_addition', 'all_lexicon', 'all_syntax', 'all_typo'])

In [17]:
# Publish dataset_dict to `repo_id` after adding "all_typo".
# NOTE(review): every key is sent again according to the recorded output;
# unchanged ones are skipped on the Hub side.
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_typo


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

## Paraphrase Type: Scramble

In [18]:
# Scramble variant: built from the base split with the spaCy pipeline.
dataset_dict["all_scramble"] = convert_scramble(
    pd.DataFrame(dataset_dict["all"]),
    nlp,
)
# Display the keys of all variants collected so far.
dataset_dict.keys()

100%|██████████| 14042/14042 [01:06<00:00, 211.20it/s]


dict_keys(['all', 'all_naive', 'all_addition', 'all_lexicon', 'all_syntax', 'all_typo', 'all_scramble'])

In [19]:
# Publish dataset_dict to `repo_id` after adding "all_scramble".
# NOTE(review): as in the earlier upload cells, the recorded output lists all
# keys, with the already-published ones reported as unmodified.
upload(dataset_dict, repo_id)

Uploading all


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_naive


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_syntax


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_typo


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading all_scramble


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]