In [None]:
# Setup model connection
from dotenv import load_dotenv
import os
from openai import OpenAI

from converter.converter import *

# Load environment variables from the .env file one directory above the
# notebook's working directory.
load_dotenv("../.env")

# Fail fast when the key is missing: with api_key=None the OpenAI client falls
# back to the OPENAI_API_KEY environment variable, which could send a real
# OpenAI key to the self-hosted endpoint configured below.
vllm_api_key = os.getenv("VLLM_API_KEY")
if vllm_api_key is None:
    raise RuntimeError(
        "VLLM_API_KEY is not set; add it to ../.env or export it before running this notebook."
    )

# OpenAI-compatible client. The endpoint can be overridden through the
# VLLM_BASE_URL environment variable; the default is the original address.
client = OpenAI(
    api_key=vllm_api_key,
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8085/v1"),
)

# Model identifier handed to the LLM-backed converters further down.
model = "meta-llama/Llama-3.1-8B-Instruct"

## Load original dataset

In [None]:
from pprint import pprint

import pandas
import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from openai import OpenAI
from tqdm import tqdm


# spaCy's small English pipeline (used for POS tagging); it is handed to the
# lexicon and syntax converters later in the notebook.
nlp = spacy.load("en_core_web_sm")

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# Target repository id, later passed to `upload`.
hf_username = "LFrancis"
modified_dataset_name = "MMLU-NoOp-Plus"
repo_id = f"{hf_username}/{modified_dataset_name}"

# Fetch the chosen split of the "all" configuration and view it as a DataFrame.
selected_subset = "test"
dataset = load_dataset("cais/mmlu", "all")[selected_subset]
df = pd.DataFrame(dataset)

# One entry per subject (rows re-indexed from 0), in order of first appearance,
# plus an "all" entry holding the complete, unfiltered frame.
subjects = df['subject'].unique().tolist()
dataset_dict = DatasetDict()
for subject in subjects:
    subject_df = df.loc[df['subject'].eq(subject)].reset_index(drop=True)
    subject_dataset = Dataset.from_pandas(subject_df)
    dataset_dict[subject] = subject_dataset
dataset_dict["all"] = Dataset.from_pandas(df)
dataset_dict

In [None]:
def calculate_all(suffix: str) -> None:
    """Rebuild the aggregated ``all_<suffix>`` entry of the global ``dataset_dict``.

    Every entry whose key's last ``_``-separated token equals ``suffix``
    (e.g. ``"anatomy_naive"`` for ``suffix="naive"``) is concatenated in key
    order and stored under ``"all_" + suffix``. An aggregate already stored
    under that key is excluded from the inputs and replaced, so the cell can
    be re-run. The size of the new aggregate and the current keys are printed.

    Args:
        suffix: Variant tag that follows the final underscore of the keys to
            combine (without the underscore itself).

    Raises:
        ValueError: If no entry carries ``suffix``. This is checked before
            ``dataset_dict`` is touched, so an existing aggregate is kept.
    """
    aggregate_key = "all_" + suffix

    # Collect the inputs first; the stale aggregate must not feed into itself.
    matching = [
        split
        for key, split in dataset_dict.items()
        if key != aggregate_key and key.split("_")[-1] == suffix
    ]
    if not matching:
        raise ValueError(
            f"No entries ending in '_{suffix}' found in dataset_dict; nothing to concatenate."
        )

    # Build the new aggregate before dropping the old one, so a failure during
    # concatenation leaves dataset_dict unchanged.
    combined = concatenate_datasets(matching)

    # Remove-then-insert keeps the aggregate as the last key, as before.
    dataset_dict.pop(aggregate_key, None)
    dataset_dict[aggregate_key] = combined

    print(len(dataset_dict[aggregate_key]))
    # Printed as a list to match the key listings in the conversion cells.
    pprint(list(dataset_dict.keys()))

## Paraphrase Type: Naive Addition

In [None]:
# Store a "<subject>_naive" entry for every subject, produced by `convert_naive`
# from that subject's rows (`convert_naive` is not defined in this notebook;
# presumably it comes from the `converter` star import).
for subject in subjects:
    dataset_dict[f"{subject}_naive"] = convert_naive(
        pd.DataFrame(dataset_dict[subject])
    )
pprint(list(dataset_dict))

In [None]:
# Combine every "*_naive" entry into a single "all_naive" entry; prints its
# length and the resulting keys.
calculate_all("naive")

In [None]:
# Pass the whole dataset_dict (originals plus the naive variants) to `upload`
# for `repo_id`. `upload` is not defined in this notebook; presumably it comes
# from the `converter` star import and publishes the data (TODO confirm).
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:

# Store a "<subject>_addition" entry for every subject. `convert_additional`
# receives the subject's rows plus the LLM client and model name; it is not
# defined in this notebook (presumably from the `converter` star import).
for subject in subjects:
    dataset_dict[f"{subject}_addition"] = convert_additional(
        pd.DataFrame(dataset_dict[subject]),
        client,
        model,
    )
pprint(list(dataset_dict))

In [None]:
# Combine every "*_addition" entry into a single "all_addition" entry; prints
# its length and the resulting keys.
calculate_all("addition")

In [None]:
# Pass the whole dataset_dict (everything built so far, now including the
# addition variants) to `upload` for `repo_id`. `upload` is not defined in this
# notebook; presumably it comes from the `converter` star import (TODO confirm).
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [None]:
# Store a "<subject>_lexicon" entry for every subject. `convert_lexicon`
# receives the subject's rows, the LLM client, the model name and the spaCy
# pipeline; it is not defined in this notebook (presumably from the
# `converter` star import).
for subject in subjects:
    dataset_dict[f"{subject}_lexicon"] = convert_lexicon(
        pd.DataFrame(dataset_dict[subject]),
        client,
        model,
        nlp,
    )
dataset_dict.keys()

In [None]:
# Combine every "*_lexicon" entry into a single "all_lexicon" entry; prints its
# length and the resulting keys.
calculate_all("lexicon")

In [None]:
# Pass the whole dataset_dict (everything built so far, now including the
# lexicon variants) to `upload` for `repo_id`. `upload` is not defined in this
# notebook; presumably it comes from the `converter` star import (TODO confirm).
upload(dataset_dict, repo_id)

## Paraphrase Type: Syntax-Changes


In [None]:
# Store a "<subject>_syntax" entry for every subject. `convert_syntax` receives
# the subject's rows and the spaCy pipeline only (no LLM client); it is not
# defined in this notebook (presumably from the `converter` star import).
for subject in subjects:
    dataset_dict[f"{subject}_syntax"] = convert_syntax(
        pd.DataFrame(dataset_dict[subject]),
        nlp,
    )
dataset_dict.keys()

In [None]:
# Combine every "*_syntax" entry into a single "all_syntax" entry; prints its
# length and the resulting keys.
calculate_all("syntax")

In [None]:
# Pass the whole dataset_dict (originals plus all four variant families) to
# `upload` for `repo_id`. `upload` is not defined in this notebook; presumably
# it comes from the `converter` star import (TODO confirm).
upload(dataset_dict, repo_id)