In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Setup model connection
from dotenv import load_dotenv
import os
from openai import OpenAI

from converter.converter import *

# Pull credentials from the repository-level .env file into the environment.
load_dotenv("../.env")

# Fail fast when the key is missing. Passing api_key=None makes the OpenAI
# client fall back to the OPENAI_API_KEY environment variable, which would
# then be sent to the (plain-HTTP) vLLM endpoint below instead of raising.
if os.getenv("VLLM_API_KEY") is None:
    raise RuntimeError(
        "VLLM_API_KEY is not set; define it in ../.env or in the environment."
    )

# OpenAI-compatible client pointed at the vLLM server. The endpoint can be
# overridden with VLLM_BASE_URL; the default is the previously hard-coded URL.
client = OpenAI(
    api_key=os.environ["VLLM_API_KEY"],
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8085/v1"),
)

# Model identifier passed to the converter functions that call the LLM.
model = "meta-llama/Llama-3.1-8B-Instruct"

## Load original dataset

In [None]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

# --- Run configuration ---
selected_subset = "test"
modified_dataset_name = "GSM8k-NoOp-Plus"
hf_username = "LFrancis"
# Repository id handed to upload() in the cells below.
repo_id = f"{hf_username}/{modified_dataset_name}"

# spaCy pipeline used for POS tagging; passed to the lexicon and syntax converters.
nlp = spacy.load("en_core_web_sm")

# Register tqdm's progress-aware helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()

# Request the split directly instead of indexing the returned DatasetDict.
dataset = load_dataset("openai/gsm8k", "main", split=selected_subset)

# Tabular view of the original split; displayed as the cell output.
df = pd.DataFrame(dataset)
df

In [None]:
# Container for every dataset variant; "main" holds the unmodified data.
dataset_dict = DatasetDict({"main": Dataset.from_pandas(df)})
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
example_question = "How can one differentiate a set and a dictionary in python?"
dataset_dict

## Paraphrase Type: Naive Addition

In [None]:
# Derive the "main_naive" variant from a fresh DataFrame built from "main".
dataset_dict["main_naive"] = convert_naive(
    pd.DataFrame(dataset_dict["main"]),
)
# Show which variants exist so far.
pprint(list(dataset_dict))

In [None]:
# Hand the whole dataset_dict (including "main_naive") to upload() for repo_id.
# NOTE(review): presumably pushes to the Hugging Face Hub — confirm in converter.converter.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Derive the "main_addition" variant; this converter is given the LLM client and model name.
dataset_dict["main_addition"] = convert_additional(
    pd.DataFrame(dataset_dict["main"]),
    client,
    model,
)
# Show which variants exist so far.
pprint(list(dataset_dict))

In [None]:
# Hand the whole dataset_dict (now including "main_addition") to upload() for repo_id.
# NOTE(review): presumably pushes to the Hugging Face Hub — confirm in converter.converter.
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [None]:
# Derive the "main_lexicon" variant; this converter is given the LLM client,
# the model name and the spaCy pipeline.
dataset_dict["main_lexicon"] = convert_lexicon(
    pd.DataFrame(dataset_dict["main"]),
    client,
    model,
    nlp,
)
# Display the variant names collected so far.
dataset_dict.keys()

In [None]:
# Hand the whole dataset_dict (now including "main_lexicon") to upload() for repo_id.
# NOTE(review): presumably pushes to the Hugging Face Hub — confirm in converter.converter.
upload(dataset_dict, repo_id)

## Paraphrase Type: Syntax-Changes


In [None]:
# Derive the "main_syntax" variant; this converter only receives the spaCy
# pipeline (no LLM client or model).
dataset_dict["main_syntax"] = convert_syntax(
    pd.DataFrame(dataset_dict["main"]),
    nlp,
)
# Display the variant names collected so far.
dataset_dict.keys()

In [None]:
# Hand the whole dataset_dict (now including "main_syntax") to upload() for repo_id.
# NOTE(review): presumably pushes to the Hugging Face Hub — confirm in converter.converter.
upload(dataset_dict, repo_id)