In [20]:
# Turn on IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded"
# notice; that message is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Setup model connection
import os

from dotenv import load_dotenv
from openai import OpenAI

# Import the project helpers by name rather than with `import *`, so it is
# obvious where each function used further down comes from and nothing else
# leaks into the notebook namespace. Extend this list if more helpers from
# converter.converter are needed.
from converter.converter import (
    convert_additional,
    convert_lexicon,
    convert_naive,
    convert_syntax,
    upload,
)

# Secrets and optional overrides live in the repository-level .env file.
load_dotenv("../.env")

# OpenAI-compatible client. The endpoint can be redirected by setting
# VLLM_BASE_URL; when unset, the previously hard-coded address is used.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url=os.getenv("VLLM_BASE_URL", "http://134.76.18.30:8082/v1"),
)

# Model identifier handed to the LLM-backed converters.
model = "meta-llama/Llama-3.3-70B-Instruct"

## Load original dataset

In [22]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import Dataset, DatasetDict, load_dataset
from tqdm import tqdm

# --- Hugging Face Hub target for the modified dataset ----------------------
hf_username = "LFrancis"
modified_dataset_name = "GSM8k-NoOp-Plus"
repo_id = "/".join((hf_username, modified_dataset_name))

# --- NLP tooling -----------------------------------------------------------
# spaCy pipeline (POS tagging); passed to several converters further down.
nlp = spacy.load("en_core_web_sm")

# Hook tqdm into pandas so progress bars are available on DataFrame ops.
tqdm.pandas()

# --- Original GSM8K data ---------------------------------------------------
selected_split = "test"
dataset = load_dataset("openai/gsm8k", "main")[selected_split]

# Tabular view of the chosen split; displayed as the cell result.
df = pd.DataFrame(dataset)
df

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."
...,...,...
1314,John had a son James when he was 19. James is...,Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...
1315,There are some oranges in a basket. Ana spends...,There are 60 minutes in an hour. Ana peels an ...
1316,Mark's car breaks down and he needs to get a n...,The discount on the radiator was 400*.8=$<<400...
1317,"Farmer Brown has 20 animals on his farm, all e...",Let C be the number of chickens.\nThere are 20...


In [9]:
# Collect every variant in one DatasetDict, starting with the unmodified
# questions under the "main" key.
dataset_dict = DatasetDict({"main": Dataset.from_pandas(df)})
dataset_dict

DatasetDict({
    main: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

## Paraphrase Type: Naive Addition

In [None]:
# Derive the naive-addition variant from a fresh DataFrame copy of "main"
# and list the splits collected so far.
dataset_dict["main_naive"] = convert_naive(
    pd.DataFrame(dataset_dict["main"])
)
pprint(list(dataset_dict))

In [None]:
# Publish the splits currently held in dataset_dict to the repository named
# by repo_id. `upload` is a project helper that is not defined in this
# notebook.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Derive the addition variant from a fresh DataFrame copy of "main"; this
# converter also receives the LLM client, the model name and the spaCy
# pipeline. Afterwards list the splits collected so far.
dataset_dict["main_addition"] = convert_additional(
    pd.DataFrame(dataset_dict["main"]),
    client,
    model,
    nlp,
)
pprint(list(dataset_dict))

  2%|▏         | 24/1319 [00:17<15:41,  1.38it/s]

In [19]:
# Publish the splits currently held in dataset_dict to the repository named
# by repo_id. Judging by the recorded output, every split is pushed in turn,
# including ones that were already uploaded by an earlier cell.
upload(dataset_dict, repo_id)

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_addition


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


## Paraphrase Type: Lexicon-Changes


In [12]:
# Derive the lexicon-change variant from a fresh DataFrame copy of "main";
# this converter also receives the LLM client, the model name and the spaCy
# pipeline. The cell result shows the splits collected so far.
dataset_dict["main_lexicon"] = convert_lexicon(
    pd.DataFrame(dataset_dict["main"]),
    client,
    model,
    nlp,
)
dataset_dict.keys()

100%|██████████| 1319/1319 [04:47<00:00,  4.58it/s]


dict_keys(['main', 'main_lexicon'])

In [13]:
# Publish the splits currently held in dataset_dict to the repository named
# by repo_id. Judging by the recorded output, every split is pushed in turn.
upload(dataset_dict, repo_id)

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_lexicon


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Paraphrase Type: Syntax-Changes


In [None]:
# Derive the syntax-change variant from a fresh DataFrame copy of "main";
# this converter only needs the spaCy pipeline. The cell result shows the
# splits collected so far.
dataset_dict["main_syntax"] = convert_syntax(
    pd.DataFrame(dataset_dict["main"]),
    nlp,
)
dataset_dict.keys()

In [None]:
# Publish the splits currently held in dataset_dict to the repository named
# by repo_id. `upload` is a project helper that is not defined in this
# notebook.
upload(dataset_dict, repo_id)