In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the imported `converter` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Setup model connection
# `os` is imported explicitly: `os.getenv` below previously resolved only if
# the star import from `converter.converter` happened to re-export `os`.
import os

from dotenv import load_dotenv
from openai import OpenAI

# NOTE(review): the star import is presumably what provides the convert_*,
# upload and typo/scramble helpers called in later cells (nothing else in this
# notebook defines them), so it is kept as-is.
from converter.converter import *

# Load environment variables from the .env file one directory above the
# notebook, so the API key does not have to be hardcoded here.
load_dotenv("../.env")

# OpenAI-compatible client pointed at a fixed endpoint; the key is read from
# the VLLM_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("VLLM_API_KEY"),
    base_url="http://134.76.18.30:8080/v1"
)
# Model identifier handed to the LLM-backed converters in later cells.
model = "meta-llama/Llama-3.3-70B-Instruct"

## Load original dataset

In [5]:
from pprint import pprint

import pandas as pd
import spacy
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm

# --- Target repository for the derived dataset ---
hf_username = "LFrancis"
modified_dataset_name = "GSM8k-NoOp-Plus"
repo_id = "/".join((hf_username, modified_dataset_name))

# --- NLP tooling ---
# spaCy pipeline used for POS tagging; handed to several converters below.
nlp = spacy.load("en_core_web_sm")

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# --- Source data: one split of the "main" configuration of openai/gsm8k ---
selected_split = "test"
gsm8k_splits = load_dataset("openai/gsm8k", "main")
dataset = gsm8k_splits[selected_split]

# Tabular view of the selected split; displayed as the cell output.
df = pd.DataFrame(dataset)
df

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."
...,...,...
1314,John had a son James when he was 19. James is...,Dora is 12-3=<<12-3=9>>9\nSo James is 9*2=<<9*...
1315,There are some oranges in a basket. Ana spends...,There are 60 minutes in an hour. Ana peels an ...
1316,Mark's car breaks down and he needs to get a n...,The discount on the radiator was 400*.8=$<<400...
1317,"Farmer Brown has 20 animals on his farm, all e...",Let C be the number of chickens.\nThere are 20...


In [6]:
# Container for every variant of the benchmark; the unmodified questions are
# stored under the "main" key and each transformation adds its own key later.
dataset_dict = DatasetDict({"main": Dataset.from_pandas(df)})
dataset_dict

DatasetDict({
    main: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

## Paraphrase Type: Naive Addition

In [None]:
# Build the naive-addition variant from a freshly constructed DataFrame of the
# original questions and register it under "main_naive".
naive_source_df = pd.DataFrame(dataset_dict["main"])
dataset_dict["main_naive"] = convert_naive(naive_source_df)

# Show which variants exist so far.
pprint(list(dataset_dict))

In [None]:
# Publish the variants collected so far under `repo_id`.
# NOTE(review): `upload` is not defined in this notebook (presumably provided
# by the converter star import); its exact behaviour should be confirmed there.
upload(dataset_dict, repo_id)

## Paraphrase Type: Addition

In [None]:
# Build the addition variant; this converter is given the LLM client, the
# model name and the spaCy pipeline alongside the original questions.
addition_source_df = pd.DataFrame(dataset_dict["main"])
dataset_dict["main_addition"] = convert_additional(
    addition_source_df,
    client,
    model,
    nlp,
)

# Show which variants exist so far.
pprint(list(dataset_dict))

In [None]:
# Publish the current state of `dataset_dict` (now including "main_addition")
# under `repo_id`.
# NOTE(review): the whole dict is passed, so earlier variants are presumably
# re-sent as well — confirm in `upload`.
upload(dataset_dict, repo_id)

## Paraphrase Type: Lexicon-Changes


In [None]:
# Build the lexicon-change variant; like the addition converter, this one takes
# the LLM client, the model name and the spaCy pipeline.
lexicon_source_df = pd.DataFrame(dataset_dict["main"])
dataset_dict["main_lexicon"] = convert_lexicon(
    lexicon_source_df,
    client,
    model,
    nlp,
)

# Display the variant names collected so far.
dataset_dict.keys()

In [None]:
# Publish the current state of `dataset_dict` (now including "main_lexicon")
# under `repo_id`.
upload(dataset_dict, repo_id)

## Paraphrase Type: Syntax-Changes


In [None]:
# Build the syntax-change variant; this converter only receives the spaCy
# pipeline, not the LLM client or model name.
syntax_source_df = pd.DataFrame(dataset_dict["main"])
dataset_dict["main_syntax"] = convert_syntax(syntax_source_df, nlp)

# Display the variant names collected so far.
dataset_dict.keys()

In [None]:
# Publish the current state of `dataset_dict` (now including "main_syntax")
# under `repo_id`.
upload(dataset_dict, repo_id)

## Paraphrase Type: Typo

In [22]:
# Preview the typo transformation on the first original question. The row is
# read directly from the Dataset rather than via an intermediate DataFrame.
question = dataset_dict["main"][0]["question"]
mistype_question(question)

"Janet’s ducks lay 16 egg peer day. She eats ghreee for breakfawt every moninng and bakes muffins for her friends every day with four. She sells thr remainser at tge farmwrs' market daily for $2 per resh duck gg. How much i dollars does she maake every day at thee farmers' market?"

In [23]:
# Typo transformations to apply; each one is stored as its own
# "main_<transformation name>" entry in dataset_dict.
TYPO_TRANSFORMATIONS = (
    "typo_qwerty",
    "typo_doubling",
    "typo_deletion",
    "typo_hold_down",
    "typo_add_random",
)

for typo_type in TYPO_TRANSFORMATIONS:
    # Rebuild the DataFrame on every pass so each transformation starts from
    # the original questions, whatever convert_transformation does to its input.
    typo_source_df = pd.DataFrame(dataset_dict["main"])
    dataset_dict[f"main_{typo_type}"] = convert_transformation(typo_source_df, typo_type)

# Display the variant names collected so far.
dataset_dict.keys()

100%|██████████| 1319/1319 [00:00<00:00, 19608.72it/s]
100%|██████████| 1319/1319 [00:00<00:00, 28406.68it/s]
100%|██████████| 1319/1319 [00:00<00:00, 22282.54it/s]
100%|██████████| 1319/1319 [00:00<00:00, 11995.39it/s]
100%|██████████| 1319/1319 [00:00<00:00, 14325.93it/s]


dict_keys(['main', 'main_typo_qwerty', 'main_typo_doubling', 'main_typo_deletion', 'main_typo_hold_down', 'main_typo_add_random'])

In [24]:
# Publish every entry of `dataset_dict` under `repo_id`. The recorded output
# below shows one "Uploading <key>" step per entry, including the unchanged
# "main" entry, for which the commit was skipped as nothing had been modified.
upload(dataset_dict, repo_id)

Uploading main


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading main_typo_qwerty


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading main_typo_doubling


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Uploading main_typo_deletion


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Uploading main_typo_hold_down


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Uploading main_typo_add_random


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

In [21]:
# Quick check of a single typo transformation on a short hand-written sentence.
# NOTE(review): the execution count (21) is lower than that of the cells above
# it (22-24), so this cell was run out of order; the output is presumably
# random and may differ on a fresh Restart & Run All — confirm.
typo_qwerty("hello everyone, whats going on")

'bello eceryone, wyats yoinf on'

## Paraphrase Type: Scramble

In [None]:
# Preview the scramble transformation on the first original question. The row
# is read directly from the Dataset rather than via an intermediate DataFrame.
question = dataset_dict["main"][0]["question"]
scramble_question(question)

In [None]:
# Build the scramble variant through the generic transformation converter,
# selecting the transformation by name.
scramble_source_df = pd.DataFrame(dataset_dict["main"])
dataset_dict["main_scramble"] = convert_transformation(
    scramble_source_df,
    "scramble_question",
)

# Display the variant names collected so far.
dataset_dict.keys()

In [None]:
# Publish the current state of `dataset_dict` (now including "main_scramble")
# under `repo_id`.
upload(dataset_dict, repo_id)