In [1]:
import pandas as pd
import numpy as np
import json
import requests
import pyarrow.parquet as pq
import os
import glob
import textstat
import string
from collections import Counter
import math
import re

In [5]:
def convert_parquet_to_csv(parquet_file, csv_file=None):
    """Convert a Parquet file to CSV and return the path of the CSV written.

    Args:
        parquet_file: Path of the Parquet file to read.
        csv_file: Destination path for the CSV. When None, the destination is
            ``parquet_file`` with its extension replaced by ``.csv``.

    Returns:
        The path the CSV was written to.

    Progress messages and the size of the written file (in MB) are printed.
    """
    # Default destination: same location and stem as the source, '.csv' suffix.
    if csv_file is None:
        stem, _ext = os.path.splitext(parquet_file)
        csv_file = stem + '.csv'

    print(f"Reading {parquet_file}...")
    frame = pd.read_parquet(parquet_file)

    print(f"Writing to {csv_file}...")
    # index=False keeps the DataFrame index out of the CSV.
    frame.to_csv(csv_file, index=False)

    print(f"Conversion complete: {parquet_file} → {csv_file}")
    size_mb = os.path.getsize(csv_file) / (1024 * 1024)
    print(f"CSV file size: {size_mb:.2f} MB")

    return csv_file

In [6]:
# Disabled call: uncomment to (re)generate test-asdiv.csv from test-asdiv.parquet.
# convert_parquet_to_csv("test-asdiv.parquet")

# GSM8K

In [7]:
# Load train-gsm8k.csv into a DataFrame and preview its first five rows.
train_gsm8k = pd.read_csv("train-gsm8k.csv")
train_gsm8k.head(n=5)

Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...


In [8]:
# Load test-gsm8k.csv into a DataFrame and preview its first five rows.
test_gsm8k = pd.read_csv("test-gsm8k.csv")
test_gsm8k.head(n=5)

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."


# SVAMP

In [9]:
# Load train-svamp.csv into a DataFrame and preview its first five rows.
train_svamp = pd.read_csv("train-svamp.csv")
train_svamp.head(n=5)

Unnamed: 0,ID,Body,Question,Equation,Answer,Type,question_concat
0,chal-777,There are 87 oranges and 290 bananas in Philip...,How big is each group of bananas?,( 290.0 / 2.0 ),145,Common-Division,There are 87 oranges and 290 bananas in Philip...
1,chal-508,Marco and his dad went strawberry picking. Mar...,How much did Marco's strawberries weigh?,( 30.0 - 11.0 ),19,Subtraction,Marco and his dad went strawberry picking. Mar...
2,chal-896,Edward spent $ 6 to buy 2 books each book cost...,How much did each book cost?,( 6.0 / 2.0 ),3,Common-Division,Edward spent $ 6 to buy 2 books each book cost...
3,chal-923,Frank was reading through his favorite book. T...,How many pages are in each chapter?,( 594.0 / 3.0 ),198,Common-Division,Frank was reading through his favorite book. T...
4,chal-34,There were 78 dollars in Olivia's wallet. She ...,How much money does she have left?,( 78.0 - 15.0 ),63,Subtraction,There were 78 dollars in Olivia's wallet. She ...


In [10]:
# Load test-svamp.csv into a DataFrame and preview its first five rows.
test_svamp = pd.read_csv("test-svamp.csv")
test_svamp.head(n=5)

Unnamed: 0,ID,Body,Question,Equation,Answer,Type,question_concat
0,chal-736,Winter is almost here and most animals are mig...,How many more bird families flew away to afric...,( 62.0 - 35.0 ),27,Subtraction,Winter is almost here and most animals are mig...
1,chal-162,Paige raised 7 goldfish and 12 catfish in the ...,How many fishes disappeared?,( ( 7.0 + 12.0 ) - 15.0 ),4,Subtraction,Paige raised 7 goldfish and 12 catfish in the ...
2,chal-349,Marco and his dad went strawberry picking. Tog...,How much did his dad's strawberries weigh now?,( ( 22.0 - 36.0 ) + 30.0 ),16,Addition,Marco and his dad went strawberry picking. Tog...
3,chal-390,Debby bought 200 water bottles and 256 soda bo...,How many days would the soda bottles last?,( 256.0 / 4.0 ),64,Common-Division,Debby bought 200 water bottles and 256 soda bo...
4,chal-781,There were 106 dollars in Olivia's wallet. Aft...,How much did she spend at the supermarket?,( ( 106.0 - 26.0 ) - 49.0 ),31,Subtraction,There were 106 dollars in Olivia's wallet. Aft...


# ASDiv

In [11]:
# Load train-asdiv.csv into a DataFrame and preview its first five rows.
train_asdiv = pd.read_csv("train-asdiv.csv")
train_asdiv.head(n=5)

Unnamed: 0,Question,Numbers,Equation,Answer,group_nums,Body,Ques,Type,Variation Type,id
0,Bryan took a look at his books as well . If Br...,[56. 9.],['*' 'number0' 'number1'],504.0,[12 13 14 18 19 20 27 28 29],Bryan took a look at his books as well . If Br...,how many books does he have in total ?,,,0
1,"For the fifth grade play , the chairs have bee...",[27. 16.],['*' 'number0' 'number1'],432.0,[11 12 13 14 15 16 28 29 30],"For the fifth grade play , the chairs have bee...",How many chairs have been put out for the play ?,,,1
2,There are number0 short trees and number1 tall...,[41. 44. 57.],['+' 'number0' 'number2'],98.0,[ 1 2 3 5 6 7 17 18 19 33 34 35],There are number0 short trees and number1 tall...,How many short trees will the park have when t...,,,2
3,Conner has number0 dollars in his bank account...,[2.5e+04 1.5e+03 8.0e+00],['-' 'number0' '*' 'number1' 'number2'],13000.0,[ 1 2 3 12 13 14 34 35 36 34 35 36],Conner has number0 dollars in his bank account...,How much money will Conner have in his account...,,,3
4,There are number0 dogwood trees currently in t...,[34. 49.],['+' 'number0' 'number1'],83.0,[ 1 2 3 13 14 15 29 30 31],There are number0 dogwood trees currently in t...,How many dogwood trees will the park have when...,,,4


In [12]:
# Load test-asdiv.csv into a DataFrame and preview its first five rows.
test_asdiv = pd.read_csv("test-asdiv.csv")
test_asdiv.head(n=5)

Unnamed: 0,Question,Numbers,Equation,Answer,group_nums,Body,Ques,Type,Variation Type,id
0,julia played tag with number0 kids on monday ....,[18. 10.],['-' 'number0' 'number1'],8.0,[ 1 2 3 4 5 6 12 13 14 28 29 30],julia played tag with number0 kids on monday ....,how many more kids did she play with on monday...,Subtraction,[11],0
1,julia played tag with number0 kids on monday ....,[11. 12.],['-' 'number1' 'number0'],1.0,[ 1 2 3 4 5 6 12 13 14 28 29 30],julia played tag with number0 kids on monday ....,how many more kids did she play with on tuesda...,Subtraction,[11],1
2,julia played tag with number0 kids on monday ....,[ 5. 15.],['-' 'number1' 'number0'],10.0,[34 3 4 5 35 36 10 14 25 26 27],julia played tag with number0 kids on monday ....,how many kids did she play with on tuesday ?,Subtraction,[23],2
3,julia played tag with some kids on monday . sh...,[14. 16.],['-' 'number1' 'number0'],2.0,[33 34 35 10 12 13 14 24 25 26],julia played tag with some kids on monday . sh...,how many kids did she play with on monday ?,Subtraction,[23],3
4,julia played tag with number0 kids on monday ....,[16. 12.],['-' 'number0' 'number1'],4.0,[ 3 4 5 6 13 14 15 16 18 28 29 30],julia played tag with number0 kids on monday ....,how many kids did she play with on tuesday ?,Subtraction,[11 23],4
