## Choices
* Only use type 1, 4, 8

In [1]:
import json
import copy
from transformers import AutoTokenizer
import datasets

model = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

splits = ['dev', 'test', 'train']
data = []
for split in splits:
    with open(f"raw/NumGLUE_{split}.json", "r") as f:
        for line in f:
            tmp = json.loads(line)
            tmp['split'] = split
            data.append(tmp)  

In [3]:

import pandas as pd
acceptable_types = ["Type_1", "Type_4", "Type_8"]
df = pd.DataFrame(data)  
df = df[df['answer'].apply(lambda x: isinstance(x, int))]
df = df[df['type'] != 'Type_2']
df['answer'] = df['answer'].map(lambda x: " ".join(list(str(x))))
df = df.reset_index(drop=True)
important_columns = ["question","answer","type","split"]
df = df[important_columns]

In [4]:
df.head()

Unnamed: 0,question,answer,type,split
0,A 8 minutes scene from an hour long movie was ...,5 2,Type_1,dev
1,"For an entire month of March, Josiah purchased...",9 9 2,Type_1,dev
2,Willow played football for 60 minutes and then...,2,Type_1,dev
3,Nora paid for his watch in dimes. If the cost ...,9 0,Type_1,dev
4,The newly constructed sports stadium is 62 yar...,1 8 6,Type_1,dev


In [5]:
import datasets
import pandas as pd
tokenized_df= df
tmp = pd.DataFrame(list(df['question'].map(lambda x: tokenizer(x))))
tokenized_df['input_ids'] = tmp['input_ids']
tokenized_df['attention_mask'] = tmp['attention_mask']
tokenized_df['labels'] = pd.DataFrame(list(df['answer'].map(lambda x: tokenizer(x))))['input_ids']

In [6]:
df.head()

Unnamed: 0,question,answer,type,split,input_ids,attention_mask,labels
0,A 8 minutes scene from an hour long movie was ...,5 2,Type_1,dev,"[32, 220, 23, 4420, 6109, 504, 458, 6460, 1293...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[20, 220, 17]"
1,"For an entire month of March, Josiah purchased...",9 9 2,Type_1,dev,"[2461, 458, 4453, 2254, 315, 5470, 11, 27878, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[24, 220, 24, 220, 17]"
2,Willow played football for 60 minutes and then...,2,Type_1,dev,"[9945, 363, 6342, 8964, 369, 220, 21, 15, 4420...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[17]
3,Nora paid for his watch in dimes. If the cost ...,9 0,Type_1,dev,"[45, 6215, 7171, 369, 806, 3736, 304, 294, 173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[24, 220, 15]"
4,The newly constructed sports stadium is 62 yar...,1 8 6,Type_1,dev,"[785, 13631, 20346, 9833, 23889, 374, 220, 21,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[16, 220, 23, 220, 21]"


In [7]:
df.to_json('tokenized/data.json', orient='records', indent=4)

In [8]:
#need to clean up the nones. 
df[df['split'] == 'train'].head()

Unnamed: 0,question,answer,type,split,input_ids,attention_mask,labels
680,A football team practices for 6 hours daily. T...,3 6,Type_1,train,"[32, 8964, 2083, 12378, 369, 220, 21, 4115, 72...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[18, 220, 21]"
681,A shopkeeper has 7 decks of playing cards. How...,1 8 2,Type_1,train,"[32, 8061, 18861, 702, 220, 22, 29781, 315, 56...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[16, 220, 23, 220, 17]"
682,The hiking team needs to arrange gloves for ev...,8 6,Type_1,train,"[785, 37364, 2083, 3880, 311, 30893, 35416, 36...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[23, 220, 21]"
683,Mateo works everyday and gets his salary of 79...,3 3 9,Type_1,train,"[96642, 78, 4278, 17778, 323, 5221, 806, 16107...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[18, 220, 18, 220, 24]"
684,Thomas spends 4k dollars every year on his car...,4 0,Type_1,train,"[41393, 37102, 220, 19, 74, 11192, 1449, 1042,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[19, 220, 15]"


In [20]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

class NumGlueDataset(Dataset):
    def __init__(self, split):
        assert split in ['dev', 'test','train'], "no valid split, please choose from dev, test, train."
        df = pd.read_json("tokenized/data.json")
        print(df.isna().any())
        splits = {name:group for name, group in df.groupby("split")}
        self.data = splits[split]
        
    def __len__(self):
        print(len(self.data))
        return len(self.data)
    
    def __getitem__(self, idx):
        print(idx)
        self.data = self.data[['input_ids', 'attention_mask', 'labels']]
        point = self.data.iloc[idx] 
        dict_form = point.to_dict()
        ret_form = {}
        for key, val in dict_form.items():
            ret_form[key] = torch.tensor(val)
        return ret_form

def collate_fn(batch):
    df = pd.DataFrame(batch)
    out_dict = {}
    for col in df.columns:
        out_dict[col] = pad_sequence(list(df[col]), batch_first=True)
    return out_dict
data = NumGlueDataset('dev')
        

question          False
answer            False
type              False
split             False
input_ids         False
attention_mask    False
labels            False
dtype: bool


In [21]:
dl = DataLoader(data, batch_size=2, shuffle=True, collate_fn=collate_fn) #returns a dict of str --> tensors. 
for d in dl:
    break

250
250
250
250
14
232
{'input_ids': tensor([[   33, 30126, 11079, 34771,   429, 39696,   374,   537, 12515,  6529,
           304,   279,   536,   323, 17064,  1435,   311,  1895,   279,  1372,
           315, 40811,   304,   279, 26594,   421,  1052,   525,   264,  2790,
           315,   220,    19,    23, 25989, 14201,    13,  3555,  1265,   387,
         39696,   594,  4226,    30,     0,     0,     0,     0],
        [   32,  8424,   380,  3880,   311,  1281,   220,    18,    15, 48038,
           315,   264,   220,    17,    20,  1018, 12904,  6291,   553, 26792,
          3786,   264,   220,    16,    20,  1018, 12904,  6291,   448,   264,
           220,    19,    15,  1018, 12904,  6291,    13,  2585,  1657, 48038,
           315,   279,   220,    19,    15,  1018,  6291,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1