In [6]:
import os
import argparse
import pickle as pkl
import random
import torch
import math
import json
import string
import logging
import numpy as np

from tqdm import tqdm
from collections import Counter, defaultdict

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import GPT2Tokenizer, AutoTokenizer


from llm_data import MetaICLData
from llm_model import MetaICLModel

In [7]:
from utils.data import load_data

# --- Experiment configuration -------------------------------------------
# Everything a reader might tune lives here, so the two load_data calls
# below cannot drift apart when the task or split is changed.
seed = 100
config_split = "test"
k = 16
task = "glue-sst2"        # first positional argument of load_data
dataset = "glue-sst2"     # comma-separated dataset override, or None
eval_split = "test"       # split loaded into dev_data
is_null = False


def _parse_dataset_names(spec):
    """Split a comma-separated dataset spec into a list of names.

    Returns None when ``spec`` is None. A fresh list is built on every call,
    so each load_data call receives its own list object (as in the original
    inline expressions).
    """
    return None if spec is None else spec.split(",")


# k-shot demonstrations drawn from the training split.
train_data = load_data(task, "train", k, seed=seed, config_split=config_split,
                       datasets=_parse_dataset_names(dataset))
# Evaluation examples; is_null is only forwarded for this split, as before.
dev_data = load_data(task, eval_split, k, seed=seed, config_split=config_split,
                     datasets=_parse_dataset_names(dataset), is_null=is_null)


In [8]:
import pprint

# Summarise both splits the same way: size, container type, first example.
# A "=====" separator line is printed between the two summaries.
for position, split_data in enumerate((train_data, dev_data)):
    if position > 0:
        print("=====")
    print(len(split_data))
    print(type(split_data))
    pprint.pprint(split_data[0])

16
<class 'list'>
{'input': 'sentence: the stars may be college kids , but the subject matter is '
          'as adult as you can get :',
 'options': ['negative', 'positive'],
 'output': 'positive',
 'task': 'glue-sst2'}
=====
872
<class 'list'>
{'input': "sentence: it 's a charming and often affecting journey .",
 'options': ['negative', 'positive'],
 'output': 'positive',
 'task': 'glue-sst2'}


In [9]:
# Notebook-level logger; INFO and above go to a single stream handler with a
# timestamped "time - level - logger name - message" layout.
logger = logging.getLogger(__name__)
handlers = [logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    handlers=handlers,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

In [18]:
# Build the data helper used for tensorization and dataloading below.
# NOTE(review): the arguments are positional and MetaICLData's signature is
# not visible in this notebook; the per-argument notes are assumptions to be
# confirmed against llm_data.MetaICLData.
metaicl_data = MetaICLData(
    logger,
    "meta-llama/Llama-2-7b-chat-hf",  # presumably the tokenizer/model id
    "direct",                         # presumably the inference method
    False,                            # presumably use_demonstrations
    16,                               # presumably k (number of demonstrations)
    256,                              # presumably max_length
    256,                              # presumably max_length_per_example
)

In [19]:
# Underscore-prefixed aliases for the two splits (same list objects, no copy).
_train_data, _test_data = train_data, dev_data

In [23]:
# Tensorize the training and dev examples; the results are read back from
# metaicl_data.tensorized_inputs in the cells that follow.
# NOTE(review): add_newlines=True presumably puts a newline between the input
# and the answer text (the decoded answer span below starts with '\n') —
# confirm in llm_data.
metaicl_data.tensorize(train_data, dev_data, add_newlines=True)

In [24]:
# Show the token_type_ids row of the first tensorized example.
metaicl_data.tensorized_inputs["token_type_ids"][0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# Sanity check: have the data helper log/print its first tensorized example.
metaicl_data.print_tensorized_example()

10/04/2023 00:06:26 - INFO - __main__ - Checking the first example...
Input:
<s> sentence: it 's a charming and often affecting journey . 
The answer is negative.<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><un

In [35]:
# Index of the tensorized example inspected in the next cells.
idx = 0

In [36]:
# Fetch the idx-th row of both tensors from the tensorized inputs.
input_ids, token_type_ids = (
    metaicl_data.tensorized_inputs[field][idx]
    for field in ("input_ids", "token_type_ids")
)
# Keep only the token ids whose token_type_id equals 1.
[
    token_id
    for token_id, segment_id in zip(input_ids, token_type_ids)
    if segment_id == 1
]

[tensor(29871),
 tensor(13),
 tensor(1576),
 tensor(1234),
 tensor(338),
 tensor(8178),
 tensor(29889)]

In [37]:
# Decode the tokens flagged with token_type_id == 1 back into text.
metaicl_data.tokenizer.decode(
    [
        token_id
        for token_id, segment_id in zip(input_ids, token_type_ids)
        if segment_id == 1
    ]
)

'\nThe answer is negative.'

In [38]:
# Drop the first position along the last dimension and return a contiguous
# tensor.
# NOTE(review): this looks like the one-step shift used to align the mask with
# next-token labels — confirm against the loss computation in llm_model.
token_type_ids[..., 1:].contiguous()

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### dataloader

In [26]:
# Build a non-training dataloader over the tensorized inputs.
# NOTE(review): the positional 4 is presumably the batch size (the batch
# inspected below has 4 rows) — confirm in llm_data.
dataloader = metaicl_data.get_dataloader(4, is_training=False)

10/04/2023 00:07:27 - INFO - __main__ - torch.Size([1744, 256])


In [27]:
# Pull the first batch from the dataloader for inspection.
batch = next(iter(dataloader))

In [28]:
# Columns 4..9 of the first tensor in the batch, for every row.
# NOTE(review): batch[0] appears to hold input_ids (the values look like token
# ids) — confirm the tensor order in get_dataloader.
batch[0][:,4:10]

tensor([[  525, 29879,   263,  1373,  4056,   322],
        [  525, 29879,   263,  1373,  4056,   322],
        [ 1579, 22466, 11687, 10767,   557,   322],
        [ 1579, 22466, 11687, 10767,   557,   322]])

In [31]:
# First row of the third tensor in the batch.
# NOTE(review): the printed values match the token_type_ids row shown earlier,
# so batch[2] is presumably token_type_ids — confirm in get_dataloader.
batch[2][0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
# First row of the first tensor in the batch (presumably input_ids; the
# trailing zeros look like padding — confirm).
batch[0][0]

tensor([    1, 10541, 29901,   372,   525, 29879,   263,  1373,  4056,   322,
         4049,  6602,   292, 16342,   869, 29871,    13,  1576,  1234,   338,
         8178, 29889,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Record the installed `datasets` library version for reproducibility.
print(datasets.__version__)

2.13.0


In [3]:
# Load the 'SetFit/subj' dataset (downloaded on first use, then served from
# the local cache) and display the resulting DatasetDict.
datasets.load_dataset('SetFit/subj')

Downloading readme: 100%|██████████| 248/248 [00:00<00:00, 746kB/s]


Downloading and preparing dataset json/SetFit--subj to /srv/home/zxu444/.cache/huggingface/datasets/SetFit___json/SetFit--subj-15fb6571305f6f49/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data: 100%|██████████| 1.45M/1.45M [00:00<00:00, 22.5MB/s]
Downloading data: 100%|██████████| 364k/364k [00:00<00:00, 10.5MB/s]]
Downloading data files: 100%|██████████| 2/2 [00:01<00:00,  1.79it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 266.98it/s]
                                                        

Dataset json downloaded and prepared to /srv/home/zxu444/.cache/huggingface/datasets/SetFit___json/SetFit--subj-15fb6571305f6f49/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 255.10it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})