In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from functools import partial

def encode_with_prompt_completion_format(example, tokenizer, max_seq_length, add_bos=False):
    '''
    Tokenize one example that has 'prompt' and 'completion' fields.

    The prompt and completion are concatenated and tokenized together, because
    tokenizing the prompt separately would pad/truncate it on its own and it would
    not make sense to follow it directly with the completion.

    Args:
        example: mapping with string 'prompt' and 'completion' entries.
        tokenizer: callable tokenizer exposing `eos_token` (and `bos_token` when
            `add_bos` is true); it is called with `return_tensors='pt'`.
        max_seq_length: forwarded to the tokenizer as `max_length`, with truncation on.
        add_bos: when true, `tokenizer.bos_token` is prepended to the text.

    Returns:
        dict with flattened tensors 'input_ids', 'labels' and 'attention_mask'.
        Labels are a plain copy of the input ids: prompt masking is switched off
        in this notebook, so every token keeps its label.
    '''
    prompt = example['prompt']
    completion = example['completion']
    separators = (' ', '\n', '\t')
    # Insert a single space only when neither side already supplies a separator.
    needs_space = not prompt.endswith(separators) and not completion.startswith(separators)
    example_text = prompt + (' ' if needs_space else '') + completion + tokenizer.eos_token
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()
    # The prompt is no longer tokenized a second time: its length was only needed for
    # the prompt mask (labels[:, :prompt_len] = -100), which is disabled here.
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }


def encode_with_messages_format(example, tokenizer, max_seq_length, add_bos=False):
    '''
    Tokenize one example that has a 'messages' field.

    Each message is a dict with 'role' and 'content' fields. All messages are
    concatenated, with the roles as delimiters, and tokenized together.

    Args:
        example: mapping whose 'messages' entry is a non-empty sequence of messages.
        tokenizer: callable tokenizer exposing `eos_token` (and `bos_token` when
            `add_bos` is true); it is called with `return_tensors='pt'`.
        max_seq_length: forwarded to the tokenizer as `max_length`, with truncation on.
        add_bos: when true, `tokenizer.bos_token` is prepended to the text.

    Returns:
        dict with flattened tensors 'input_ids', 'labels' and 'attention_mask'.
        Labels are a plain copy of the input ids: masking of non-assistant turns is
        switched off in this notebook, so every token keeps its label.

    Raises:
        ValueError: if 'messages' is empty or a message has a role other than
            'system', 'user' or 'assistant'.
    '''
    messages = example['messages']
    if not messages:
        raise ValueError('messages field is empty.')

    role_tags = {
        "system": "<|system|>\n",
        "user": "<|user|>\n",
        "assistant": "<|assistant|>\n",
    }

    def _concat_messages(messages):
        parts = []
        for message in messages:
            role = message["role"]
            if role not in role_tags:
                raise ValueError("Invalid role: {}".format(role))
            # Only assistant turns are closed with the EOS token.
            terminator = tokenizer.eos_token + "\n" if role == "assistant" else "\n"
            parts.append(role_tags[role] + message["content"].strip() + terminator)
        return "".join(parts)

    example_text = _concat_messages(messages).strip()
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()

    # The per-message prefix tokenization that used to follow only computed the span
    # for `labels[:, start:end] = -100`. With that assignment disabled it had no
    # effect on the result, so it was removed rather than paying for extra tokenizer
    # calls on every non-assistant message.
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

dataset_name = 'random_1'
model_name_or_path = "meta-llama/Llama-3.2-3B"
data_path = f"selected_data/{dataset_name}.json"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
raw_dataset = load_dataset("json", data_files=data_path)

# Pick the encoder that matches the columns present in the train split.
column_names = raw_dataset["train"].column_names
if "prompt" in column_names and "completion" in column_names:
    encode_impl = encode_with_prompt_completion_format
elif "messages" in column_names:
    encode_impl = encode_with_messages_format
else:
    # Without this branch `encode_function` stayed undefined and the failure only
    # surfaced later as a NameError inside the second `map` call.
    raise ValueError(
        "Dataset needs either 'prompt' and 'completion' columns or a 'messages' column; "
        f"found {column_names}."
    )

encode_function = partial(
    encode_impl,
    tokenizer=tokenizer,
    max_seq_length=2048,
    add_bos=False,
)

# Tag every example with its row index.
raw_dataset = raw_dataset.map(
    lambda example, idx: {"idx": idx},
    with_indices=True,
    desc="Adding idx column",
)

# No `remove_columns` is passed, so the source columns stay alongside the new
# input_ids / labels / attention_mask columns.
lm_datasets = raw_dataset.map(
    encode_function,
    batched=False,
    desc="Tokenizing and reformatting instruction data",
)

train_dataset = lm_datasets['train']

In [14]:
# One all-(-100) label row per example, each as long as that example's label list.
selected_labels = [[-100] * len(label) for label in train_dataset['labels']]

In [13]:
# Print the length of every label list, one per line.
for n_tokens in map(len, train_dataset['labels']):
    print(n_tokens)

320
128
449
133
256
34
353
122
69
39
56
58
475
229
305
294
301
156
101
106
36
383
71
178
509
71
414
644
29
46
123
323
85
158
178
172
324
301
66
1778
612
348
249
248
216
59
112
520
603
79
94
483
523
468
530
1467
125
104
118
505
450
712
410
227
387
242
905
157
476
242
64
361
203
49
142
237
52
244
133
175
120
179
596
38
180
102
73
444
226
289
78
502
723
64
633
200
400
122
322
375
41
1994
572
122
213
171
59
62
185
176
177
362
471
218
1402
83
82
372
73
823
1491
536
46
704
324
408
239
33
55
524
512
56
198
598
187
54
292
430
380
77
345
559
605
86
1079
2048
36
304
761
298
173
170
562
235
99
1486
216
1298
57
39
150
304
77
279
61
123
575
257
35
394
40
60
218
50
83
180
79
337
323
37
602
166
85
361
172
34
118
173
97
163
859
59
196
526
391
99
207
33
162
307
534
252
508
141
522
135
47
100
201
750
58
497
207
531
180
44
188
503
148
419
223
120
61
959
402
108
274
193
304
62
208
111
356
672
42
205
39
32
113
310
117
115
39
924
515
736
289
105
295
1733
40
159
126
153
367
91
74
685
75
41
63
500
491
302
85


In [5]:
labels = train_dataset['labels']

# Total number of label positions summed over all examples.
all_len = sum(map(len, labels))

all_len

664120

## Select tokens

In [65]:
import torch
import numpy as np
from datasets import load_dataset
dataset_name = 'random'
# Per-example token losses from two checkpoints ("base" and "test" per the file
# names). Presumably one sequence of floats per example - TODO confirm the format.
losses_pre = torch.load(f"token_losses_{dataset_name}_base.pt")
losses_new = torch.load(f"token_losses_{dataset_name}_test.pt")

# Window of examples processed by this cell.
start = 1000
length = 1000

loss_diff = []
loss_HL_prop = []
select_tokens_indices = []
for loss1, loss2 in zip(losses_pre[start:start+length], losses_new[start:start+length]):
    # Positive entries are tokens whose loss is lower in `losses_new` than in `losses_pre`.
    diff = np.array(loss1) - np.array(loss2)
    loss_diff.append(diff)
    # Keep the half of the positions with the largest loss decrease.
    _, indices = torch.topk(torch.tensor(diff), k=len(diff)//2)
    # Shift by one so the index addresses the matching token in the label sequence
    # (presumably loss position i scores token i + 1 - TODO confirm).
    select_tokens_indices.append((indices + 1).tolist())
    # Percentage of positions whose loss went down.
    loss_HL_prop.append(round(np.sum(diff>0)/len(diff) * 100, 3))

# Read the label window once. The stray `train_dataset['labels']` expression that sat
# here was a no-op in the middle of the cell and has been dropped.
window_labels = train_dataset['labels'][start:start+length]

new_labels = []
for selected_indices, label in zip(select_tokens_indices, window_labels):
    # Start fully masked, then restore the label only at the selected positions.
    new_label = [-100] * len(label)
    # top-k indices are distinct, so the write order is irrelevant and no sort is needed.
    for idx in selected_indices:
        new_label[idx] = label[idx]
    new_labels.append(new_label)
    


  losses_pre = torch.load(f"token_losses_{dataset_name}_base.pt")
  losses_new = torch.load(f"token_losses_{dataset_name}_test.pt")


In [66]:
# Slice-assigning into `train_dataset['labels']` only touches the object returned by
# the column lookup and never writes back to the dataset. Build the complete updated
# column and swap it in instead.
updated_labels = list(train_dataset['labels'])
# Use len(new_labels) so a window shorter than `length` cannot change the column length.
updated_labels[start:start + len(new_labels)] = new_labels
train_dataset = train_dataset.remove_columns('labels').add_column('labels', updated_labels)

## Split data into several subsets for multi-epoch runs

In [3]:
import json
from datasets import load_dataset

data_path = 'selected_data/'
dataset_name = 'filtered-cured'

dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# `data_size` is the number of examples per output file; `subset_size` is how many
# full files fit (a trailing partial chunk is not written).
data_size = 2000
subset_size = len(dataset) // data_size

# Write consecutive, non-overlapping chunks of `data_size` examples.
for i, chunk_start in enumerate(range(0, subset_size * data_size, data_size)):
    selected_indices = list(range(chunk_start, chunk_start + data_size))
    subset = dataset.select(selected_indices)
    subset.to_json(f"{data_path}{dataset_name}-iter_{i}.json")

Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 24.87ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 41.79ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 40.96ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 40.89ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 42.01ba/s]


## Global-level top-k data selection

In [3]:
import json
from datasets import load_dataset
import random
import numpy as np

class TemporarilySeededRandom:
    """Context manager that temporarily seeds `random` and `numpy.random`.

    On entry the current global state of both generators is saved and both are
    re-seeded with `seed`. On exit the saved states are restored, so code after
    the block continues the random streams as if the block had never run.
    """

    def __init__(self, seed):
        """Remember the seed to apply on entry; nothing is seeded yet."""
        self.seed = seed
        self.stored_state = None
        self.stored_np_state = None

    def __enter__(self):
        # Store the current random state
        self.stored_state = random.getstate()
        self.stored_np_state = np.random.get_state()

        # Set the random seed
        random.seed(self.seed)
        np.random.seed(self.seed)

        # Return the manager so `with TemporarilySeededRandom(s) as ctx:` binds it.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the random state
        random.setstate(self.stored_state)
        np.random.set_state(self.stored_np_state)
        # Falsy return: an exception raised inside the block still propagates.
        return False



data_path = 'selected_data/'

# dataset_name = 'random' #'filtered-cured'
dataset_name = 'filtered-cured-50k'#'filtered-cured'

train_dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']


### num_iters
num_iters = 10

# One unit is 1% of the dataset (rounded down).
subset_size = int(len(train_dataset) * 0.01)

for idx in range(num_iters):
    # Odd iterations draw 6 units; even iterations (the "all token selection with
    # subset" case) draw 2 units.
    size_multiplier = 6 if idx % 2 == 1 else 2

    # Seed per iteration so each subset is reproducible; sampling is without replacement.
    with TemporarilySeededRandom(idx * 10086):
        random_indices = np.random.choice(
            len(train_dataset), size=subset_size * size_multiplier, replace=False
        )

    subset = train_dataset.select(random_indices)
    subset.to_json(f"{data_path}{dataset_name}-all-iter-global-subset-small-new_{idx}.json")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 17.97ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 17.72ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 21.30ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 17.79ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 15.81ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 21.56ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 19.41ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 17.81ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 22.47ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 17.25ba/s]


### Non-iteration form

In [1]:
import json
from datasets import load_dataset
import random
import numpy as np

class TemporarilySeededRandom:
    """Context manager that temporarily seeds `random` and `numpy.random`.

    On entry the current global state of both generators is saved and both are
    re-seeded with `seed`. On exit the saved states are restored, so code after
    the block continues the random streams as if the block had never run.
    """

    def __init__(self, seed):
        """Remember the seed to apply on entry; nothing is seeded yet."""
        self.seed = seed
        self.stored_state = None
        self.stored_np_state = None

    def __enter__(self):
        # Store the current random state
        self.stored_state = random.getstate()
        self.stored_np_state = np.random.get_state()

        # Set the random seed
        random.seed(self.seed)
        np.random.seed(self.seed)

        # Return the manager so `with TemporarilySeededRandom(s) as ctx:` binds it.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the random state
        random.setstate(self.stored_state)
        np.random.set_state(self.stored_np_state)
        # Falsy return: an exception raised inside the block still propagates.
        return False



data_path = 'selected_data/'

# dataset_name = 'random' #'filtered-cured'
dataset_name = 'filtered-cured-50k'#'filtered-cured'

train_dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']


### num_iters
num_iters = 5

# One unit is 1% of the dataset (rounded down).
subset_size = int(len(train_dataset) * 0.01)

for idx in range(num_iters):
    # Every iteration draws 5 units without replacement. The first iteration and the
    # later ones used the same size, so a single code path covers both.
    with TemporarilySeededRandom(idx * 10086):
        random_indices = np.random.choice(
            len(train_dataset), size=subset_size * 5, replace=False
        )

    subset = train_dataset.select(random_indices)
    subset.to_json(f"{data_path}{dataset_name}-all-global-subset-new_{idx}.json")

  from .autonotebook import tqdm as notebook_tqdm
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.82ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 22.11ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 20.58ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 21.40ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 20.61ba/s]
