In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from functools import partial

def encode_with_prompt_completion_format(example, tokenizer, max_seq_length, add_bos=False, mask_prompt=False):
    '''
    Tokenize one prompt/completion example into causal-LM training features.

    Here we assume each example has 'prompt' and 'completion' fields.
    We concatenate prompt and completion and tokenize them together because otherwise the prompt
    would be padded/truncated on its own and it would not make sense to follow it directly with
    the completion.

    Args:
        example: mapping with string fields 'prompt' and 'completion'.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_prompt: if True, overwrite the labels of the prompt tokens with -100 so that they
            are masked out of the loss. Defaults to False: labels are a full copy of input_ids.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.
    '''
    prompt, completion = example['prompt'], example['completion']
    whitespace = (' ', '\n', '\t')
    # if prompt doesn't end with space and completion doesn't start with space, add space
    if not prompt.endswith(whitespace) and not completion.startswith(whitespace):
        example_text = prompt + ' ' + completion
    else:
        example_text = prompt + completion
    example_text = example_text + tokenizer.eos_token
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()
    if mask_prompt:
        # The prompt is tokenized separately only when its token count is actually needed.
        # NOTE(review): assumes the prompt alone tokenizes to the same prefix as in the joint text.
        tokenized_prompt = tokenizer(prompt, return_tensors='pt', max_length=max_seq_length, truncation=True)
        # mask the prompt part for avoiding loss
        labels[:, :tokenized_prompt.input_ids.shape[1]] = -100
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }


def encode_with_messages_format(example, tokenizer, max_seq_length, add_bos=False, mask_non_assistant=False):
    '''
    Tokenize one chat-style example into causal-LM training features.

    Here we assume each example has a 'messages' field. Each message is a dict with 'role' and
    'content' fields. We concatenate all messages with the roles as delimiters and tokenize them
    together.

    Args:
        example: mapping with a non-empty 'messages' list.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_non_assistant: if True, overwrite the labels of every non-assistant message (and of
            the '<|assistant|>' role marker that follows it) with -100 to mask them out of the
            loss. Defaults to False: labels are a full copy of input_ids and no prefix is
            re-tokenized.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.

    Raises:
        ValueError: if 'messages' is empty or a message has a role other than
            'system', 'user' or 'assistant'.
    '''
    messages = example['messages']
    if len(messages) == 0:
        raise ValueError('messages field is empty.')

    def _concat_messages(messages):
        # Render messages as "<|role|>\ncontent\n"; assistant turns are terminated with EOS.
        message_text = ""
        for message in messages:
            if message["role"] == "system":
                message_text += "<|system|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "user":
                message_text += "<|user|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "assistant":
                message_text += "<|assistant|>\n" + message["content"].strip() + tokenizer.eos_token + "\n"
            else:
                raise ValueError("Invalid role: {}".format(message["role"]))
        return message_text

    def _num_tokens(text):
        # Token count of `text` under the same truncation settings as the full example.
        return tokenizer(text, return_tensors='pt', max_length=max_seq_length, truncation=True).input_ids.shape[1]

    example_text = _concat_messages(messages).strip()
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()

    if mask_non_assistant:
        # mask the non-assistant part for avoiding loss
        # NOTE(review): span boundaries come from re-tokenizing prefixes; this assumes prefix
        # tokenization lines up with the tokenization of the full text.
        for message_idx, message in enumerate(messages):
            if message["role"] == "assistant":
                continue
            if message_idx == 0:
                message_start_idx = 0
            else:
                message_start_idx = _num_tokens(_concat_messages(messages[:message_idx]))
            if message_idx < len(messages) - 1 and messages[message_idx+1]["role"] == "assistant":
                # here we also ignore the role of the assistant
                messages_so_far = _concat_messages(messages[:message_idx+1]) + "<|assistant|>\n"
            else:
                messages_so_far = _concat_messages(messages[:message_idx+1])
            message_end_idx = _num_tokens(messages_so_far)
            labels[:, message_start_idx:message_end_idx] = -100

            # Everything beyond this point was truncated away; nothing left to mask.
            if message_end_idx >= max_seq_length:
                break

    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

# --- Configuration ---
dataset_name='test_dataset'
model_name_or_path = "meta-llama/Llama-3.2-3B"
data_path = f"selected_data/{dataset_name}.json"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
raw_dataset = load_dataset("json", data_files=data_path)

# Pick the encoder that matches the columns of the 'train' split.
train_columns = raw_dataset["train"].column_names
if "prompt" in train_columns and "completion" in train_columns:
    encode_function = partial(
        encode_with_prompt_completion_format,
        tokenizer=tokenizer,
        max_seq_length= 2048,
        add_bos= False,
    )
elif "messages" in train_columns:
    encode_function = partial(
        encode_with_messages_format,
        tokenizer=tokenizer,
        max_seq_length= 2048,
        add_bos= False,
    )
else:
    # Without this branch `encode_function` would be unbound (or stale from an earlier run)
    # and the failure would only surface later, inside `map`.
    raise ValueError(
        "Dataset must contain either 'prompt' and 'completion' columns or a 'messages' column; "
        f"found {train_columns}."
    )

# Attach each row's position as an 'idx' column.
raw_dataset = raw_dataset.map(
    lambda example, idx: {"idx": idx},
    with_indices=True,
    desc="Adding idx column",
)

# Tokenize example by example; the original columns are kept alongside the new ones.
lm_datasets = raw_dataset.map(
    encode_function,
    batched=False,
    desc="Tokenizing and reformatting instruction data",
)

train_dataset = lm_datasets['train']

Adding idx column: 100%|██████████| 1000/1000 [00:00<00:00, 17192.73 examples/s]
Tokenizing and reformatting instruction data: 100%|██████████| 1000/1000 [00:02<00:00, 478.62 examples/s]


In [4]:
# Display the tokenized training split (its column names and row count).
train_dataset

Dataset({
    features: ['dataset', 'id', 'messages', 'idx', 'input_ids', 'labels', 'attention_mask'],
    num_rows: 1000
})

## Split the data into several subsets for multi-epoch runs

In [4]:
import json
from datasets import load_dataset

data_path = 'selected_data/'

# Alternative datasets (swap the active line below to use one of them):
# dataset_name = 'filtered-cured-50k'
# dataset_name = "random_subset_50k"
# dataset_name = "alpaca_52k"
# dataset_name = "full"
# dataset_name = "filtered-cured-10k"
dataset_name = "test_100"

dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# NOTE: despite its name, `subset_size` is the NUMBER of subsets; `data_size` is the rows per subset.
subset_size = 5

# Floor division: the trailing len(dataset) % subset_size rows are not written to any subset.
data_size = len(dataset) // subset_size
# data_size = 10

# Write consecutive, equally sized slices of the dataset, one JSON file per slice.
for i in range(subset_size):
    selected_indices = list(range(data_size * i, data_size * (i + 1)))
    subset = dataset.select(selected_indices)
    subset.to_json(f"{data_path}{dataset_name}-iter-split-global-curve-positive-new_{i}.json")
    
    

Generating train split: 100 examples [00:00, 24466.57 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 669.59ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 803.66ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 672.92ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 705.87ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 823.06ba/s]


In [1]:
import json
from datasets import load_dataset

data_path = 'selected_data/'
dataset_name = 'filtered-cured-50k'
exp_tag = "non-iter-split-global-new-randtok"

dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# Size of the first subset.
first_subset_size = 5000

# The rows left after the first subset are divided evenly into `num_iter` further subsets.
# Floor division: any remainder rows at the end of the dataset are not written anywhere.
num_iter = 4
remaining_data_size = len(dataset) - first_subset_size
subset_size = remaining_data_size // num_iter
print(subset_size)

# Subset 0: the first `first_subset_size` rows.
first_subset = dataset.select(range(first_subset_size))
first_subset.to_json(f"{data_path}{dataset_name}-{exp_tag}_0.json")

# Subsets 1..num_iter: consecutive, equally sized slices of the remaining rows.
for i in range(num_iter):
    start_idx = first_subset_size + i * subset_size
    end_idx = start_idx + subset_size
    selected_indices = list(range(start_idx, end_idx))
    subset = dataset.select(selected_indices)
    subset.to_json(f"{data_path}{dataset_name}-{exp_tag}_{i+1}.json")


11250


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

In [3]:
import json
from datasets import load_dataset
import random
import numpy as np

class TemporarilySeededRandom:
    """Context manager that seeds `random` and `numpy.random` for the duration of a `with` block.

    On entry the current global states of both generators are saved and both are re-seeded
    with `seed`; on exit the saved states are restored.
    """

    def __init__(self, seed):
        """Temporarily set the random seed, and then restore it when exiting the context."""
        self.seed = seed
        self.stored_state = None
        self.stored_np_state = None

    def __enter__(self):
        # Store the current random state
        self.stored_state = random.getstate()
        self.stored_np_state = np.random.get_state()

        # Set the random seed
        random.seed(self.seed)
        np.random.seed(self.seed)

        # Return the manager so `with TemporarilySeededRandom(s) as ctx:` binds it instead of None.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the random state; returning None lets exceptions from the block propagate.
        random.setstate(self.stored_state)
        np.random.set_state(self.stored_np_state)



data_path = 'selected_data/'

# dataset_name = 'random' #'filtered-cured'
dataset_name = 'filtered-cured-50k'

train_dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# Number of output files to write.
num_iters = 5

# 1% of the dataset, rounded down.
subset_size = int(len(train_dataset) * 0.01)

for idx in range(num_iters):
    if idx == 0:
        # File 0: a seeded random sample of 2 * subset_size rows (all-token selection on a subset).
        with TemporarilySeededRandom(idx * 10086):
            random_indices = np.random.choice(len(train_dataset), size=subset_size*2, replace=False)
        subset = train_dataset.select(random_indices)
    else:
        # Every later file is the full dataset, unsampled.
        subset = train_dataset

    subset.to_json(f"{data_path}{dataset_name}-all-non-iter-global_{idx}.json")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

## Global-level top-k data selection

In [1]:
import json
from datasets import load_dataset
import random
import numpy as np

class TemporarilySeededRandom:
    """Context manager that seeds `random` and `numpy.random` for the duration of a `with` block.

    On entry the current global states of both generators are saved and both are re-seeded
    with `seed`; on exit the saved states are restored.
    """

    def __init__(self, seed):
        """Temporarily set the random seed, and then restore it when exiting the context."""
        self.seed = seed
        self.stored_state = None
        self.stored_np_state = None

    def __enter__(self):
        # Store the current random state
        self.stored_state = random.getstate()
        self.stored_np_state = np.random.get_state()

        # Set the random seed
        random.seed(self.seed)
        np.random.seed(self.seed)

        # Return the manager so `with TemporarilySeededRandom(s) as ctx:` binds it instead of None.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the random state; returning None lets exceptions from the block propagate.
        random.setstate(self.stored_state)
        np.random.set_state(self.stored_np_state)



data_path = 'selected_data/'

# dataset_name = 'random' #'filtered-cured'
dataset_name = 'filtered-cured-50k'

train_dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# Tag embedded in the output file names; other values used: global, sample, union,
# additional_two_tokens, intersection.
data_type_tag = 'combine_loss'

# Number of output files to write.
num_iters = 10

# 1% of the dataset, rounded down.
subset_size = int(len(train_dataset) * 0.01)

for idx in range(num_iters):
    # Odd iterations sample 6 * subset_size rows; even iterations (all-token selection on a
    # subset) sample 2 * subset_size rows. Each iteration uses its own seed.
    multiplier = 6 if idx % 2 == 1 else 2
    with TemporarilySeededRandom(idx * 10086):
        random_indices = np.random.choice(len(train_dataset), size=subset_size * multiplier, replace=False)

    subset = train_dataset.select(random_indices)
    subset.to_json(f"{data_path}{dataset_name}-all-iter-{data_type_tag}-subset-small-new_{idx}.json")

Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

### Non-iteration form

In [2]:
import json
from datasets import load_dataset
import random
import numpy as np

class TemporarilySeededRandom:
    """Context manager that seeds `random` and `numpy.random` for the duration of a `with` block.

    On entry the current global states of both generators are saved and both are re-seeded
    with `seed`; on exit the saved states are restored.
    """

    def __init__(self, seed):
        """Temporarily set the random seed, and then restore it when exiting the context."""
        self.seed = seed
        self.stored_state = None
        self.stored_np_state = None

    def __enter__(self):
        # Store the current random state
        self.stored_state = random.getstate()
        self.stored_np_state = np.random.get_state()

        # Set the random seed
        random.seed(self.seed)
        np.random.seed(self.seed)

        # Return the manager so `with TemporarilySeededRandom(s) as ctx:` binds it instead of None.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the random state; returning None lets exceptions from the block propagate.
        random.setstate(self.stored_state)
        np.random.set_state(self.stored_np_state)



data_path = 'selected_data/'

# dataset_name = 'random' #'filtered-cured'
dataset_name = 'filtered-cured-50k'

train_dataset = load_dataset("json", data_files=f"{data_path}{dataset_name}_dataset.json")['train']

# Number of output files to write.
num_iters = 5

# 1% of the dataset, rounded down.
subset_size = int(len(train_dataset) * 0.01)

for idx in range(num_iters):
    # File 0 (all-token selection on a subset) samples 2 * subset_size rows; every later file
    # samples 6 * subset_size rows. Each iteration uses its own seed.
    multiplier = 6 if idx > 0 else 2
    with TemporarilySeededRandom(idx * 10086):
        random_indices = np.random.choice(len(train_dataset), size=subset_size * multiplier, replace=False)

    subset = train_dataset.select(random_indices)
    subset.to_json(f"{data_path}{dataset_name}-all-non-iter-sample-subset-new_{idx}.json")
    

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

# Print each sample's text with the selected tokens highlighted



In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from functools import partial
import os
from termcolor import colored
from tqdm import tqdm 

def encode_with_prompt_completion_format(example, tokenizer, max_seq_length, add_bos=False, mask_prompt=False):
    '''
    Tokenize one prompt/completion example into causal-LM training features.

    Here we assume each example has 'prompt' and 'completion' fields.
    We concatenate prompt and completion and tokenize them together because otherwise the prompt
    would be padded/truncated on its own and it would not make sense to follow it directly with
    the completion.

    Args:
        example: mapping with string fields 'prompt' and 'completion'.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_prompt: if True, overwrite the labels of the prompt tokens with -100 so that they
            are masked out of the loss. Defaults to False: labels are a full copy of input_ids.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.
    '''
    prompt, completion = example['prompt'], example['completion']
    whitespace = (' ', '\n', '\t')
    # if prompt doesn't end with space and completion doesn't start with space, add space
    if not prompt.endswith(whitespace) and not completion.startswith(whitespace):
        example_text = prompt + ' ' + completion
    else:
        example_text = prompt + completion
    example_text = example_text + tokenizer.eos_token
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()
    if mask_prompt:
        # The prompt is tokenized separately only when its token count is actually needed.
        # NOTE(review): assumes the prompt alone tokenizes to the same prefix as in the joint text.
        tokenized_prompt = tokenizer(prompt, return_tensors='pt', max_length=max_seq_length, truncation=True)
        # mask the prompt part for avoiding loss
        labels[:, :tokenized_prompt.input_ids.shape[1]] = -100
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def encode_with_messages_format(example, tokenizer, max_seq_length, add_bos=False, mask_non_assistant=False):
    '''
    Tokenize one chat-style example into causal-LM training features.

    Here we assume each example has a 'messages' field. Each message is a dict with 'role' and
    'content' fields. We concatenate all messages with the roles as delimiters and tokenize them
    together.

    Args:
        example: mapping with a non-empty 'messages' list.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_non_assistant: if True, overwrite the labels of every non-assistant message (and of
            the '<|assistant|>' role marker that follows it) with -100 to mask them out of the
            loss. Defaults to False: labels are a full copy of input_ids and no prefix is
            re-tokenized.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.

    Raises:
        ValueError: if 'messages' is empty or a message has a role other than
            'system', 'user' or 'assistant'.
    '''
    messages = example['messages']
    if len(messages) == 0:
        raise ValueError('messages field is empty.')

    def _concat_messages(messages):
        # Render messages as "<|role|>\ncontent\n"; assistant turns are terminated with EOS.
        message_text = ""
        for message in messages:
            if message["role"] == "system":
                message_text += "<|system|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "user":
                message_text += "<|user|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "assistant":
                message_text += "<|assistant|>\n" + message["content"].strip() + tokenizer.eos_token + "\n"
            else:
                raise ValueError("Invalid role: {}".format(message["role"]))
        return message_text

    def _num_tokens(text):
        # Token count of `text` under the same truncation settings as the full example.
        return tokenizer(text, return_tensors='pt', max_length=max_seq_length, truncation=True).input_ids.shape[1]

    example_text = _concat_messages(messages).strip()
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()

    if mask_non_assistant:
        # mask the non-assistant part for avoiding loss
        # NOTE(review): span boundaries come from re-tokenizing prefixes; this assumes prefix
        # tokenization lines up with the tokenization of the full text.
        for message_idx, message in enumerate(messages):
            if message["role"] == "assistant":
                continue
            if message_idx == 0:
                message_start_idx = 0
            else:
                message_start_idx = _num_tokens(_concat_messages(messages[:message_idx]))
            if message_idx < len(messages) - 1 and messages[message_idx+1]["role"] == "assistant":
                # here we also ignore the role of the assistant
                messages_so_far = _concat_messages(messages[:message_idx+1]) + "<|assistant|>\n"
            else:
                messages_so_far = _concat_messages(messages[:message_idx+1])
            message_end_idx = _num_tokens(messages_so_far)
            labels[:, message_start_idx:message_end_idx] = -100

            # Everything beyond this point was truncated away; nothing left to mask.
            if message_end_idx >= max_seq_length:
                break

    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def encode_data(raw_dataset):
    '''
    Tokenize the 'train' split of `raw_dataset` and return its per-example labels.

    The encoder is chosen from the columns of the 'train' split: the prompt/completion encoder
    when both 'prompt' and 'completion' are present, otherwise the messages encoder when
    'messages' is present. Uses the module-level `tokenizer`, max_seq_length=2048, add_bos=False.

    Args:
        raw_dataset: object whose 'train' split exposes `column_names` and which provides `map`
            (e.g. the result of `load_dataset`).

    Returns:
        The 'labels' column of the tokenized 'train' split.

    Raises:
        ValueError: if the split has neither ('prompt', 'completion') nor 'messages' columns.
    '''
    column_names = raw_dataset["train"].column_names
    if "prompt" in column_names and "completion" in column_names:
        encoder = encode_with_prompt_completion_format
    elif "messages" in column_names:
        encoder = encode_with_messages_format
    else:
        # Previously this fell through and failed later with an unbound `encode_function`.
        raise ValueError(
            "Dataset must contain either 'prompt' and 'completion' columns or a 'messages' column; "
            f"found {column_names}."
        )

    encode_function = partial(
        encoder,
        tokenizer=tokenizer,
        max_seq_length= 2048,
        add_bos= False,
    )

    # Tokenize example by example; the original columns are kept alongside the new ones.
    lm_datasets = raw_dataset.map(
        encode_function,
        batched=False,
        desc="Tokenizing and reformatting instruction data",
    )

    return lm_datasets['train']['labels']
    
###############################################

model_name_or_path = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Which token-selection run to visualise; it is part of both file-name prefixes below.
train_tag = 'sample'

print(f"#### Current Token Selection Pattern: {train_tag}-level #####")

label_tag = f'token_labels_filtered-cured-50k-all-iter-{train_tag}-subset-small-new_'
data_tag = f"filtered-cured-50k-all-iter-{train_tag}-subset-small-new_"

label_path = 'results/label/'
data_path = "selected_data/"

# Maps file index -> list of rendered (colourised) sample texts.
text_all_files = {}
for idx in range(1, 2):
    cur_model_label = f"{label_tag}{idx}.pt"
    cur_orig_data = f"{data_tag}{idx}.json"

    # Labels obtained by tokenizing the original data (one sequence of token ids per sample).
    raw_dataset = load_dataset("json", data_files=data_path + cur_orig_data)
    orig_labels_all = encode_data(raw_dataset)

    # Labels saved by the selection run; entries equal to -100 are rendered without highlight.
    cur_labels_all = torch.load(label_path + cur_model_label, weights_only=False)

    text_all = []
    # Only the first 20 samples are printed; every sample is still collected.
    count = 20

    for cur_labels, orig_labels in tqdm(zip(cur_labels_all, orig_labels_all), desc="handing each sample"):
        text = []
        for cur_label, orig_label in zip(cur_labels, orig_labels):
            token = tokenizer.decode([orig_label], skip_special_tokens=True)
            # Tokens whose saved label is not -100 are highlighted in red.
            text.append(colored(token, 'red') if cur_label != -100 else token)

        text_single = "".join(text)

        if count > 0:
            print('#' * 200 + '\n')
            print(text_single)
        count -= 1

        text_all.append(text_single)

    text_all_files[idx] = text_all
    

#### Current Token Selection Pattern: sample-level #####


handing each sample: 45it [00:00, 216.09it/s]

########################################################################################################################################################################################################

[31m<[0m[31m|[0m[31muser[0m[31m|[0m[31m>
[0m[31mHow[0m[31m can[0m I[31m use[0m Ruby[31m code[0m[31m to[0m scrape[31m data[0m for my[31m lecture[0m[31m on[0m[31m global[0m[31m warming[0m[31m?[0m[31m What[0m libraries[31m and[0m[31m methods[0m[31m can[0m[31m I[0m use[31m to[0m[31m extract[0m[31m relevant[0m[31m information[0m[31m from[0m[31m sources[0m[31m such[0m as[31m scientific[0m[31m journals[0m[31m and[0m[31m news[0m[31m articles[0m[31m?[0m And[31m how[0m[31m can[0m[31m I[0m use this[31m data[0m[31m to[0m[31m create[0m compelling visualizations[31m and[0m[31m graphs[0m[31m to[0m[31m engage[0m[31m my[0m[31m audience[0m?[31m Additionally[0m, what[31m Ruby[0m[31m code[0m[31m can[0m[31m I

handing each sample: 3000it [00:14, 203.23it/s]


## Compare the tokens selected by global-level and sample-level selection

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from functools import partial
import os
from termcolor import colored
from tqdm import tqdm 

def encode_with_prompt_completion_format(example, tokenizer, max_seq_length, add_bos=False, mask_prompt=False):
    '''
    Tokenize one prompt/completion example into causal-LM training features.

    Here we assume each example has 'prompt' and 'completion' fields.
    We concatenate prompt and completion and tokenize them together because otherwise the prompt
    would be padded/truncated on its own and it would not make sense to follow it directly with
    the completion.

    Args:
        example: mapping with string fields 'prompt' and 'completion'.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_prompt: if True, overwrite the labels of the prompt tokens with -100 so that they
            are masked out of the loss. Defaults to False: labels are a full copy of input_ids.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.
    '''
    prompt, completion = example['prompt'], example['completion']
    whitespace = (' ', '\n', '\t')
    # if prompt doesn't end with space and completion doesn't start with space, add space
    if not prompt.endswith(whitespace) and not completion.startswith(whitespace):
        example_text = prompt + ' ' + completion
    else:
        example_text = prompt + completion
    example_text = example_text + tokenizer.eos_token
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()
    if mask_prompt:
        # The prompt is tokenized separately only when its token count is actually needed.
        # NOTE(review): assumes the prompt alone tokenizes to the same prefix as in the joint text.
        tokenized_prompt = tokenizer(prompt, return_tensors='pt', max_length=max_seq_length, truncation=True)
        # mask the prompt part for avoiding loss
        labels[:, :tokenized_prompt.input_ids.shape[1]] = -100
    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def encode_with_messages_format(example, tokenizer, max_seq_length, add_bos=False, mask_non_assistant=False):
    '''
    Tokenize one chat-style example into causal-LM training features.

    Here we assume each example has a 'messages' field. Each message is a dict with 'role' and
    'content' fields. We concatenate all messages with the roles as delimiters and tokenize them
    together.

    Args:
        example: mapping with a non-empty 'messages' list.
        tokenizer: tokenizer callable exposing `eos_token` and `bos_token`.
        max_seq_length: value forwarded to the tokenizer as `max_length` (with `truncation=True`).
        add_bos: if True, prepend `tokenizer.bos_token` to the text before tokenizing.
        mask_non_assistant: if True, overwrite the labels of every non-assistant message (and of
            the '<|assistant|>' role marker that follows it) with -100 to mask them out of the
            loss. Defaults to False: labels are a full copy of input_ids and no prefix is
            re-tokenized.

    Returns:
        dict of 1-D tensors: 'input_ids', 'labels' and an all-ones 'attention_mask'.

    Raises:
        ValueError: if 'messages' is empty or a message has a role other than
            'system', 'user' or 'assistant'.
    '''
    messages = example['messages']
    if len(messages) == 0:
        raise ValueError('messages field is empty.')

    def _concat_messages(messages):
        # Render messages as "<|role|>\ncontent\n"; assistant turns are terminated with EOS.
        message_text = ""
        for message in messages:
            if message["role"] == "system":
                message_text += "<|system|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "user":
                message_text += "<|user|>\n" + message["content"].strip() + "\n"
            elif message["role"] == "assistant":
                message_text += "<|assistant|>\n" + message["content"].strip() + tokenizer.eos_token + "\n"
            else:
                raise ValueError("Invalid role: {}".format(message["role"]))
        return message_text

    def _num_tokens(text):
        # Token count of `text` under the same truncation settings as the full example.
        return tokenizer(text, return_tensors='pt', max_length=max_seq_length, truncation=True).input_ids.shape[1]

    example_text = _concat_messages(messages).strip()
    if add_bos:
        example_text = tokenizer.bos_token + example_text
    tokenized_example = tokenizer(example_text, return_tensors='pt', max_length=max_seq_length, truncation=True)
    input_ids = tokenized_example.input_ids
    labels = input_ids.clone()

    if mask_non_assistant:
        # mask the non-assistant part for avoiding loss
        # NOTE(review): span boundaries come from re-tokenizing prefixes; this assumes prefix
        # tokenization lines up with the tokenization of the full text.
        for message_idx, message in enumerate(messages):
            if message["role"] == "assistant":
                continue
            if message_idx == 0:
                message_start_idx = 0
            else:
                message_start_idx = _num_tokens(_concat_messages(messages[:message_idx]))
            if message_idx < len(messages) - 1 and messages[message_idx+1]["role"] == "assistant":
                # here we also ignore the role of the assistant
                messages_so_far = _concat_messages(messages[:message_idx+1]) + "<|assistant|>\n"
            else:
                messages_so_far = _concat_messages(messages[:message_idx+1])
            message_end_idx = _num_tokens(messages_so_far)
            labels[:, message_start_idx:message_end_idx] = -100

            # Everything beyond this point was truncated away; nothing left to mask.
            if message_end_idx >= max_seq_length:
                break

    attention_mask = torch.ones_like(input_ids)
    return {
        'input_ids': input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': attention_mask.flatten(),
    }

def encode_data(raw_dataset):
    '''
    Tokenize the 'train' split of `raw_dataset` and return its per-example labels.

    The encoder is chosen from the columns of the 'train' split: the prompt/completion encoder
    when both 'prompt' and 'completion' are present, otherwise the messages encoder when
    'messages' is present. Uses the module-level `tokenizer`, max_seq_length=2048, add_bos=False.

    Args:
        raw_dataset: object whose 'train' split exposes `column_names` and which provides `map`
            (e.g. the result of `load_dataset`).

    Returns:
        The 'labels' column of the tokenized 'train' split.

    Raises:
        ValueError: if the split has neither ('prompt', 'completion') nor 'messages' columns.
    '''
    column_names = raw_dataset["train"].column_names
    if "prompt" in column_names and "completion" in column_names:
        encoder = encode_with_prompt_completion_format
    elif "messages" in column_names:
        encoder = encode_with_messages_format
    else:
        # Previously this fell through and failed later with an unbound `encode_function`.
        raise ValueError(
            "Dataset must contain either 'prompt' and 'completion' columns or a 'messages' column; "
            f"found {column_names}."
        )

    encode_function = partial(
        encoder,
        tokenizer=tokenizer,
        max_seq_length= 2048,
        add_bos= False,
    )

    # Tokenize example by example; the original columns are kept alongside the new ones.
    lm_datasets = raw_dataset.map(
        encode_function,
        batched=False,
        desc="Tokenizing and reformatting instruction data",
    )

    return lm_datasets['train']['labels']
    
    
    
def counting_labels(labels_all, special_token=-100):
    '''
    Count the labels that differ from `special_token` across all label sequences.

    Args:
        labels_all: iterable of label sequences (each must support iteration and `len`).
        special_token: label value that marks an unselected position; defaults to -100.

    Returns:
        (selected_count, all_counts): the number of labels not equal to `special_token`, and the
        total number of labels.
    '''
    all_counts = 0
    selected_count = 0
    for labels in labels_all:
        # Compare against the parameter; it used to be ignored in favour of a hard-coded -100.
        selected_count += sum(1 for label in labels if label != special_token)
        all_counts += len(labels)
    return selected_count, all_counts

###############################################

model_name_or_path = "meta-llama/Llama-3.2-3B"
# Load the tokenizer once (it used to be loaded twice); `encode_data` reads it as a
# module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

label_path = 'results/label/'
data_path = "selected_data/"

# Loop-invariant file-name prefix of the original datasets.
data_tag = "filtered-cured-50k-all-iter-global-subset-small-new_"

# Maps file index -> {'intersection_labels': [...], 'union_labels': [...]}.
text_all_files = {}

for idx in range(1,9):

    # original labels
    cur_orig_data = data_tag + str(idx) + '.json'
    raw_dataset = load_dataset("json", data_files= data_path + cur_orig_data)
    # NOTE(review): `orig_labels_all` is not read anywhere below; if no later cell needs it,
    # this tokenization pass can be dropped.
    orig_labels_all = encode_data(raw_dataset)

    # Saved label files of the two selection runs for this file index.
    labels_all_levels = {}

    for train_tag in ['global', 'sample']:

        label_tag=f'token_labels_filtered-cured-50k-all-iter-{train_tag}-subset-small-new_'
        cur_model_label = label_tag + str(idx) + '.pt'
        cur_labels_all = torch.load(label_path + cur_model_label, weights_only=False)

        labels_all_levels[train_tag] = cur_labels_all

    intersection_labels_all = [] ## the same selected labels
    union_labels_all = [] ## the labels selected by global or sample

    for global_labels, sample_labels in zip(labels_all_levels['global'], labels_all_levels['sample']):
        # Start with every position unselected (-100) and fill in the selected ones.
        intersection_labels = [-100] * len(global_labels)
        union_labels = [-100] * len(global_labels)

        for i, (global_label, sample_label) in enumerate(zip(global_labels, sample_labels)):
            # Union: selected by either run; the global label wins when both selected it.
            if global_label != -100 or sample_label != -100:
                selected_label = global_label if global_label != -100 else sample_label
                union_labels[i] = selected_label

            # Intersection: selected by both runs with the same label value.
            if global_label != -100 and sample_label != -100:
                if global_label == sample_label:
                    intersection_labels[i] = sample_label

        intersection_labels_all.append(intersection_labels)
        union_labels_all.append(union_labels)

    text_all_files[idx] = {
        'intersection_labels': intersection_labels_all,
        'union_labels': union_labels_all
    }

    ### compute the proportion of labels
    for key, item in text_all_files[idx].items():

        selected_count, all_counts = counting_labels(item)
        print(f"dataset {idx}-th file:: ### {key} ### label proportion::  {round(selected_count/all_counts * 100, 2)} %")


    

dataset 1-th file:: ### intersection_labels ### label proportion::  26.16 %
dataset 1-th file:: ### union_labels ### label proportion::  33.69 %


Tokenizing and reformatting instruction data:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataset 2-th file:: ### intersection_labels ### label proportion::  26.09 %
dataset 2-th file:: ### union_labels ### label proportion::  33.76 %


Tokenizing and reformatting instruction data:   0%|          | 0/3000 [00:00<?, ? examples/s]

dataset 3-th file:: ### intersection_labels ### label proportion::  26.4 %
dataset 3-th file:: ### union_labels ### label proportion::  33.44 %


Tokenizing and reformatting instruction data:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataset 4-th file:: ### intersection_labels ### label proportion::  26.36 %
dataset 4-th file:: ### union_labels ### label proportion::  33.48 %


Tokenizing and reformatting instruction data:   0%|          | 0/3000 [00:00<?, ? examples/s]

dataset 5-th file:: ### intersection_labels ### label proportion::  26.53 %
dataset 5-th file:: ### union_labels ### label proportion::  33.32 %


Tokenizing and reformatting instruction data:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataset 6-th file:: ### intersection_labels ### label proportion::  26.35 %
dataset 6-th file:: ### union_labels ### label proportion::  33.5 %


Tokenizing and reformatting instruction data:   0%|          | 0/3000 [00:00<?, ? examples/s]

dataset 7-th file:: ### intersection_labels ### label proportion::  26.44 %
dataset 7-th file:: ### union_labels ### label proportion::  33.41 %


Tokenizing and reformatting instruction data:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataset 8-th file:: ### intersection_labels ### label proportion::  26.44 %
dataset 8-th file:: ### union_labels ### label proportion::  33.41 %


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer


# Model identifier passed to `from_pretrained`.
model_name_or_path= "meta-llama/Llama-3.1-8B-Instruct"
# Instantiate the causal LM; the recorded output shows the checkpoint shards being downloaded and loaded.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 4/4 [06:21<00:00, 95.49s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


In [4]:
import os
path_name = "meta-llama/Llama-3.1-8B-Instruct.json"
# Take the final path component and keep everything before the first ".json" in it.
file_name = os.path.basename(path_name)
path = file_name.partition(".json")[0]
path


'Llama-3.1-8B-Instruct'

In [4]:
import torch

# Pass weights_only explicitly, matching the other torch.load calls on label files in this
# notebook, instead of relying on the library default.
# NOTE: weights_only=False unpickles arbitrary Python objects; only load files you produced yourself.
labels = torch.load("results/label/token_labels_filtered-cured-50k-active-split_0.pt", weights_only=False)

  labels = torch.load("results/label/token_labels_filtered-cured-50k-active-split_0.pt")


In [5]:
# Number of entries in the loaded label file.
len(labels)

1000

In [5]:
from datasets import load_dataset

# Load the 'train' split of the `allenai/tulu-v2-sft-mixture` dataset.
dataset = load_dataset("allenai/tulu-v2-sft-mixture")['train']

In [8]:
# Distinct values of the 'dataset' column.
set(dataset['dataset'])

{'code_alpaca',
 'cot',
 'flan_v2',
 'gpt4_alpaca',
 'hard_coded',
 'lima',
 'oasst1',
 'open_orca',
 'science.evidence_inference',
 'science.qasper_truncated_4000',
 'science.scierc_ner',
 'science.scierc_relation',
 'science.scifact_json',
 'science.scitldr_aic',
 'sharegpt',
 'wizardlm'}

In [10]:
# Keep only the rows whose 'dataset' column equals 'code_alpaca'.
# NOTE(review): the variable is named `alpaca_eval_data`, but the filter selects 'code_alpaca'.
alpaca_eval_data = dataset.filter(lambda x: x['dataset'] == 'code_alpaca')

# Print the size of the filtered dataset.
print("Filtered dataset size:", len(alpaca_eval_data))


Filter: 100%|██████████| 326154/326154 [00:07<00:00, 42758.00 examples/s]

Filtered dataset size: 20016



