## filtered 1: solely ranking based on long-tail score

In [3]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(42)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name="gpt-4o-mini"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"
dataset_size=10000

## label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)


'''Part 1 (label-wise): label curation'''
### choose the data index that needed to be remove
corrupted_samples = [x[0] for x in reports.detection['label_error']]

##  samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']: ##(idx, label, confidence)
    if sample[2] >= 0: #confidence prob;0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))



print(f"cured sample size: {len(cured_sample_labels)}")


#filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"corrupted_samples_total: {len(corrupted_samples_total)}")


# change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")


print(f"Original Counter(labels): {Counter(labels)}")


for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]
print(f"label size: {len(labels)}")

## select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")



###filter out the low-quality samples

low_quality_label_idx = []
# for idx, label in enumerate(labels):
#     ######################## select labels  ########################
#     if label<4: 
#         low_quality_label_idx.append(idx)
#     # elif label == 3 and random.random() >= 0.5:
#     #     low_quality_label_idx.append(idx)


label_wise_filter_out_samples = set(low_quality_label_idx + corrupted_samples_total)


print(f"label_wise_filter_out_samples: {len(label_wise_filter_out_samples)}")

'''Part-2 (feature-wise): handle the rare example'''

rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example'])//2]
# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in label_wise_filter_out_samples] 
rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples] 

# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in set(label_wise_filter_out_samples)] 


print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

long_tail_scores = np.array(rare_samples_filtered)[:,1]


# 根据第二个值降序排序
sorted_samples = sorted(rare_samples_filtered, key=lambda x: x[1], reverse=True)

# 只保留排序后的 x[0]
remaining_samples_indices = [x[0] for x in sorted_samples][:dataset_size]


remaining_samples_idx = np.array(rare_samples_filtered, dtype=int)[remaining_samples_indices, 0]
remaining_samples_idx_2 = remaining_samples_idx
# long_tail_scores_filtered = long_tail_scores[remaining_samples_idx]
long_tail_scores_filtered = np.array(rare_samples_filtered)[remaining_samples_indices, 1]

print("Size of the filtered dataset:", len(remaining_samples_idx))

'''filter out the corrupted samples and reconstruct the dataset'''


data = load_dataset('json', data_files=root_path + 'full_dataset.json')



# filtered_dialogs = data['train'].select(remaining_samples_idx).shuffle(seed=42)
filtered_dialogs = data['train'].select(remaining_samples_idx)


filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)


filtered_dialogs.to_json(root_path + f"filtered_1_dataset.json")


cured sample size: 187883
corrupted_samples_total: 0
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
label size: 300932
Revised Counter(labels): Counter({3: 169656, 4: 52096, 1: 35698, 2: 25501, 0: 17884, 5: 97})
label_wise_filter_out_samples: 0
Size of the remaining samples with high quality: 300932
Size of the filtered dataset: 10000


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 41.61ba/s]


19175072

### filtered 2: long-tail score * label

In [4]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(42)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name="gpt-4o-mini"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"
dataset_size=10000

## label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)


'''Part 1 (label-wise): label curation'''
### choose the data index that needed to be remove
corrupted_samples = [x[0] for x in reports.detection['label_error']]

##  samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']: ##(idx, label, confidence)
    if sample[2] >= 0: #confidence prob;0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))



print(f"cured sample size: {len(cured_sample_labels)}")


#filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"corrupted_samples_total: {len(corrupted_samples_total)}")


# change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")


print(f"Original Counter(labels): {Counter(labels)}")


for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]
print(f"label size: {len(labels)}")

## select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")



###filter out the low-quality samples

low_quality_label_idx = []
# for idx, label in enumerate(labels):
#     ######################## select labels  ########################
#     if label<4: 
#         low_quality_label_idx.append(idx)
#     # elif label == 3 and random.random() >= 0.5:
#     #     low_quality_label_idx.append(idx)


label_wise_filter_out_samples = set(low_quality_label_idx + corrupted_samples_total)


print(f"label_wise_filter_out_samples: {len(label_wise_filter_out_samples)}")

'''Part-2 (feature-wise): handle the rare example'''

rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example'])//2]
# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in label_wise_filter_out_samples] 


rare_samples_filtered = [[sample[0], sample[1] * labels[sample[0]]] for sample in rare_samples] 




print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

long_tail_scores = np.array(rare_samples_filtered)[:,1]


# 根据第二个值降序排序
sorted_samples = sorted(rare_samples_filtered, key=lambda x: x[1], reverse=True)

# 只保留排序后的 x[0]
remaining_samples_indices = [x[0] for x in sorted_samples][:dataset_size]


remaining_samples_idx = np.array(rare_samples_filtered, dtype=int)[remaining_samples_indices, 0]
remaining_samples_idx_2 = remaining_samples_idx
# long_tail_scores_filtered = long_tail_scores[remaining_samples_idx]
long_tail_scores_filtered = np.array(rare_samples_filtered)[remaining_samples_indices, 1]

print("Size of the filtered dataset:", len(remaining_samples_idx))

'''filter out the corrupted samples and reconstruct the dataset'''


data = load_dataset('json', data_files=root_path + 'full_dataset.json')



# filtered_dialogs = data['train'].select(remaining_samples_idx).shuffle(seed=42)
filtered_dialogs = data['train'].select(remaining_samples_idx)


filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)


filtered_dialogs.to_json(root_path + f"filtered_2_dataset.json")


cured sample size: 187883
corrupted_samples_total: 0
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
label size: 300932
Revised Counter(labels): Counter({3: 169656, 4: 52096, 1: 35698, 2: 25501, 0: 17884, 5: 97})
label_wise_filter_out_samples: 0
Size of the remaining samples with high quality: 300932
Size of the filtered dataset: 10000


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.51ba/s]


25566662

### filtered 3: reduce the bin scale 

In [19]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(42)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name="gpt-4o-mini"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"


## label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)


'''Part 1 (label-wise): label curation'''
### choose the data index that needed to be remove
corrupted_samples = [x[0] for x in reports.detection['label_error']]

##  samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']: ##(idx, label, confidence)
    if sample[2] >= 0: #confidence prob;0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))



print(f"cured sample size: {len(cured_sample_labels)}")


#filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"corrupted_samples_total: {len(corrupted_samples_total)}")


# change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")


print(f"Original Counter(labels): {Counter(labels)}")


for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]
print(f"label size: {len(labels)}")

## select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")



###filter out the low-quality samples

low_quality_label_idx = []
# for idx, label in enumerate(labels):
#     ######################## select labels  ########################
#     if label<4: 
#         low_quality_label_idx.append(idx)
#     # elif label == 3 and random.random() >= 0.5:
#     #     low_quality_label_idx.append(idx)


label_wise_filter_out_samples = set(low_quality_label_idx + corrupted_samples_total)


print(f"label_wise_filter_out_samples: {len(label_wise_filter_out_samples)}")

'''Part-2 (feature-wise): handle the rare example'''

rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example'])//2]
# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in label_wise_filter_out_samples] 
rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples] 

# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in set(label_wise_filter_out_samples)] 


print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

long_tail_scores = np.array(rare_samples_filtered)[:,1]

bins = np.arange(0, max(long_tail_scores)+0.03, 0.03) # 定义区间边界

# 计算每个区间的计数
counts, _ = np.histogram(long_tail_scores, bins)


threshold = 2500 ## the random threshold for feature-wise

remaining_samples_indices = []


########################################################################################################################
# '''Only select the high-rated (5) samples '''
# for i in range(len(bins) - 1):
#     indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
#     # if counts[i] > threshold:
#     #     # indices_in_bin = [idx for idx in indices_in_bin if idx not in label_wise_filter_out_samples] ## only remove the wrong-annotated samples if the sample size is too much, otherwise remain them.

#     #     high_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == 5]

#     #     # if len(high_quality_indices_in_bin) >= threshold:

#     #     #     high_quality_indices_in_bin = random.sample(list(high_quality_indices_in_bin), threshold)

#     #     remaining_samples_indices.extend(high_quality_indices_in_bin)
#     #     #     low_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] >= 3]
#     #     #     print(f"threshold - len(high_quality_indices_in_bin: {threshold - len(high_quality_indices_in_bin)};;; len_low: {len(low_quality_indices_in_bin)}")
#     #     #     low_quality_indices_in_bin = random.sample(list(low_quality_indices_in_bin), threshold - len(high_quality_indices_in_bin))
#     #     #     remaining_samples_indices.extend(high_quality_indices_in_bin + low_quality_indices_in_bin)

#     # else:
#         # 
#     high_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == 5]

#     remaining_samples_indices.extend(high_quality_indices_in_bin)

########################################################################################################################
filter_idx=0


for i in range(len(bins) - 1):
    
    if i <filter_idx: ## remove some too similar samples

        indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]

        high_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] >= 4]

        similar_sample_idxs = random.sample(list(high_quality_indices_in_bin), int(5 * 100 * bins[i]))
        print(f"#### original high-rated sample count {len(high_quality_indices_in_bin)} ;  #### selected  similar samples: {len(similar_sample_idxs)}")
        remaining_samples_indices.extend(similar_sample_idxs)
    else:
        indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
        if counts[i] > threshold:
            # indices_in_bin = [idx for idx in indices_in_bin if idx not in label_wise_filter_out_samples] ## only remove the wrong-annotated samples if the sample size is too much, otherwise remain them.

            high_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == 5]

            if len(high_quality_indices_in_bin) >= threshold:## if the number of high-rated samples is  enough

                high_quality_indices_in_bin = random.sample(list(high_quality_indices_in_bin), threshold)
                remaining_samples_indices.extend(high_quality_indices_in_bin)

            else: ## if the number of high-rated samples is not enough
                low_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == 4]
                print(f"threshold - len(high_quality_indices_in_bin: {threshold - len(high_quality_indices_in_bin)};;; len_low: {len(low_quality_indices_in_bin)}")
                if len(low_quality_indices_in_bin) > threshold - len(high_quality_indices_in_bin):
                    low_quality_indices_in_bin = random.sample(list(low_quality_indices_in_bin), threshold - len(high_quality_indices_in_bin))

                else:
                    low_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] >=4 and labels[rare_samples_filtered[idx][0]] !=5]

                    print(f"### the last remain sample count: {len(low_quality_indices_in_bin)}")

                remaining_samples_indices.extend(high_quality_indices_in_bin + low_quality_indices_in_bin)

        else:
            remaining_samples_indices.extend(indices_in_bin)


########################################################################################################################


# for i in range(len(bins) - 1):
#     indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
#     if counts[i] > threshold:
#         indices_in_bin = [idx for idx in indices_in_bin if idx not in label_wise_filter_out_samples] ## only remove the wrong-annotated samples if the sample size is too much, otherwise remain them.
#         high_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] >= 4]
#         low_quality_indices_in_bin = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] <= 3]
#         if len(high_quality_indices_in_bin) > threshold//2:
#             high_quality_indices_in_bin = random.sample(list(high_quality_indices_in_bin), threshold//2) 
#         low_quality_indices_in_bin = random.sample(list(low_quality_indices_in_bin), threshold //2)

#         remaining_samples_indices.extend(high_quality_indices_in_bin + low_quality_indices_in_bin)

#     else:
#         remaining_samples_indices.extend(indices_in_bin)

########################################################################################################################
# for i in range(len(bins) - 1):
#     indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
#     if counts[i] > threshold:
#         new_indices_in_bin = random.sample(list(indices_in_bin), threshold)
#         remaining_samples_indices.extend(new_indices_in_bin)

#     else:
#         remaining_samples_indices.extend(indices_in_bin)
########################################################################################################################

# label_to_indices = {i: [] for i in range(1, 6)}

# for i in range(len(bins) - 1):
#     indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i + 1]))[0]
#     if counts[i] > threshold:
#         # 将样本索引按标签分类存储到字典中
#         for idx in indices_in_bin:
#             label = labels[rare_samples_filtered[idx][0]]
#             if 1 <= label <= 5:
#                 label_to_indices[label].append(idx)
        
#         # 从每个标签中采样
#         for label in range(3, 6):
#             indices = label_to_indices[label]
#             if len(indices) >= threshold // 3:
#                 sampled_indices = random.sample(indices, threshold // 3)
#             else:
#                 sampled_indices = indices  # 如果样本数不够 threshold // 5，取全部样本
#             remaining_samples_indices.extend(sampled_indices)
            
########################################################################################################################

remaining_samples_idx = np.array(rare_samples_filtered, dtype=int)[remaining_samples_indices, 0]
remaining_samples_idx_2 = remaining_samples_idx
# long_tail_scores_filtered = long_tail_scores[remaining_samples_idx]
long_tail_scores_filtered = np.array(rare_samples_filtered)[remaining_samples_indices, 1]

print("Size of the filtered dataset:", len(remaining_samples_idx))

'''filter out the corrupted samples and reconstruct the dataset'''


data = load_dataset('json', data_files=root_path + 'full_dataset.json')



# filtered_dialogs = data['train'].select(remaining_samples_idx).shuffle(seed=42)
filtered_dialogs = data['train'].select(remaining_samples_idx)


filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)


filtered_dialogs.to_json(root_path + f"filtered_3_dataset.json")


cured sample size: 187883
corrupted_samples_total: 0
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
label size: 300932
Revised Counter(labels): Counter({3: 169656, 4: 52096, 1: 35698, 2: 25501, 0: 17884, 5: 97})
label_wise_filter_out_samples: 0
Size of the remaining samples with high quality: 300932
threshold - len(high_quality_indices_in_bin: 2499;;; len_low: 335
### the last remain sample count: 335
threshold - len(high_quality_indices_in_bin: 2495;;; len_low: 1849
### the last remain sample count: 1849
threshold - len(high_quality_indices_in_bin: 2481;;; len_low: 12247
threshold - len(high_quality_indices_in_bin: 2450;;; len_low: 28625
threshold - len(high_quality_indices_in_bin: 2479;;; len_low: 8704
threshold - len(high_quality_indices_in_bin: 2499;;; len_low: 333
### the last remain sample count: 333
Size of the filtered dataset: 10065


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 39.34ba/s]


26694206

## filtered 4: long-tail score * label form a distribution

In [2]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(42)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name="gpt-4o-mini"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"

dataset_size =10000

## label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)


'''Part 1 (label-wise): label curation'''
### choose the data index that needed to be remove
corrupted_samples = [x[0] for x in reports.detection['label_error']]

##  samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']: ##(idx, label, confidence)
    if sample[2] >= 1: #confidence prob;0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))



print(f"cured sample size: {len(cured_sample_labels)}")


#filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"corrupted_samples_total: {len(corrupted_samples_total)}")


# change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")


print(f"Original Counter(labels): {Counter(labels)}")


for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]
print(f"label size: {len(labels)}")

## select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")



###filter out the low-quality samples

label_wise_filter_out_samples = set(corrupted_samples_total)


print(f"label_wise_filter_out_samples: {len(label_wise_filter_out_samples)}")

'''Part-2 (feature-wise): handle the rare example'''

rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example'])//2]
# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in label_wise_filter_out_samples] 
rare_samples_filtered = [[sample[0], sample[1] * labels[sample[0]]] for sample in rare_samples] 

# rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples if sample[0] not in set(label_wise_filter_out_samples)] 


print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

long_tail_scores = np.array(rare_samples_filtered)[:,1]

bins = np.arange(0, max(long_tail_scores)+0.01, 0.01) # 定义区间边界

# 计算每个区间的计数
counts, _ = np.histogram(long_tail_scores, bins)

##################################################################################################################
#### data proportion #####

count_bins = []
# 定义前五个 bins 的权重和后面 bins 的权重
front_bin_weight = 1
mid_bin_weight= 3
back_bin_weight = 1
remain_data_size = dataset_size - Counter(labels)[5]

# 计算每个 bin 的比例
for i in range(0, len(bins) - 1):
    indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
    count_bins.append(round(len(indices_in_bin) / len(long_tail_scores), 4))

# 调整前五个 bins 和后面 bins 的比例
adjusted_count_bins = []
for i, count_bin in enumerate(count_bins): # [0,18]
    if i < 5:
        adjusted_count_bin = count_bin * front_bin_weight
    elif i > len(bins) -5:
        adjusted_count_bin = count_bin * back_bin_weight
    else:
        adjusted_count_bin = count_bin * mid_bin_weight

    adjusted_count_bins.append(adjusted_count_bin)


# 正规化，使得调整后的比例和为1
total_adjusted = sum(adjusted_count_bins)
adjusted_count_bins = [bin_count / total_adjusted for bin_count in adjusted_count_bins]

import math
# 根据调整后的比例计算 bins_threshold
bins_threshold = [math.ceil(count_bin * dataset_size) for count_bin in adjusted_count_bins]

###############################################################################################

#### selection strategy
remaining_samples_indices = []

for i in range(0, len(bins) - 1):
    indices_in_bin = np.where((long_tail_scores >= bins[i]) & (long_tail_scores < bins[i+1]))[0]
    # 计算当前 bin 的样本阈值
    high_quality_indices_5 = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == 5]


    current_threshold = bins_threshold[i]
    
    # 如果当前 bin 的样本数量已经满足阈值，则不进行操作
    if len(indices_in_bin) <= current_threshold:
        remaining_samples_indices.extend(indices_in_bin)
        continue
    
    # 从高到低优先选择标签值高的样本
    selected_indices = []
    for label in range(5, 2, -1):  # 从标签4到标签1
        high_quality_indices = [idx for idx in indices_in_bin if labels[rare_samples_filtered[idx][0]] == label]
        
        if len(selected_indices) + len(high_quality_indices) <= current_threshold:
            selected_indices.extend(high_quality_indices)
        else:
            needed = current_threshold - len(selected_indices)
            selected_indices.extend(random.sample(high_quality_indices, needed))
            break
    
    remaining_samples_indices.extend(selected_indices)
    print(f"Bin {i} - total size: {len(indices_in_bin)} ---high-rated samples {len(high_quality_indices_5)} --- Total samples selected: {len(high_quality_indices_5 + selected_indices)}")





remaining_samples_idx = np.array(rare_samples_filtered, dtype=int)[remaining_samples_indices, 0]
remaining_samples_idx_2 = remaining_samples_idx
# long_tail_scores_filtered = long_tail_scores[remaining_samples_idx]
long_tail_scores_filtered = np.array(rare_samples_filtered)[remaining_samples_indices, 1]

# 打印剩余的样本及其原始索引
print("Size of the filtered dataset:", len(remaining_samples_idx))

'''filter out the corrupted samples and reconstruct the dataset'''

###the parquet data path

data = load_dataset('json', data_files=root_path + 'full_dataset.json')



filtered_dialogs = data['train'].select(remaining_samples_idx)


filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)


filtered_dialogs.to_json(root_path + f"filtered_4_dataset.json")


  from .autonotebook import tqdm as notebook_tqdm


==== Docta: Doctor for your data. Current version: 0.2 ====
cured sample size: 0
corrupted_samples_total: 187883
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
label size: 300932
Revised Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
label_wise_filter_out_samples: 187883
Size of the remaining samples with high quality: 300932
Bin 0 - total size: 27879 ---high-rated samples 4 --- Total samples selected: 182
Bin 1 - total size: 1032 ---high-rated samples 6 --- Total samples selected: 19
Bin 2 - total size: 1753 ---high-rated samples 1 --- Total samples selected: 23
Bin 3 - total size: 2267 ---high-rated samples 3 --- Total samples selected: 31
Bin 4 - total size: 3149 ---high-rated samples 4 --- Total samples selected: 43
Bin 5 - total size: 4350 ---high-rated samples 7 --- Total samples selected: 165
Bin 6 - total size: 5886 ---high-rated samples 5 --- Total samples selected: 219
Bin 7 - total size:

Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 32.35ba/s]


20425224

## Filtered 5: label-filtered based: all 5 samples  + 4-rated samples select using sorted long-tail score (reverse=True)

In [2]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(3)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"

dataset_size = 10000

# label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)

# Part 1 (label-wise): label curation
# Choose the data index that needs to be removed
corrupted_samples = [x[0] for x in reports.detection['label_error']]

# Samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']:  # (idx, label, confidence)
    if sample[2] >= 1:  # confidence prob; 0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))

print(f"Cured sample size: {len(cured_sample_labels)}")

# Filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"Corrupted samples total: {len(corrupted_samples_total)}")

# Change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")

print(f"Original Counter(labels): {Counter(labels)}")

for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]

print(f"Label size: {len(labels)}")

# Select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")

# Filter out the low-quality samples
label_wise_filter_out_samples = set(corrupted_samples_total)
print(f"Label-wise filter out samples: {len(label_wise_filter_out_samples)}")

# Part 2 (feature-wise): handle the rare example
rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example']) // 2]
rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples]

print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

label_5_indices = [idx for idx in range(len(labels)) if labels[idx] == 5]
label_4_indices = [idx for idx in range(len(labels)) if labels[idx] == 4]

sample_rated_4 = [[sample[0], sample[1]] for sample in rare_samples if sample[0] in set(label_4_indices)]

remain_data_size = dataset_size - len(label_5_indices)

sorted_samples_rated_4 = sorted(sample_rated_4, key=lambda x: x[1], reverse=True)[:remain_data_size]

sorted_samples_rated_4_indices = np.array(sorted_samples_rated_4)[:, 0].astype(int)

remaining_samples_idx = np.concatenate([sorted_samples_rated_4_indices, label_5_indices])

print("Size of the filtered dataset:", len(remaining_samples_idx))

# Filter out the corrupted samples and reconstruct the dataset
data = load_dataset('json', data_files=root_path + 'full_dataset.json')

filtered_dialogs = data['train'].select(remaining_samples_idx.tolist())

filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)

filtered_dialogs.to_json(root_path + f"filtered_5_dataset.json")


==== Docta: Doctor for your data. Current version: 0.2 ====
Cured sample size: 0
Corrupted samples total: 187883
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
Label size: 300932
Revised Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
Label-wise filter out samples: 187883
Size of the remaining samples with high quality: 300932
Size of the filtered dataset: 10000


Generating train split: 300932 examples [00:05, 53968.34 examples/s]
Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

26002307

In [5]:
sorted_samples_rated_4[-1]


[2971, 0.1253]

In [6]:
sorted_samples_rated_4 = sorted(sample_rated_4, key=lambda x: x[1], reverse=True)[:remain_data_size]

sorted_samples_rated_all = sorted(rare_samples_filtered, key=lambda x: x[1], reverse=True)


In [8]:
sorted_samples_rated_all[10000]

[238395, 0.1391]

## Filtered 6: label-filtered based: all 5 samples  + 4-rated samples select using sorted long-tail score (reverse=False)

In [1]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(42)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"

dataset_size = 10000

# label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)

# Part 1 (label-wise): label curation
# Choose the data index that needs to be removed
corrupted_samples = [x[0] for x in reports.detection['label_error']]

# Samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']:  # (idx, label, confidence)
    if sample[2] >= 1:  # confidence prob; 0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))

print(f"Cured sample size: {len(cured_sample_labels)}")

# Filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"Corrupted samples total: {len(corrupted_samples_total)}")

# Change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")

print(f"Original Counter(labels): {Counter(labels)}")

for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]

print(f"Label size: {len(labels)}")

# Select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")

# Filter out the low-quality samples
label_wise_filter_out_samples = set(corrupted_samples_total)
print(f"Label-wise filter out samples: {len(label_wise_filter_out_samples)}")

# Part 2 (feature-wise): handle the rare example
rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example']) // 2]
rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples]

print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

label_5_indices = [idx for idx in range(len(labels)) if labels[idx] == 5]
label_4_indices = [idx for idx in range(len(labels)) if labels[idx] == 4]

sample_rated_4 = [[sample[0], sample[1]] for sample in rare_samples if sample[0] in set(label_4_indices)]

remain_data_size = dataset_size - len(label_5_indices)

sorted_samples_rated_4 = sorted(sample_rated_4, key=lambda x: x[1], reverse=False)[:remain_data_size]

sorted_samples_rated_4_indices = np.array(sorted_samples_rated_4)[:, 0].astype(int)

remaining_samples_idx = np.concatenate([sorted_samples_rated_4_indices, label_5_indices])

print("Size of the filtered dataset:", len(remaining_samples_idx))

# Filter out the corrupted samples and reconstruct the dataset
data = load_dataset('json', data_files=root_path + 'full_dataset.json')

filtered_dialogs = data['train'].select(remaining_samples_idx.tolist())

filtered_labels = np.array(labels)[remaining_samples_idx].tolist()

assert len(filtered_dialogs) == len(filtered_labels)

filtered_dialogs.to_json(root_path + f"filtered_6_dataset.json")


  from .autonotebook import tqdm as notebook_tqdm


==== Docta: Doctor for your data. Current version: 0.2 ====
Cured sample size: 0
Corrupted samples total: 187883
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
Label size: 300932
Revised Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
Label-wise filter out samples: 187883
Size of the remaining samples with high quality: 300932
Size of the filtered dataset: 10000


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 35.59ba/s]


23231536

## Filter 7: label-filtered + different random seed 1

In [3]:
import torch 
from collections import Counter
import random
from datasets import load_dataset
random.seed(1)

dataset_name='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"
# model_name="gpt-4o-mini"

dataset_size =10000

all_train_dataset = load_dataset('json', data_files =f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/{dataset_name}_data.jsonl")



label_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/output_labels_revised.pt"
labels = torch.load(label_path)


label_counts = Counter(labels)

print(f"Counter(labels): {label_counts}")


# 获取所有标签为 5 的索引
index_5 = [i for i, label in enumerate(labels) if label == 5]

# 如果已经有10000个索引则直接返回
if len(index_5) >= dataset_size:
    selected_indices = index_5[:dataset_size]
else:

    # 获取所有标签为 4 的索引
    index_4 = [i for i, label in enumerate(labels) if label == 4]

    random_indices_4 = random.sample(index_4,  dataset_size - len(index_5))

    label_filtered_indices = index_5 + random_indices_4



label_filtered_dataset = all_train_dataset['train'].select(label_filtered_indices)

label_filtered_labels = np.array(labels)[label_filtered_indices].tolist()

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

label_filtered_dataset.to_json(root_path + f"filtered_7_dataset.json")


Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 34.53ba/s]


25784325

## Filter 8: label-filtered + different random seed 2

In [5]:
import torch 
from collections import Counter
import random
from datasets import load_dataset
random.seed(2)

dataset_name='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"
# model_name="gpt-4o-mini"

dataset_size =10000

all_train_dataset = load_dataset('json', data_files =f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/{dataset_name}_data.jsonl")



label_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/output_labels_revised.pt"
labels = torch.load(label_path)


label_counts = Counter(labels)

print(f"Counter(labels): {label_counts}")


# 获取所有标签为 5 的索引
index_5 = [i for i, label in enumerate(labels) if label == 5]

# 如果已经有10000个索引则直接返回
if len(index_5) >= dataset_size:
    selected_indices = index_5[:dataset_size]
else:

    # 获取所有标签为 4 的索引
    index_4 = [i for i, label in enumerate(labels) if label == 4]

    random_indices_4 = random.sample(index_4,  dataset_size - len(index_5))

    label_filtered_indices = index_5 + random_indices_4



label_filtered_dataset = all_train_dataset['train'].select(label_filtered_indices)

label_filtered_labels = np.array(labels)[label_filtered_indices].tolist()

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

label_filtered_dataset.to_json(root_path + f"filtered_8_dataset.json")


Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 36.37ba/s]


25557575

## Filter 9: label-filtered + different random seed 3

In [6]:
import torch 
from collections import Counter
import random
from datasets import load_dataset
random.seed(3)

dataset_name='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"
# model_name="gpt-4o-mini"

dataset_size =10000

all_train_dataset = load_dataset('json', data_files =f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/{dataset_name}_data.jsonl")



label_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/output_labels_revised.pt"
labels = torch.load(label_path)


label_counts = Counter(labels)

print(f"Counter(labels): {label_counts}")


# 获取所有标签为 5 的索引
index_5 = [i for i, label in enumerate(labels) if label == 5]

# 如果已经有10000个索引则直接返回
if len(index_5) >= dataset_size:
    selected_indices = index_5[:dataset_size]
else:

    # 获取所有标签为 4 的索引
    index_4 = [i for i, label in enumerate(labels) if label == 4]

    random_indices_4 = random.sample(index_4,  dataset_size - len(index_5))

    label_filtered_indices = index_5 + random_indices_4



label_filtered_dataset = all_train_dataset['train'].select(label_filtered_indices)

label_filtered_labels = np.array(labels)[label_filtered_indices].tolist()

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

label_filtered_dataset.to_json(root_path + f"filtered_9_dataset.json")


Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})


Creating json from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 37.79ba/s]


25964083

## Random baseline + different random seed 1

In [7]:
from datasets import load_dataset
import random
import numpy as np

random.seed(1)


#### 
dataset_size = 10000
dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"


json_dir = '/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/'

all_train_dataset = load_dataset('json', data_files=json_dir+'all_train_data.jsonl')['train']




random_indices = np.random.permutation(len(all_train_dataset))[:dataset_size]

random_dataset = all_train_dataset.select(random_indices)

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

all_train_dataset.to_json(root_path + f"full_dataset.json")

random_dataset.to_json(root_path + f"random_1_dataset.json")


Creating json from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 301/301 [00:04<00:00, 61.62ba/s]


17362444

## Random baseline + different random seed 2

In [1]:
from datasets import load_dataset
import random
import numpy as np

random.seed(2)


#### 
dataset_size = 10000
dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"


json_dir = '/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/'

all_train_dataset = load_dataset('json', data_files=json_dir+'all_train_data.jsonl')['train']




random_indices = np.random.permutation(len(all_train_dataset))[:dataset_size]

random_dataset = all_train_dataset.select(random_indices)

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

all_train_dataset.to_json(root_path + f"full_dataset.json")

random_dataset.to_json(root_path + f"random_2_dataset.json")


  from .autonotebook import tqdm as notebook_tqdm
Creating json from Arrow format: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

17215744

In [2]:
from datasets import load_dataset
import random
import numpy as np

random.seed(5)


#### 
dataset_size = 10000
dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"


json_dir = '/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/data/train_data/'

all_train_dataset = load_dataset('json', data_files=json_dir+'all_train_data.jsonl')['train']




random_indices = np.random.permutation(len(all_train_dataset))[:dataset_size]

random_dataset = all_train_dataset.select(random_indices)

root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

all_train_dataset.to_json(root_path + f"full_dataset.json")

random_dataset.to_json(root_path + f"random_5_dataset.json")


Creating json from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 301/301 [00:03<00:00, 75.79ba/s]


17538885

In [None]:
import torch 
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

random.seed(3)

dataset_name ='all_train'
model_name="meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name="gpt-4o-mini"
# model_name= "mistralai/Mistral-7B-Instruct-v0.3"

dataset_size = 5000

# label curation reports
report_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/labeling/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
reports = torch.load(report_path)

# Part 1 (label-wise): label curation
# Choose the data index that needs to be removed
corrupted_samples = [x[0] for x in reports.detection['label_error']]

# Samples that can be cured
cured_samples = []
cured_sample_labels = []
for sample in reports.curation['label_curation']:  # (idx, label, confidence)
    if sample[2] >= 1:  # confidence prob; 0.75
        cured_samples.append(sample[0])
        cured_sample_labels.append((sample[0], sample[1]))

print(f"Cured sample size: {len(cured_sample_labels)}")

# Filter out some cured samples from corrupted instances
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"Corrupted samples total: {len(corrupted_samples_total)}")

# Change the original labels to the suggested label
root_path = f"/home/azureuser/cloudfiles/code/Users/jinlong.pang/LADR_LLM_alignment_data_refinement/open-instruct/model_finetune_cluster/new_train_data/{model_name}/{dataset_name}/"

labels = torch.load(root_path + "output_labels_revised.pt")

print(f"Original Counter(labels): {Counter(labels)}")

for sample_label in cured_sample_labels:
    labels[sample_label[0]] = sample_label[1]

print(f"Label size: {len(labels)}")

# Select high-quality samples based on the quality labels
print(f"Revised Counter(labels): {Counter(labels)}")

# Filter out the low-quality samples
label_wise_filter_out_samples = set(corrupted_samples_total)
print(f"Label-wise filter out samples: {len(label_wise_filter_out_samples)}")

# Part 2 (feature-wise): handle the rare example
rare_samples = reports.detection['rare_example'][:len(reports.detection['rare_example']) // 2]
rare_samples_filtered = [[sample[0], sample[1]] for sample in rare_samples]

print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")


filtered_indices = []

for target_label in [5, 4, 3, 2, 1]:
    if len(filtered_indices) >= dataset_size:
        break

    label_indices = [idx for idx in range(len(labels)) if labels[idx] == target_label]

    if dataset_size - len(filtered_indices) > len(label_indices):
        filtered_indices.extend(label_indices)
    else:
        rated_samples = [[sample[0], sample[1]] for sample in rare_samples if sample[0] in set(label_indices)]

        remain_data_size = dataset_size - len(filtered_indices)

        sorted_samples = sorted(rated_samples, key=lambda x: x[1], reverse=True)[:remain_data_size]

        sorted_samples_indices = np.array(sorted_samples)[:, 0].astype(int)

        filtered_indices.extend(sorted_samples_indices)

    print("Size of the filtered dataset:", len(filtered_indices))

# Filter out the corrupted samples and reconstruct the dataset
data = load_dataset('json', data_files=root_path + 'full_dataset.json')


# Filter out the corrupted samples and reconstruct the dataset
data = load_dataset('json', data_files=root_path + 'full_dataset.json')

filtered_dialogs = data['train'].select(filtered_indices.tolist())

filtered_labels = np.array(labels)[filtered_indices].tolist()

assert len(filtered_dialogs) == len(filtered_labels)

filtered_dialogs.to_json(root_path + f"filtered-{dataset_size//1000}k_dataset.json")
