### Label curation 

- First step: initialization

In [8]:
import torch 
from collections import Counter
import random
from datasets import load_dataset
import numpy as np

# --- Reproducibility --------------------------------------------------------
seed = 3
random.seed(seed)
np.random.seed(seed)

# --- Experiment configuration -----------------------------------------------
dataset_name = 'all_train'

# Rating model whose scores are curated. The trailing note on each option is
# presumably the `confidence_prob` value to use with that model -- keep the
# two settings in sync when switching models.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # confidence_prob = 0.4
# model_name = "gpt-4o-mini"  # confidence_prob = 0.5
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"  # confidence_prob = 0.5

confidence_prob = 0.4  # minimum confidence for accepting a suggested label
dataset_size = 40000   # target number of samples in the selected subset


# Alternative data location:
# all_train_dataset = load_dataset('json', data_files=f"./data/train_data/{dataset_name}_data.jsonl")
all_train_dataset = load_dataset('json', data_files=f"../{dataset_name}_dataset.json")

# --- Label curation report --------------------------------------------------
# Alternative report location:
# report_path = f"score_curation/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
report_path = f"./results-embedding-model-bge/{model_name}/{dataset_name}/{dataset_name}_report.pt"

# `reports` is read through its `.detection` / `.curation` attributes, so the
# file holds a pickled Python object rather than plain tensors. Newer PyTorch
# releases default to `weights_only=True`, which is expected to reject such
# objects, so the full-unpickling mode is requested explicitly.
# SECURITY: full unpickling can execute arbitrary code -- only load report
# files that were produced by a trusted run.
reports = torch.load(report_path, weights_only=False)

# Part 1 (label-wise): label curation
# Sample indices that the report flags as label errors.
corrupted_samples = [entry[0] for entry in reports.detection['label_error']]

# Keep the curation suggestions whose confidence reaches `confidence_prob`.
# Each entry is indexed as (idx, label, confidence).
cured_sample_labels = [
    (entry[0], entry[1])
    for entry in reports.curation['label_curation']
    if entry[2] >= confidence_prob
]
cured_samples = [pair[0] for pair in cured_sample_labels]

print(f"Cured sample size: {len(cured_sample_labels)}")

# Flagged samples that did not receive a confident suggestion stay corrupted;
# the cured ones are removed from the list.
cured_samples_set = set(cured_samples)
corrupted_samples_total = list(
    filter(lambda idx: idx not in cured_samples_set, corrupted_samples)
)

print(f"Corrupted samples total: {len(corrupted_samples_total)}")

# Change the original labels to the suggested label
root_path = f"../model_finetune/selected_data/{model_name}/{dataset_name}/"

# `weights_only` is passed explicitly so the load behaves the same regardless
# of the installed PyTorch version's default.
# SECURITY: full unpickling can execute arbitrary code -- only load label
# files produced by a trusted run.
# NOTE(review): if this file holds only plain Python ints, `weights_only=True`
# would be the safer setting -- confirm the file contents before switching.
labels = torch.load(root_path + "output_labels_revised.pt", weights_only=False)

print(f"Original Counter(labels): {Counter(labels)}")

# For samples whose label before revision was 5/4/3/2, record the suggested
# label they are moved to. Samples originally labeled 0 or 1 are revised too
# but are not tracked in these lists.
count_labels_5 = []
count_labels_4 = []
count_labels_3 = []
count_labels_2 = []
suggestions_by_original_label = {
    5: count_labels_5,
    4: count_labels_4,
    3: count_labels_3,
    2: count_labels_2,
}

count = 0
for sample_idx, suggested_label in cured_sample_labels:
    # Look up the bucket BEFORE overwriting, so it reflects the original label.
    bucket = suggestions_by_original_label.get(labels[sample_idx])
    if bucket is not None:
        bucket.append(suggested_label)

    labels[sample_idx] = suggested_label
    count += 1

print(f"counting revised label size: {count}")

print(f"Label size: {len(labels)}")
label_counts = Counter(labels)

print(f"Revised Counter(labels): {label_counts}")

# Filter out the low-quality samples: flagged as label errors and not cured.
label_wise_filter_out_samples = set(corrupted_samples_total)
print(f"Label-wise filter out samples: {len(label_wise_filter_out_samples)}")

# Uncomment to persist the curated labels:
# torch.save(labels, root_path + f"output_labels_revised_cured.pt")



  reports = torch.load(report_path)


Cured sample size: 32123
Corrupted samples total: 158013
Original Counter(labels): Counter({3: 116114, 4: 57669, 2: 48254, 1: 47402, 0: 27386, 5: 4107})
counting revised label size: 32123
Label size: 300932
Revised Counter(labels): Counter({3: 139351, 4: 55491, 1: 42337, 2: 38460, 0: 22580, 5: 2713})
Label-wise filter out samples: 158013


  labels = torch.load(root_path + "output_labels_revised.pt")


In [93]:
# Distribution of suggested labels for cured samples whose label before revision was 5.
Counter(count_labels_5)

Counter({3: 8, 4: 57})

In [94]:
# Distribution of suggested labels for cured samples whose label before revision was 4.
Counter(count_labels_4)

Counter({3: 2538, 2: 84, 1: 8})

In [95]:
# Distribution of suggested labels for cured samples whose label before revision was 3.
Counter(count_labels_3)

Counter({4: 1416, 1: 28, 2: 402})

In [96]:
# Distribution of suggested labels for cured samples whose label before revision was 2.
Counter(count_labels_2)

Counter({4: 269, 3: 2952, 1: 32})

## Filtered method: Our data selection method
- Filtered 5 (label-filter based): all 5-rated samples + 4-rated samples selected by long-tail score, sorted in descending order (`reverse=True`)

In [97]:
import torch
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

# Re-seed both RNGs so this cell is reproducible on its own.
seed = 3
random.seed(seed)
np.random.seed(seed)

# Part 2 (feature-wise): Process rare samples
# Keep the first half of the report's 'rare_example' list.
rare_examples_all = reports.detection['rare_example']
half_count = len(rare_examples_all) // 2
rare_samples = rare_examples_all[:half_count]

# Only the first two columns (index and score) are needed downstream.
rare_samples_filtered = np.array(rare_samples)[:, :2]

print(f"Size of the remaining samples with high quality: {rare_samples_filtered.shape[0]}")

# NumPy array form of `labels` allows vectorised comparison against a rating.
labels = np.array(labels)

# Pre-compute, for each rating 5..1, the positions in `labels` holding it.
label_indices_cache = {
    rating: (labels == rating).nonzero()[0]
    for rating in range(5, 0, -1)
}
print("Finished caching labels indices...")

# Indices selected for the final subset, filled from the highest rating down.
filtered_indices = []

# Walk ratings from best to worst until `dataset_size` indices are collected.
for target_label in [5, 4, 3, 2, 1]:
    available_size = dataset_size - len(filtered_indices)
    if available_size <= 0:
        break

    # Retrieve indices for the current label from the cache
    label_indices = label_indices_cache[target_label]

    if len(label_indices) <= available_size:
        # The whole rating group fits (including an exact fit), so take all
        # of it. The previous strict comparison sent an exactly-fitting group
        # to the branch below, which could drop some of its samples.
        filtered_indices.extend(label_indices.tolist())
    else:
        # Not everything fits: restrict to the retained rare samples that
        # carry this label and keep the highest-scoring ones.
        has_label = np.isin(rare_samples_filtered[:, 0], label_indices)
        label_samples = rare_samples_filtered[has_label]
        if len(label_samples) > 0:  # nothing to add if no rare sample has this label
            # Sort by score (column 1) in descending order, then truncate.
            descending_order = label_samples[:, 1].argsort()[::-1]
            sorted_samples = label_samples[descending_order][:available_size]
            # Column 0 holds the sample index; cast back to int for selection.
            filtered_indices.extend(sorted_samples[:, 0].astype(int).tolist())

    print("Size of the filtered dataset:", len(filtered_indices))

# Load the full dataset from `root_path`.
full_dataset_file = root_path + 'full_dataset.json'
data = load_dataset('json', data_files=full_dataset_file)

# Keep only the selected rows of the 'train' split and export them as JSON.
# The file name encodes the confidence threshold and the subset size in thousands.
filtered_dialogs = data['train'].select(filtered_indices)
output_file = root_path + f"filtered-cured-{confidence_prob}-{dataset_size // 1000}k_dataset.json"
filtered_dialogs.to_json(output_file)


Size of the remaining samples with high quality: 300932


Finished caching labels indices...
Size of the filtered dataset: 422
Size of the filtered dataset: 2500


Creating json from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 31.53ba/s]


6225106