# Filtered method

- Dataset: stanford_alpaca
- Our filtered curation method

In [2]:
import torch 
from collections import Counter
import random
from datasets import load_dataset
import numpy as np
import math

# ---- Experiment configuration ------------------------------------------------
seed = 3
dataset_name = 'stanford_alpaca'

# Model whose outputs are loaded; it is part of both the data and report paths.
# Alternatives that were used with this notebook:
#   model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#   model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model_name = "gpt-4o-mini"

dataset_size = 9000    # upper bound on the number of samples selected in the feature-wise step
confidence_prob = 0.3  # minimum confidence required to accept a suggested (cured) label

# ---- Reproducibility ----------------------------------------------------------
random.seed(seed)
np.random.seed(seed)

# ---- Data ---------------------------------------------------------------------
# Directory holding the full dataset, the label file and the exported subset.
root_path = f"../model_finetune/selected_data/{model_name}/{dataset_name}/"

all_train_dataset = load_dataset('json', data_files=root_path + "full_dataset.json")

#################################################################################################################################
# Label curation reports
# Alternative location:
# report_path = f"score_curation/results/{model_name}/{dataset_name}/{dataset_name}_report.pt"
report_path = f"results-embedding-model-bge/{model_name}/{dataset_name}/{dataset_name}_report.pt"

# The report is accessed below through attributes (`.detection`, `.curation`), i.e. it is a
# pickled Python object rather than a plain tensor / state dict.  Such objects need full
# unpickling, so `weights_only=False` is passed explicitly instead of relying on the
# version-dependent default (which also silences the FutureWarning raised by torch.load).
# SECURITY: full unpickling can execute arbitrary code -- only load report files you trust.
reports = torch.load(report_path, weights_only=False)

# Part 1 (label-wise): label curation
# Indices of all samples detected as label errors.
corrupted_samples = [x[0] for x in reports.detection['label_error']]

# Curation entries are (idx, label, confidence); keep those whose confidence reaches the threshold.
accepted_curations = [
    sample for sample in reports.curation['label_curation']
    if sample[2] >= confidence_prob
]
cured_samples = [sample[0] for sample in accepted_curations]
cured_sample_labels = [
    (int(sample[0]), int(sample[1]), round(sample[2], 2))
    for sample in accepted_curations
]

print(f"Cured sample size: {len(cured_sample_labels)}")

# Detected label errors that were NOT cured remain "corrupted".
cured_samples_set = set(cured_samples)
corrupted_samples_total = [x for x in corrupted_samples if x not in cured_samples_set]

print(f"Corrupted samples total: {len(corrupted_samples_total)}")

# Replace the original labels of the cured samples with the suggested labels.
labels = torch.load(root_path + "output_labels_revised.pt")

print(f"Original Counter(labels): {Counter(labels)}")

# Track label transitions: cured samples grouped by the label they carried
# *before* the revision (only previous labels 5, 4, 3 and 2 are tracked).
count_labels_5, count_labels_4, count_labels_3, count_labels_2 = [], [], [], []
transition_buckets = (
    (5, count_labels_5),
    (4, count_labels_4),
    (3, count_labels_3),
    (2, count_labels_2),
)

for cured_entry in cured_sample_labels:  # (idx, suggested label, confidence)
    sample_idx, suggested_label = cured_entry[0], cured_entry[1]
    previous_label = labels[sample_idx]

    for tracked_label, bucket in transition_buckets:
        if previous_label == tracked_label:
            bucket.append(cured_entry)

    # Every accepted suggestion overwrites the stored label, 5-rated samples included.
    labels[sample_idx] = suggested_label

# One revision is applied per cured entry.
count = len(cured_sample_labels)
print(f"counting revised label size: {count}")

print(f"Label size: {len(labels)}")
label_counts = Counter(labels)

print(f"Revised Counter(labels): {label_counts}")

# Samples to drop as low quality: detected label errors that were not cured.
label_wise_filter_out_samples = set(corrupted_samples_total)
print(f"Label-wise filter out samples: {len(label_wise_filter_out_samples)}")

# Optional: persist the revised labels.
# torch.save(labels, root_path + f"output_labels_revised_cured.pt")


Generating train split: 52002 examples [00:00, 291127.96 examples/s]
  reports = torch.load(report_path)


==== Docta: Doctor for your data. Current version: 0.2 ====
Cured sample size: 16513
Corrupted samples total: 1004
Original Counter(labels): Counter({2: 18624, 1: 15973, 0: 8274, 3: 8017, 4: 1092, 5: 22})
counting revised label size: 16513
Label size: 52002
Revised Counter(labels): Counter({2: 19470, 1: 18728, 0: 8635, 3: 4741, 4: 422, 5: 6})
Label-wise filter out samples: 1004


  labels = torch.load(root_path + "output_labels_revised.pt")


In [None]:
import torch
import random
import numpy as np
from datasets import load_dataset
from collections import Counter

# Set random seed for reproducibility (repeated so this cell can be re-run on its own)
seed = 3
random.seed(seed)
np.random.seed(seed)

# Labels are visited in this order; label 0 is never selected.
label_priority = [5, 4, 3, 2, 1]

# Samples flagged as label errors and not cured in the label-wise step.  They were collected
# into `label_wise_filter_out_samples` but never applied; exclude them from every candidate
# pool so they cannot end up in the exported subset.
excluded_indices = np.array(
    sorted(int(idx) for idx in label_wise_filter_out_samples), dtype=np.int64
)

# Part 2 (feature-wise): Process rare samples based on 'rare_example' detection.
# Only the first half of the detector's list is kept
# (presumably it is ordered by the detector's score -- TODO confirm).
rare_examples = reports.detection['rare_example']
rare_samples = rare_examples[:len(rare_examples) // 2]
if len(rare_samples) > 0:
    # Column 0 is matched against sample indices, column 1 is the value used for ranking below.
    rare_samples_filtered = np.array(rare_samples)[:, :2]
else:
    # Keep a well-formed (0, 2) array so the column indexing below does not fail.
    rare_samples_filtered = np.empty((0, 2))
rare_samples_filtered = rare_samples_filtered[
    ~np.isin(rare_samples_filtered[:, 0], excluded_indices)
]

print(f"Size of the remaining samples with high quality: {len(rare_samples_filtered)}")

# Convert labels to a NumPy array for vectorised comparisons
labels = np.array(labels)

# Cache the (non-excluded) sample indices of every label once
label_indices_cache = {}
for label in label_priority:
    candidates = np.where(labels == label)[0]
    label_indices_cache[label] = candidates[~np.isin(candidates, excluded_indices)]
print("Finished caching labels indices...")

# Greedily fill the subset, one label at a time in priority order
filtered_indices = []

for target_label in label_priority:
    available_size = dataset_size - len(filtered_indices)
    if available_size <= 0:
        break

    label_indices = label_indices_cache[target_label]

    if available_size >= len(label_indices):
        # The whole label group fits (including the exact-fit case): take all of it.
        filtered_indices.extend(label_indices.tolist())
    else:
        # Not enough room: keep only rare samples of this label, ranked by column 1 (descending)
        label_samples = rare_samples_filtered[np.isin(rare_samples_filtered[:, 0], label_indices)]
        if len(label_samples) > 0:  # Ensure label_samples is not empty
            sorted_samples = label_samples[label_samples[:, 1].argsort()[::-1]][:available_size]
            filtered_indices.extend(sorted_samples[:, 0].astype(int).tolist())

    print("Size of the filtered dataset:", len(filtered_indices))

if len(filtered_indices) < dataset_size:
    print(f"Warning: only {len(filtered_indices)} of the requested {dataset_size} samples were selected")

# Load the dataset and keep only the selected samples
data = load_dataset('json', data_files=root_path + 'full_dataset.json')

# Select and save filtered samples
output_path = root_path + f"filtered-cured-{confidence_prob}_dataset.json"
filtered_dialogs = data['train'].select(filtered_indices)
filtered_dialogs.to_json(output_path)
print(f"Filtered dataset saved to {output_path}")
