In [None]:
from pipelines.custom_pipeline import cluster_custom, generate_ablation_sample
from datasets import load_dataset

# Load the Hub dataset (using cache/ as the local cache directory) and keep
# only its "cleaned" split for the rest of the notebook.
ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")["cleaned"]

In [None]:
from datasets import Dataset

# De-duplicate on the "content" column via a pandas round-trip.
# reset_index(drop=True) swaps the gappy index left behind by drop_duplicates
# for a fresh 0..n-1 range, so converting back to a Dataset does not create an
# `__index_level_0__` column.
df = (
    ds.to_pandas()
    .drop_duplicates(subset=["content"])
    .reset_index(drop=True)
)
ds = Dataset.from_pandas(df)

In [None]:
# Draw a reproducible (seed=42) random subset of at most 80,000 rows.
# The size is clamped to the dataset length because select() raises on
# out-of-range indices, which would happen if `ds` holds fewer rows than that.
randomly_selected_ds = ds.shuffle(seed=42).select(range(min(80000, len(ds))))

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Prefer Apple's MPS backend (the device this notebook was written for), but
# fall back to CUDA or the CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    embedding_device = "mps"
elif torch.cuda.is_available():
    embedding_device = "cuda"
else:
    embedding_device = "cpu"

sentence_model = SentenceTransformer(
    "asmud/nomic-embed-indonesian",
    cache_folder="cache/",
    # NOTE(review): trust_remote_code lets code shipped with the model
    # repository run locally — confirm the repository is trusted.
    trust_remote_code=True,
)

# Embed the "content" column of the sampled rows. list(...) hands encode() a
# plain list of strings regardless of what the column accessor returns.
embeddings = sentence_model.encode(
    list(randomly_selected_ds["content"]),
    batch_size=8,
    device=embedding_device,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

In [None]:
from pipelines.custom_pipeline import cluster_custom, generate_ablation_sample
# Cluster the sentence embeddings; the call's three return values are unpacked
# below. NOTE(review): "cluster_probabilties" is misspelled but kept as-is
# because a later cell refers to it by this exact name.
(
    reduced_embedding,
    cluster_labels,
    cluster_probabilties,
) = cluster_custom(embeddings, seed=42, mode="dbcv")

In [None]:
import numpy as np
# Count how many points were assigned to each distinct cluster label.
unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)
# .tolist() turns the NumPy scalars into native Python values, so the printed
# dict reads e.g. {0: 120} instead of {np.int64(0): np.int64(120)} on NumPy >= 2.
cluster_dict = dict(zip(unique_labels.tolist(), label_counts.tolist()))
print(cluster_dict)

In [None]:
from datasets import load_dataset
# Reload the full DatasetDict so the new subsamples are stored next to the
# splits it already contains.
fresh_ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

# Add one "sampled_<n>" split for each n in 1000, 2000, ..., 20000. The loop
# value is passed as the fourth argument — presumably the target sample size;
# verify against generate_ablation_sample.
for sample_size in range(1000, 21000, 1000):
    fresh_ds[f"sampled_{sample_size}"] = generate_ablation_sample(
        randomly_selected_ds,
        cluster_labels,
        cluster_probabilties,
        sample_size,
        seed=42,
        filter_mode="above_mean_std",
        inter_cluster_strategy="direct_proportion",
        intra_cluster_bias="inverse_prob",
    )

In [None]:
# Publish every split currently held in fresh_ds to the Hub repository.
fresh_ds.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_description="Created subsamples for training",
)