train the pipeline, with a larger amount

mix in the training data with the labeled training data

then we can retrieve more samples of a certain type

in this case, we only care about relevant samples

In [1]:
from datasets import load_dataset, ClassLabel
class_labels = ClassLabel(3, ["Negative", "Neutral", "Positive"])
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

In [4]:
import re, unicodedata, jaconv, emoji

_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_WS       = re.compile(r'\s+')
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

# --- (MODIFIED) ---
# Catches "word" + "dari" + "domain.com" -> replaces with "word"
# Changed \w+ to \S+ to include punctuation like '!'
_DARI_URL_ATTACHED = re.compile(r'(\S+)dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Catches " dari " + "domain.com" -> replaces with empty string
_DARI_URL_SPACED = re.compile(r'\s+dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# --- (NEW) ---
# Catches any word ending in "dari" (e.g., "anarko!dari", "negaradari")
_DARI_STUCK = re.compile(r'(\S+)dari\b', re.I)

def cleantext(row: str):
    text = row["content"] #type: ignore
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")
    text = text.replace('\\n', ' ').replace('\\r', ' ')

    # Handle standard URLs first
    text = _URL.sub(' <url> ', text)
    text = text.replace('ini tidak tersedia', ' ')

    text = _MENTION.sub('@USER', text)
    text = re.sub(r'^rt\s+', '', text, flags=re.I)
    text = re.sub(r'(\b\d{4})(?=[a-zA-Z])', r'\1 ', text)
    text = _KUTI_CUT.sub('', text)

    # text = _DARI_URL_ATTACHED.sub(r'\1', text)
    # text = _DARI_URL_SPACED.sub('', text)
    # text = _DARI_STUCK.sub(r'\1', text)

    text = _WS.sub(' ', text).strip()
    row["content"] = text #type: ignore
    return row

In [2]:
from datasets import Dataset, concatenate_datasets
source_ds = dataset["source_labeled"]
source_ds = source_ds.map(cleantext, num_proc=16)

relevant_ds = source_ds.filter(lambda ex: ex["relevant"] == True)
relevant_df = relevant_ds.to_pandas()
relevant_df = relevant_df.drop_duplicates(subset="content", keep="first").reset_index(drop=True)
relevant_ds = Dataset.from_pandas(relevant_df)

train_ds = dataset["train_sentiment"]
test_ds = dataset["test_sentiment"]

train_ids = set(train_ds["content"])
test_ids = set(test_ds["content"])

filtered = relevant_ds.filter(
    lambda ex: ex["content"] not in train_ids and ex["content"] not in test_ids
)

dataset_list = []

dataset_list.append(train_ds)
unlabeled_ds = filtered.train_test_split(train_size=50000, seed=42)["train"]
def reset_sentiment(row):
    row["sentiment"] = -1
    return row
unlabeled_ds = unlabeled_ds.map(reset_sentiment)
dataset_list.append(unlabeled_ds)

concat_ds = concatenate_datasets(dataset_list)

NameError: name 'cleantext' is not defined

In [3]:
import re
# We are NOT importing replace_word_elongation anymore
from indoNLP.preprocessing import emoji_to_words

def clean_tweet_for_nusabert(row):
    text = row['content']

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove mentions and RT
    # This regex is safe and does not affect 'ruu tni'
    text = re.sub(r'rt @\S+|@\S+', '', text)

    # 4. Remove hashtags (keep the word)
    text = re.sub(r'#(\S+)', r'\1', text)

    # 5. Convert emojis to words (Preserves sentiment)
    text = emoji_to_words(text)

    # 6. Normalize word elongation (CUSTOM, SAFER REGEX)
    # This replaces 3 or more repeated chars (e.g., 'bangeeet' -> 'banget')
    # It will NOT affect 'uu' or 'ruu', fixing your bug.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # 7. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    row["content"] = text
    return row

In [22]:
concat_ds = concat_ds.map(clean_tweet_for_nusabert, num_proc=16)

NameError: name 'concat_ds' is not defined

In [6]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer("LazarusNLP/all-nusabert-large-v4",
                                           cache_folder="/data/cache/",
                                           device="cuda",
                                           )
embeddings = sentence_transformer.encode(concat_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, device="cuda", batch_size=128)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [7]:
from dream_pipeline import DreamCluster
clusterer = DreamCluster("stability")
clusterer.fit(embeddings)

Finding intrinsic dimension with TwoNN
Found intrinsic dimension: 20
Running first stage reduction
Running second stage reduction
Tuning HDBSCAN with mode stability
  -> Using 'stability' mode (Bayesian Optimization on relative_validity)
Transforming full dataset with trained reducers...
Fitting final clusterer on full reduced dataset...


(np.int64(146), np.int64(32))

In [8]:
cluster_labels, cluster_probabilities, reduced_embeddings = clusterer.predict(embeddings=embeddings)

In [9]:
import numpy as np
unique_labels, unique_label_counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, unique_label_counts):
    indices = np.where(cluster_labels == label)[0]
    cluster_ds = concat_ds.select(indices)
    cluster_classes, cluster_class_counts = np.unique(cluster_ds["sentiment"], return_counts=True)
    print(f"CLUSTER {label}: {dict(zip(cluster_classes, cluster_class_counts))}")

CLUSTER -1: {np.int64(-1): np.int64(7318), np.int64(0): np.int64(3598), np.int64(1): np.int64(538), np.int64(2): np.int64(210)}
CLUSTER 0: {np.int64(-1): np.int64(260), np.int64(1): np.int64(9), np.int64(2): np.int64(173)}
CLUSTER 1: {np.int64(-1): np.int64(385), np.int64(0): np.int64(1), np.int64(1): np.int64(3), np.int64(2): np.int64(222)}
CLUSTER 2: {np.int64(-1): np.int64(451), np.int64(0): np.int64(244), np.int64(1): np.int64(5), np.int64(2): np.int64(2)}
CLUSTER 3: {np.int64(-1): np.int64(195), np.int64(1): np.int64(1), np.int64(2): np.int64(109)}
CLUSTER 4: {np.int64(-1): np.int64(142), np.int64(0): np.int64(83), np.int64(1): np.int64(11)}
CLUSTER 5: {np.int64(-1): np.int64(1677), np.int64(0): np.int64(874), np.int64(1): np.int64(53), np.int64(2): np.int64(72)}
CLUSTER 6: {np.int64(-1): np.int64(370), np.int64(0): np.int64(154), np.int64(1): np.int64(91), np.int64(2): np.int64(1)}
CLUSTER 7: {np.int64(-1): np.int64(89), np.int64(0): np.int64(25), np.int64(1): np.int64(23), np.in

In [10]:
import json
from datetime import datetime
from collections import Counter, defaultdict


def build_labeling_json(concat_ds, cluster_labels, cluster_probabilities, output_path):
    """
    Build labeling JSON for HTML labeling suite.

    Parameters:
    -----------
    concat_ds : HuggingFace Dataset
        Must contain columns: tweet_id, time, author, content, sentiment
    cluster_labels : Sequence[int]
        Cluster assignment per sample
    cluster_probabilities : Sequence[float]
        Cluster probability per sample
    output_path : str
        Where to save the output JSON file
    """

    # Convert to Python lists for safety
    cluster_labels = list(cluster_labels)
    cluster_probabilities = list(cluster_probabilities)

    # Basic checks
    assert len(concat_ds) == len(cluster_labels)
    assert len(concat_ds) == len(cluster_probabilities)

    # Organize samples by cluster
    clusters = defaultdict(list)
    for idx, row in enumerate(concat_ds):
        cid = int(cluster_labels[idx])
        prob = float(cluster_probabilities[idx])

        original_label = int(row["sentiment"])
        is_unlabeled = original_label == -1

        sample = {
            "tweet_id": row["tweet_id"],
            "content": row["content"],
            "author": row.get("author", None),
            "time": row.get("time", None),

            "cluster_probability": prob,
            "original_label": original_label,
            "current_label": original_label,
            "is_unlabeled": is_unlabeled,
            "needs_check": False  # Will fill below
        }

        clusters[cid].append(sample)

    # Build cluster stats + mark minority labels
    cluster_blocks = []
    for cid, samples in clusters.items():
        # Collect stats
        label_counts = Counter(s["original_label"] for s in samples if s["original_label"] != -1)
        size = len(samples)

        # Determine "dominant" class for minority detection
        if len(label_counts) > 0:
            dominant_label, dominant_count = label_counts.most_common(1)[0]
        else:
            dominant_label, dominant_count = None, 0  # All unlabeled cluster

        # Mark minority samples
        for s in samples:
            orig = s["original_label"]
            if orig != -1 and orig != dominant_label:
                s["needs_check"] = True

        cluster_blocks.append({
            "cluster_id": cid,
            "stats": {
                "size": size,
                "labels": {str(k): v for k, v in label_counts.items()}
            },
            "samples": samples
        })

    # Compose final JSON
    output_json = {
        "metadata": {
            "version": 1,
            "generated_at": datetime.utcnow().isoformat(),
            "num_clusters": len(cluster_blocks),
            "num_samples": len(concat_ds),

            "label_schema": {
                "-1": "Unlabeled",
                "0": "Negative",
                "1": "Neutral",
                "2": "Positive"
            }
        },
        "clusters": cluster_blocks
    }

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, indent=2, ensure_ascii=False)

    print(f"[OK] Labeling JSON exported to: {output_path}")


In [11]:

build_labeling_json(
    concat_ds=concat_ds,
    cluster_labels=cluster_labels,
    cluster_probabilities=cluster_probabilities,
    output_path="out/sentiment_train_data_unlabeled.json"
)

[OK] Labeling JSON exported to: labeling_data.json


In [4]:
import json

with open("out/sentiment_train_data_labeled.json", "r", encoding="utf-8") as f:
    data = json.load(f)

all_samples = []

for cluster in data["clusters"]:
    samples = cluster.get("samples", [])
    all_samples.extend(samples)

print(f"Total samples collected: {len(all_samples)}")

Total samples collected: 80000


In [10]:
from datasets import Dataset, load_dataset
original_dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
sentiment_train_ds = original_dataset["train_sentiment"]
source_labeled = original_dataset["source_labeled"]
parsed_ds = Dataset.from_list(all_samples)
parsed_ds = parsed_ds.filter(lambda row: row["original_label"] != row["current_label"])

Filter:   0%|          | 0/80000 [00:00<?, ? examples/s]

In [16]:
from datasets import concatenate_datasets

# 1. Create a mapping of ID -> New Label from your corrected data
# We assume 'parsed_ds' is already filtered for changes as per your code
update_map = {row['tweet_id']: row['current_label'] for row in parsed_ds}
correction_ids = set(update_map.keys())

# 2. distinct sets for logic
existing_train_ids = set(sentiment_train_ds['tweet_id'])
ids_to_update = existing_train_ids.intersection(correction_ids)
ids_to_add = correction_ids.difference(existing_train_ids)

# 3. Update the existing rows in sentiment_train_ds
def update_existing_sentiment(row):
    if row['tweet_id'] in ids_to_update:
        row['sentiment'] = update_map[row['tweet_id']]
    return row

updated_train_ds = sentiment_train_ds.map(update_existing_sentiment)

# 4. specific handling for NEW rows (referencing source_labeled)
if len(ids_to_add) > 0:
    # Filter source_labeled to get the raw data for the new IDs
    source_labeled = original_dataset["source_labeled"]
    new_rows_ds = source_labeled.filter(lambda row: row['tweet_id'] in ids_to_add)

    # Apply the new labels and ensure relevance is set to True
    def prepare_new_rows(row):
        row['sentiment'] = update_map[row['tweet_id']]
        row['relevant'] = True
        return row

    new_rows_ds = new_rows_ds.map(prepare_new_rows)

    # 5. Ensure column consistency before merging
    # We select only columns present in the training set to avoid schema conflicts
    train_columns = updated_train_ds.column_names
    new_rows_ds = new_rows_ds.select_columns(train_columns)

    # 6. Concatenate
    final_train_ds = concatenate_datasets([updated_train_ds, new_rows_ds])
else:
    final_train_ds = updated_train_ds

print(f"Original Size: {len(sentiment_train_ds)}")
print(f"New Size: {len(final_train_ds)}")

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/247820 [00:00<?, ? examples/s]

Map:   0%|          | 0/1612 [00:00<?, ? examples/s]

Original Size: 30000
New Size: 31612


In [19]:
final_train_ds

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
    num_rows: 31612
})

In [20]:
import numpy as np
sentiment_column = final_train_ds["sentiment"]
original_dataset["train_sentiment"] = final_train_ds
original_dataset.push_to_hub("tianharjuno/twitter-parse", commit_message="added more relevant data through clustering")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/aea7d115dcf04d8ed008f60db3fd834ea99dc199', commit_message='added more relevant data through clustering', commit_description='', oid='aea7d115dcf04d8ed008f60db3fd834ea99dc199', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)

In [5]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, Dataset
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
relevant_ds = dataset["source_labeled"].filter(lambda row: row["relevant"] == True)
test_ds = dataset["test_sentiment"]

relevant_ds = relevant_ds.map(cleantext, num_proc=10)
test_ds = test_ds.map(cleantext, num_proc=10)

relevant_ds = relevant_ds.map(clean_tweet_for_nusabert, num_proc=10)
test_ds = test_ds.map(clean_tweet_for_nusabert, num_proc=10)

relevant_df = relevant_ds.to_pandas().drop_duplicates(subset="content", keep="first").reset_index(drop=True)
relevant_ds = Dataset.from_pandas(relevant_df)
relevant_ds = relevant_ds.train_test_split(train_size=80000, shuffle=True, seed=42)["train"]

sentence_transformer = SentenceTransformer("LazarusNLP/all-nusabert-large-v4",
                                           cache_folder="/data/cache/",
                                           device="cuda",
                                           )
train_embeddings = sentence_transformer.encode(relevant_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, batch_size=32)
test_embeddings = sentence_transformer.encode(test_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, batch_size=32)



Map (num_proc=10):   0%|          | 0/147701 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/147701 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/10000 [00:00<?, ? examples/s]

Batches:   0%|          | 0/2500 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [6]:
from dream_pipeline import DreamCluster
clusterer = DreamCluster("stability")
clusterer.fit(train_embeddings)

Finding intrinsic dimension with TwoNN
Found intrinsic dimension: 20
Running first stage reduction
Running second stage reduction
Tuning HDBSCAN with mode stability
  -> Using 'stability' mode (Bayesian Optimization on relative_validity)
Transforming full dataset with trained reducers...
Fitting final clusterer on full reduced dataset...


(np.int64(11), np.int64(231))

In [7]:
cluster_labels, cluster_probabilities, reduced_embeddings = clusterer.predict(test_embeddings)

In [8]:
import json

# 1. Add Cluster and Probability (Your existing logic)
if "cluster" not in test_ds.column_names:
    test_ds = test_ds.add_column("cluster", cluster_labels)
if "probability" not in test_ds.column_names:
    test_ds = test_ds.add_column("probability", cluster_probabilities)

# 2. Setup Labeling Columns
# We keep 'original_label' as a backup, and 'sentiment' as the field to be edited.
if "original_label" not in test_ds.column_names:
    test_ds = test_ds.add_column("original_label", test_ds["sentiment"])

# Optimization: add_column is much faster than map for constant values
if "status" not in test_ds.column_names:
    test_ds = test_ds.add_column("status", ["uncorrected"] * len(test_ds))

# 3. Add ID (Required for UI logic)
if "id" not in test_ds.column_names:
    test_ds = test_ds.add_column("id", range(len(test_ds)))

# 4. Add 'relevant' column (Required for the schema)
# If you don't have relevancy data, we default to 1 (Relevant) so they show up in the UI.
if "relevant" not in test_ds.column_names:
    test_ds = test_ds.add_column("relevant", [1] * len(test_ds))

# 5. Export to JSON with Schema Mapping
# We use a list comprehension to strictly enforce the key names required by the HTML.
def export_to_labeling_format(dataset, output_file="sentiment_data.json"):
    export_data = []

    for row in dataset:
        # Map dataset columns to UI schema keys
        # row.get(x, y) tries to find column x, returns y if not found
        item = {
            "id": row["id"],
            "cluster": row["cluster"],
            "probability": row["probability"],
            # Map 'text' to 'content' if necessary
            "content": row.get("content", row.get("text", "")),
            "relevant": row["relevant"],
            "original_label": row["original_label"],
            "status": row["status"],
            "sentiment": row["sentiment"], # This is the active column for 0, 1, 2

            # Metadata (Fill with defaults if your dataset doesn't have them)
            "tweet_id": str(row.get("tweet_id", row.get("id", ""))),
            "time": row.get("time", row.get("created_at", "")),
            "author": row.get("author", row.get("username", "Unknown")),
            "comment_count": row.get("comment_count", 0),
            "repost_count": row.get("repost_count", 0),
            "like_count": row.get("like_count", 0),
            "view_count": row.get("view_count", 0),
        }
        export_data.append(item)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(export_data, f, indent=2)

    print(f"Exported {len(export_data)} items to {output_file}")

# Run the export
export_to_labeling_format(test_ds)

Exported 10000 items to sentiment_data.json


In [1]:
from datasets import load_dataset
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
test_ds = dataset["test_sentiment"]
test_ds.column_names

['tweet_id',
 'time',
 'author',
 'content',
 'comment_count',
 'repost_count',
 'like_count',
 'view_count',
 'relevant',
 'sentiment']

In [3]:
import json

# 1. Load your JSON list (assuming it's in a variable called `corrected_labels`)
# If it's in a file:
with open('sentiment_corrected.json', 'r') as f:
    corrected_labels = json.load(f)

# 2. Create a high-speed lookup dictionary
# We use 'tweet_id' as the key.
# CRITICAL: We convert keys to strings to ensure matching works reliably.
correction_map = {
    str(item['tweet_id']): item['sentiment']
    for item in corrected_labels
    # Optional: Uncomment below if you only want to apply items marked 'corrected'
    # if item['status'] == 'corrected'
}

print(f"Loaded {len(correction_map)} corrections.")

# 3. Define the update function
def apply_corrections(example):
    # Ensure the dataset ID is also a string for comparison
    t_id = str(example['tweet_id'])

    if t_id in correction_map:
        example['sentiment'] = correction_map[t_id]

    return example

# 4. Apply to the dataset
# distinct=False ensures we don't accidentally drop duplicates if that's not intended
updated_test_ds = test_ds.map(apply_corrections)

# Verification: Check if it worked
# Compare a specific ID you know changed, or just print the first few
print("Update complete.")
print(updated_test_ds[0])

Loaded 10000 corrections.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Update complete.
{'tweet_id': '1904997744181436599', 'time': '2025-03-26T20:43:50.000', 'author': '@ramakszl', 'content': 'dari sekian banyak demo yang begitu maraknya di setiap daerah tapi apa kebijakan mereka, yap betul bikin pengalihan isu soal selangakan pejabat rakyat demo tu di simak pukimak, rakyat gak setolol wapres #TolakRUUTNI #TolakRUUPolri #IndonesiaGelap', 'comment_count': 0, 'repost_count': 0, 'like_count': 0, 'view_count': 53, 'relevant': True, 'sentiment': 0}


In [4]:
dataset["test_sentiment"] = updated_test_ds
dataset.push_to_hub("tianharjuno/twitter-parse", commit_message="Fixed test_ds with clustering")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/76a5543d25ca67ee8aa54bc45d8ec22b3f78d192', commit_message='Fixed test_ds with clustering', commit_description='', oid='76a5543d25ca67ee8aa54bc45d8ec22b3f78d192', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)