In [3]:
from datasets import load_dataset
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="/data/cache/")

In [4]:
from datasets import ClassLabel
class_labels = ClassLabel(2, ["Irrelevant", "Relevant"])
train_ds = dataset["cleaned"].train_test_split(train_size=80000)["train"].select_columns(["content", "relevant"])
train_ds = train_ds.rename_column("relevant", "label")
train_ds = train_ds.cast_column("label", class_labels)

train_ds

Casting the dataset:   0%|          | 0/80000 [00:00<?, ? examples/s]

Dataset({
    features: ['content', 'label'],
    num_rows: 80000
})

In [5]:
import re
# We are NOT importing replace_word_elongation anymore
from indoNLP.preprocessing import emoji_to_words

def clean_tweet_for_nusabert(row):
    text = row['content']

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove mentions and RT
    # This regex is safe and does not affect 'ruu tni'
    text = re.sub(r'rt @\S+|@\S+', '', text)

    # 4. Remove hashtags (keep the word)
    text = re.sub(r'#(\S+)', r'\1', text)

    # 5. Convert emojis to words (Preserves sentiment)
    text = emoji_to_words(text)

    # 6. Normalize word elongation (CUSTOM, SAFER REGEX)
    # This replaces 3 or more repeated chars (e.g., 'bangeeet' -> 'banget')
    # It will NOT affect 'uu' or 'ruu', fixing your bug.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # 7. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    row["content"] = text
    return row

In [6]:
sentence_train_ds = train_ds.map(clean_tweet_for_nusabert, num_proc=30)

Map (num_proc=30):   0%|          | 0/80000 [00:00<?, ? examples/s]

In [8]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer("LazarusNLP/all-nusabert-large-v4",
                                           cache_folder="/data/cache/",
                                           device="cuda",
                                           )
embeddings = sentence_transformer.encode(sentence_train_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, device="cuda", batch_size=256)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [9]:
from dream_pipeline import DreamCluster
clusterer = DreamCluster("stability")
clusterer.fit(embeddings)

Finding intrinsic dimension with TwoNN
Found intrinsic dimension: 19
Running first stage reduction
Running second stage reduction
Tuning HDBSCAN with mode stability
  -> Using 'stability' mode (Bayesian Optimization on relative_validity)
Transforming full dataset with trained reducers...
Fitting final clusterer on full reduced dataset...


(np.int64(179), np.int64(67))

In [10]:
test_ds = dataset["test"]
test_embeddings = sentence_transformer.encode(test_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, device="cuda")

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [11]:
train_labels, train_probabilities, reduced_embeddings = clusterer.predict(test_embeddings)

In [36]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained("tianharjuno/ruu-tni-relevancy-classification-p1", cache_dir="/data/cache/", num_labels=len(class_labels.names))
tokenizer = AutoTokenizer.from_pretrained("tianharjuno/ruu-tni-relevancy-classification-p1", cache_dir="/data/cache/")
model.to(device)
trainer_arguments = TrainingArguments(
    per_device_eval_batch_size=512
)
trainer = Trainer(model=model, tokenizer=tokenizer, args=trainer_arguments)

def tokenize(batch):
    return tokenizer(batch["content"], return_tensors="pt", padding=True, truncation=True, max_length=512)

encoded_test_ds = test_ds.map(tokenize, batched=True, batch_size=256)
predictions = trainer.predict(encoded_test_ds)
predicted_class_ids = predictions.predictions.argmax(axis=1)

In [70]:
import numpy as np
import torch
from typing import Dict, Any, Tuple, List
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# NOTE: This solution assumes compute_metrics is available and properly defined
# in your environment.
def compute_metrics(class_names):
    num_classes = len(class_names)
    def callback(eval_pred):
        logits, labels = eval_pred
        if isinstance(logits, torch.Tensor):
            logits = logits.detach().cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels, preds, average="macro", zero_division=0
        )
        acc = accuracy_score(labels, preds)
        p_cls, r_cls, f1_cls, support_cls = precision_recall_fscore_support(
            labels,
            preds,
            average=None,
            zero_division=0,
            labels=list(range(num_classes)),
        )
        metrics = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "macro_precision": macro_p,
            "macro_recall": macro_r,
        }
        for idx, name in enumerate(class_names):
            metrics[f"{name}_precision"] = p_cls[idx]  # type: ignore
            metrics[f"{name}_recall"] = r_cls[idx]  # type: ignore
            metrics[f"{name}_f1"] = f1_cls[idx]  # type: ignore
            metrics[f"{name}_support"] = int(support_cls[idx])  # type: ignore
        return metrics
    return callback
def calculate_cluster_metrics(
    eval_predictions: Tuple[Any, Any],
    cluster_assignments: np.ndarray,
    class_names: List[str],
    cluster_prefix: str = "cluster",
) -> Dict[str, Any]:
    """
    Separates the evaluation predictions by cluster and calculates performance
    metrics for the model on each cluster subset.

    Args:
        eval_predictions: A tuple (logits, labels) from the trainer's predict/evaluate step.
        cluster_assignments: A 1D numpy array of cluster indices.
        class_names: List of class names (e.g., ["0", "1", "2"]).
        cluster_prefix: Prefix for the metric keys.

    Returns:
        A dictionary containing combined metrics for all clusters.
    """
    logits, labels = eval_predictions
    combined_cluster_metrics: Dict[str, Any] = {}

    ## ðŸ’¥ CRITICAL FIX: Ensure logits and labels are NumPy arrays for boolean indexing.
    # Logits conversion
    if isinstance(logits, torch.Tensor):
        logits = logits.detach().cpu().numpy()
    elif isinstance(logits, list):
        logits = np.array(logits)

    # Labels conversion
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    elif isinstance(labels, list):
        labels = np.array(labels)

    # Check if cluster_assignments is a numpy array (it should be)
    if not isinstance(cluster_assignments, np.ndarray):
        cluster_assignments = np.array(cluster_assignments)

    unique_clusters = np.unique(cluster_assignments)

    # Get the inner callback from the user's compute_metrics function
    metrics_callback = compute_metrics(class_names)

    for cluster_id in unique_clusters:
        # Create a boolean mask for the current cluster
        mask = cluster_assignments == cluster_id

        # Filter the logits and true labels for the current cluster
        cluster_logits = logits[mask]
        cluster_labels = labels[mask]
        cluster_support = len(cluster_labels)

        if cluster_support == 0:
            continue

        # Calculate metrics for the current cluster
        cluster_eval_preds = (cluster_logits, cluster_labels)
        cluster_metrics = metrics_callback(cluster_eval_preds)

        # Add cluster size/support
        cluster_metrics["support"] = cluster_support

        # Rename and combine metrics with a clear prefix
        metric_prefix = f"{cluster_prefix}_{cluster_id}"
        for key, value in cluster_metrics.items():
            combined_cluster_metrics[f"{metric_prefix}_{key}"] = value

    return combined_cluster_metrics

In [71]:
test_logits = predictions.predictions # Assuming the first element is logits
test_labels = test_ds["relevant"]  # Assuming the second element is true labels
eval_preds = (test_logits, test_labels)

In [72]:
# 1. Single Reduction
import numpy as np
single_reduction_metrics = calculate_cluster_metrics(
    eval_predictions=eval_preds,
    cluster_assignments=train_labels,
    class_names=class_labels.names,
)
for key, value in single_reduction_metrics.items():
    print(f"  {key}: {value:.4f}")

  cluster_-1_accuracy: 0.8424
  cluster_-1_macro_f1: 0.8341
  cluster_-1_macro_precision: 0.8308
  cluster_-1_macro_recall: 0.8385
  cluster_-1_Irrelevant_precision: 0.7727
  cluster_-1_Irrelevant_recall: 0.8226
  cluster_-1_Irrelevant_f1: 0.7969
  cluster_-1_Irrelevant_support: 62.0000
  cluster_-1_Relevant_precision: 0.8889
  cluster_-1_Relevant_recall: 0.8544
  cluster_-1_Relevant_f1: 0.8713
  cluster_-1_Relevant_support: 103.0000
  cluster_-1_support: 165.0000
  cluster_0_accuracy: 1.0000
  cluster_0_macro_f1: 1.0000
  cluster_0_macro_precision: 1.0000
  cluster_0_macro_recall: 1.0000
  cluster_0_Irrelevant_precision: 0.0000
  cluster_0_Irrelevant_recall: 0.0000
  cluster_0_Irrelevant_f1: 0.0000
  cluster_0_Irrelevant_support: 0.0000
  cluster_0_Relevant_precision: 1.0000
  cluster_0_Relevant_recall: 1.0000
  cluster_0_Relevant_f1: 1.0000
  cluster_0_Relevant_support: 13.0000
  cluster_0_support: 13.0000
  cluster_1_accuracy: 0.8548
  cluster_1_macro_f1: 0.8021
  cluster_1_macro_pr

In [57]:
import numpy as np
from typing import Dict, Any, Tuple, List
from datasets import Dataset

# --- Helper Function for Mode Calculation ---

def _get_mode(preds: np.ndarray) -> Tuple[Any, int]:
    """Calculates the mode (most frequent label) and its count."""
    if len(preds) == 0:
        return None, 0
    counts = np.bincount(preds)
    mode_label = np.argmax(counts)
    mode_count = counts[mode_label]
    return int(mode_label), int(mode_count)

# --- Main Diagnostic Function ---

def analyze_cluster_predictions_and_suggest_corrections(
    dataset: Dataset,
    cluster_labels: np.ndarray,
    model_predictions: np.ndarray,
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Analyzes predictions and suggests clusters most likely to contain labeling errors
    due to systematic misclassification (semantic bias).

    Args:
        dataset (Dataset): The Hugging Face Dataset object (must contain a 'relevant' column).
        cluster_labels (np.ndarray): 1D array of cluster IDs.
        model_predictions (np.ndarray): 1D array of predicted class IDs.

    Returns:
        Dict[str, List]: A dictionary containing the top clusters suggested for review.
    """

    # 1. Input Validation and Conversion
    if len(dataset) != len(cluster_labels) or len(dataset) != len(model_predictions):
        print("Error: Input array lengths must match the dataset length.")
        return {"suggestions": []}

    cluster_labels = np.asarray(cluster_labels)
    model_predictions = np.asarray(model_predictions)
    true_labels = np.asarray(dataset["relevant"]).astype(int)

    unique_clusters = np.unique(cluster_labels)
    review_candidates = []

    print("=" * 70)
    print("CLUSTER PREDICTION BREAKDOWN BY TRUE LABEL")
    print("=" * 70)

    for cluster_id in unique_clusters:
        current_cluster_indices = np.where(cluster_labels == cluster_id)[0]
        cluster_true_labels = true_labels[current_cluster_indices]
        cluster_preds = model_predictions[current_cluster_indices]
        cluster_support = len(cluster_true_labels)

        if cluster_support == 0:
            continue

        # 2. Split by True Label and Determine Majority/Minority
        rel_mask = (cluster_true_labels == 1)
        irrel_mask = (cluster_true_labels == 0)

        rel_count = np.sum(rel_mask)
        irrel_count = np.sum(irrel_mask)

        if rel_count >= irrel_count:
            majority_true_label = 1
            minority_true_label = 0
            majority_preds = cluster_preds[rel_mask]
            minority_preds = cluster_preds[irrel_mask]
        else:
            majority_true_label = 0
            minority_true_label = 1
            majority_preds = cluster_preds[irrel_mask]
            minority_preds = cluster_preds[rel_mask]

        # Get modes for logging
        maj_mode_label, maj_mode_count = _get_mode(majority_preds)
        min_mode_label, min_mode_count = _get_mode(minority_preds)

        # 3. Log Diagnostic Output
        print(f"\nCLUSTER {cluster_id}, TOTAL SAMPLES: {cluster_support}")
        print(f"\tTRUE MAJORITY (Label {majority_true_label}) COUNT: {len(majority_preds)}")
        if len(majority_preds) > 0:
            print(f"\t\tPREDICTED MODE: {maj_mode_label} (Ratio: {maj_mode_count / len(majority_preds):.2f})")

        print(f"\tTRUE MINORITY (Label {minority_true_label}) COUNT: {len(minority_preds)}")
        if len(minority_preds) > 0:
            print(f"\t\tPREDICTED MODE: {min_mode_label} (Ratio: {min_mode_count / len(minority_preds):.2f})")
        print("-" * 70)

        # 4. --- Calculate Misclassification Severity Score (MSS) ---

        MIN_MINOR_COUNT = 5 # Require at least 5 minority samples for meaningful calculation
        if len(minority_preds) < MIN_MINOR_COUNT:
            continue

        # The core bias check: how many minority samples were misclassified as the majority label?
        misclassified_as_majority = np.sum(minority_preds == majority_true_label)
        misclassification_ratio = misclassified_as_majority / len(minority_preds)

        # Misclassification Severity Score (MSS): Weights the ratio by the square root of the count
        # This penalizes high misclassification on larger minority groups.
        mss_score = misclassification_ratio * np.sqrt(len(minority_preds))

        if mss_score > 0:
            review_candidates.append({
                'cluster_id': cluster_id,
                'mss_score': mss_score,
                'misclassification_ratio': misclassification_ratio,
                'minority_count': len(minority_preds),
                'minority_true_label': minority_true_label,
                'majority_true_label': majority_true_label,
            })

    # 5. Suggest Top Candidates
    review_candidates.sort(key=lambda x: x['mss_score'], reverse=True)
    top_suggestions = review_candidates[:5]

    print("\n" + "=" * 70)
    print("TOP 5 CLUSTERS SUGGESTED FOR LABELING REVIEW (Highest MSS Score)")
    print("=" * 70)

    if not top_suggestions:
        print("No clusters met the minimum criteria for high-priority review.")

    for i, item in enumerate(top_suggestions):
        print(f"RANK {i+1}: CLUSTER {item['cluster_id']}")
        print(f"  - Minority Label: {item['minority_true_label']} (Count: {item['minority_count']})")
        print(f"  - Majority Label: {item['majority_true_label']}")
        print(f"  - Misclassification Ratio (Minority predicted as Majority): {item['misclassification_ratio']:.2f}")
        print(f"  - Severity Score (MSS): {item['mss_score']:.2f}")
        print("  => Action: Extract the minority samples (True Label 1 or 0) for this cluster and manually check their labels.")

    return {"suggestions": top_suggestions}

In [59]:
suggestions = analyze_cluster_predictions_and_suggest_corrections(
    test_ds,
    train_labels,
    predicted_class_ids
)


CLUSTER PREDICTION BREAKDOWN BY TRUE LABEL

CLUSTER -1, TOTAL SAMPLES: 165
	TRUE MAJORITY (Label 1) COUNT: 103
		PREDICTED MODE: 1 (Ratio: 0.85)
	TRUE MINORITY (Label 0) COUNT: 62
		PREDICTED MODE: 0 (Ratio: 0.82)
----------------------------------------------------------------------

CLUSTER 0, TOTAL SAMPLES: 13
	TRUE MAJORITY (Label 1) COUNT: 13
		PREDICTED MODE: 1 (Ratio: 1.00)
	TRUE MINORITY (Label 0) COUNT: 0
----------------------------------------------------------------------

CLUSTER 1, TOTAL SAMPLES: 124
	TRUE MAJORITY (Label 1) COUNT: 90
		PREDICTED MODE: 1 (Ratio: 0.94)
	TRUE MINORITY (Label 0) COUNT: 34
		PREDICTED MODE: 0 (Ratio: 0.62)
----------------------------------------------------------------------

CLUSTER 2, TOTAL SAMPLES: 20
	TRUE MAJORITY (Label 1) COUNT: 20
		PREDICTED MODE: 1 (Ratio: 1.00)
	TRUE MINORITY (Label 0) COUNT: 0
----------------------------------------------------------------------

CLUSTER 3, TOTAL SAMPLES: 28
	TRUE MAJORITY (Label 1) COUNT: 15
		

In [69]:
suggestions["suggestions"]

[{'cluster_id': np.int32(22),
  'mss_score': np.float64(7.498348166444742),
  'misclassification_ratio': np.float64(0.7117117117117117),
  'minority_count': 111,
  'minority_true_label': 1,
  'majority_true_label': 0},
 {'cluster_id': np.int32(30),
  'mss_score': np.float64(6.46632301492381),
  'misclassification_ratio': np.float64(0.7466666666666667),
  'minority_count': 75,
  'minority_true_label': 0,
  'majority_true_label': 1},
 {'cluster_id': np.int32(24),
  'mss_score': np.float64(6.0),
  'misclassification_ratio': np.float64(0.5),
  'minority_count': 144,
  'minority_true_label': 1,
  'majority_true_label': 0},
 {'cluster_id': np.int32(28),
  'mss_score': np.float64(3.3541019662496847),
  'misclassification_ratio': np.float64(0.75),
  'minority_count': 20,
  'minority_true_label': 0,
  'majority_true_label': 1},
 {'cluster_id': np.int32(27),
  'mss_score': np.float64(3.3129457822453965),
  'misclassification_ratio': np.float64(0.36585365853658536),
  'minority_count': 82,
  'min

In [50]:
import pandas as pd
import numpy as np
import json
# from datasets import Dataset # Uncomment if your test_ds is a Hugging Face Dataset

def prepare_data_for_suite(dataset, cluster_labels, cluster_probabilities, output_filename="data_for_suite.json"):
    """
    Combines the dataset, cluster labels, and probabilities into a single
    JSON file and prints its content to the console.

    Args:
        dataset: Your test_ds (either a Pandas DataFrame or Hugging Face Dataset).
        cluster_labels: The NumPy array of cluster assignments.
        cluster_probabilities: The NumPy array of cluster probabilities.
        output_filename: The name of the JSON file to create.
    """
    print("Starting data preparation...")

    # --- Convert to Pandas DataFrame ---
    try:
        df = dataset.to_pandas()
        print(f"Successfully converted dataset to Pandas. Shape: {df.shape}")
    except Exception as e:
        print(f"Error converting dataset to Pandas: {e}")
        print("Please ensure 'dataset' is a Pandas DataFrame or Hugging Face Dataset.")
        return

    # --- Add cluster and probability data ---
    if len(df) != len(cluster_labels) or len(df) != len(cluster_probabilities):
        print("Error: Dataset length does not match cluster labels or probabilities length.")
        print(f"Dataset: {len(df)}, Clusters: {len(cluster_labels)}, Probs: {len(cluster_probabilities)}")
        return

    df['cluster'] = cluster_labels
    df['probability'] = cluster_probabilities

    # --- Add a unique ID and status columns for the app ---
    df['id'] = df.index
    df['original_label'] = df['relevant'] # Preserve the Llama-generated label
    df['status'] = 'uncorrected' # 'uncorrected' or 'corrected'

    # --- Reorder columns for clarity in the app ---
    try:
        cols = ['id', 'cluster', 'probability', 'content', 'relevant', 'original_label', 'status']
        other_cols = [col for col in df.columns if col not in cols]
        final_cols = cols + other_cols
        df = df[final_cols]
    except KeyError as e:
        print(f"Error: The dataset must contain 'content' and 'label' columns. Found: {df.columns}")
        return

    # --- Save to JSON on the kernel's filesystem ---
    json_data = []
    try:
        # We must use .to_dict('records') to ensure JSON compatibility
        json_data = df.to_dict('records')

        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)

        print(f"\nSuccess! Data prepared and saved to kernel as '{output_filename}'.")
        print(f"Total records: {len(df)}")
    except Exception as e:
        print(f"Error saving JSON file: {e}")
        return

In [52]:
test_ds = test_ds.cast_column("relevant", class_labels)
prepare_data_for_suite(test_ds, train_labels, train_probabilities, output_filename="/data/data_for_suite.json", )

Casting the dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Starting data preparation...
Successfully converted dataset to Pandas. Shape: (5000, 10)

Success! Data prepared and saved to kernel as '/data/data_for_suite.json'.
Total records: 5000


In [None]:
!pwd

In [13]:
from datasets import Dataset, load_dataset
import numpy as np

# --- 1. Load Data ---
corrected_dataset = Dataset.from_json("data_corrected.json")
original_dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
test_ds = original_dataset["test"]

# Assuming corrected_dataset and test_ds are loaded:

# --- 2. Create the Efficient Lookup Dictionary ---
# Filter for only the rows that were actually corrected/changed
# NOTE: corrected_dataset['relevant'] is likely an integer (0 or 1) here.
corrected_only = corrected_dataset.filter(lambda x: x['relevant'] != x.get("original_label", -1))
selected_columns = corrected_only.select_columns(["tweet_id", "relevant"])

# Convert the small correction dataset into a fast dictionary lookup: {tweet_id: new_label_as_INT}
print("Creating fast lookup dictionary for corrected data...")
corrected_data_dict = {
    str(tweet_id): label for tweet_id, label in zip(selected_columns["tweet_id"], selected_columns["relevant"])
}
print(f"Lookup dictionary created with {len(corrected_data_dict)} corrections.")


# --- 3. Efficient Mapping Function (FIXED) ---
def fix_ds_optimized(row):
    """
    Updates the 'relevant' label only if the tweet_id exists in the
    global corrected_data_dict, ensuring the value is cast to a boolean
    to match the original dataset feature type.
    """
    tweet_id = row["tweet_id"]

    # Check for the ID in the dictionary (O(1) operation)
    if tweet_id in corrected_data_dict:
        new_label_int = corrected_data_dict[tweet_id]

        # ðŸ’¥ FIX: Explicitly cast the integer label (0 or 1) to a boolean (False or True)
        # to match the expected feature type of the original dataset column.
        row["relevant"] = bool(new_label_int)

    # If the ID is not found, the original row['relevant'] is kept
    return row

# --- 4. Apply Correction ---
print("Applying label corrections to the test dataset...")
test_ds_corrected = test_ds.map(fix_ds_optimized)

print("Correction complete.")

Creating fast lookup dictionary for corrected data...
Lookup dictionary created with 323 corrections.
Applying label corrections to the test dataset...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Correction complete.


In [15]:
original_dataset["test"] = test_ds_corrected
original_dataset.push_to_hub("tianharjuno/twitter-parse", commit_message="Fixed some labeling with clustering")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/a2652e2a9217b279e00a6826e86b65d0d2c7e9e0', commit_message='Fixed some labeling with clustering', commit_description='', oid='a2652e2a9217b279e00a6826e86b65d0d2c7e9e0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)