In [2]:
from datasets import load_dataset, ClassLabel
# Load every split of the tweet dataset; downloaded files are cached under cache/.
ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
train_ds = ds["sampled_20000_labeled"]
test_ds = ds["test"]

# Hold out 20% of the labeled sample as the evaluation split (fixed seed for
# reproducibility). The separate "test" split loaded above is left untouched.
train_ds_split = train_ds.train_test_split(test_size=0.2, seed=42)

train_ds = train_ds_split["train"]
eval_ds = train_ds_split["test"]

print(train_ds, eval_ds, test_ds)

print(f"Train: {len(train_ds)}, Eval: {len(eval_ds)}, Test: {len(test_ds)}")

# Sort the distinct label values so the name -> class-index mapping is
# deterministic instead of depending on set iteration order.
# NOTE(review): assumes `related` holds booleans (the metric names reported
# later are False/True), so sorting yields [False, True] -> ids [0, 1].
class_labels = ClassLabel(names=sorted(set(train_ds["related"])))

train_ds = train_ds.cast_column("related", class_labels)
eval_ds = eval_ds.cast_column("related", class_labels)
test_ds = test_ds.cast_column("related", class_labels)


def _to_text_label(split):
    """Keep only the tweet content and its label, renamed to `text` / `label`."""
    return (
        split.select_columns(["content", "related"])
        .rename_column("related", "label")
        .rename_column("content", "text")
    )


cleaned_train_ds = _to_text_label(train_ds)
cleaned_eval_ds = _to_text_label(eval_ds)
cleaned_test_ds = _to_text_label(test_ds)

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 15998
}) Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 4000
}) Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 1999
})
Train: 15998, Eval: 4000, Test: 1999


Casting the dataset:   0%|          | 0/15998 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [3]:
# Show the columns and row count of each cleaned split, in train/eval/test order.
cleaned_splits = (cleaned_train_ds, cleaned_eval_ds, cleaned_test_ds)
print(*cleaned_splits)

Dataset({
    features: ['text', 'label'],
    num_rows: 15998
}) Dataset({
    features: ['text', 'label'],
    num_rows: 4000
}) Dataset({
    features: ['text', 'label'],
    num_rows: 1999
})


In [4]:
from collections import Counter

# Pull the encoded label column of the training split and tally each value.
labels = cleaned_train_ds["label"]
label_counts = Counter(labels)

# A class that never occurs counts as zero; id 0 is reported as False, id 1 as True.
num_false, num_true = (label_counts.get(class_id, 0) for class_id in (0, 1))

print(f"False: {num_false}, True: {num_true}")


False: 7799, True: 8199


In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model & tokenizer from the same checkpoint (cached under cache/).
MODEL_CHECKPOINT = "indolem/indobertweet-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    cache_dir="cache/"
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    cache_dir="cache/"
)

# Move model to device (kept as the last expression so the cell shows the model summary)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
def tokenize(key, max_length=128):
    """Build a `Dataset.map` callback that tokenizes one text column.

    Args:
        key: Name of the column whose text is passed to the notebook-level
            `tokenizer`.
        max_length: Length every example is truncated and padded to.
            Defaults to 128, the value this notebook trains with.

    Returns:
        A function taking a row (or batch of rows) and returning the
        tokenizer output for `row[key]`.
    """
    def callback(row):
        return tokenizer(
            row[key],
            truncation=True,
            padding="max_length",  # fixed-size padding, not per-batch padding
            max_length=max_length,
        )
    return callback

In [7]:
# Tokenize the `text` column of every cleaned split with one shared callback,
# processing 256 rows per call.
tokenizer_callback = tokenize("text")
encoded_train_ds, encoded_eval_ds, encoded_test_ds = (
    cleaned_split.map(tokenizer_callback, batched=True, batch_size=256)
    for cleaned_split in (cleaned_train_ds, cleaned_eval_ds, cleaned_test_ds)
)

Map:   0%|          | 0/15998 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [8]:
# Format the label and the model inputs of every encoded split as torch tensors.
model_columns = ["label", "input_ids", "attention_mask"]
for encoded_split in (encoded_train_ds, encoded_eval_ds, encoded_test_ds):
    encoded_split.set_format("torch", columns=model_columns)


In [9]:
from transformers import Trainer
from torch import nn

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: 1-D tensor with one weight per class, or None for an
            unweighted cross-entropy loss. All other positional and keyword
            arguments are forwarded to `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss for one batch.

        `labels` is popped from `inputs` so the model only returns logits and
        the loss is computed here. Extra keyword arguments passed by the
        training loop are accepted and ignored.

        Returns:
            The loss, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The constructor default is None; fall back to an unweighted loss
        # instead of failing on `None.to(...)`.
        weights = None
        if self.class_weights is not None:
            weights = self.class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [10]:
from transformers.training_args import TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.trainer import Trainer
from transformers.data.data_collator import default_data_collator
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(class_names):
    """Build the `compute_metrics` callback used by the Trainer.

    Args:
        class_names: Class names ordered by class index; they become the
            prefixes of the per-class metric keys.

    Returns:
        A function mapping `(logits, labels)` to a dict with accuracy,
        macro precision/recall/F1 and, for each class, `<name>_precision`,
        `<name>_recall`, `<name>_f1` and `<name>_support`.
    """
    num_classes = len(class_names)
    # Score the same fixed class set in both calls below so the macro averages
    # and the per-class values always describe the same classes, even when a
    # class is absent from an evaluation set.
    label_ids = list(range(num_classes))

    def callback(eval_pred):
        logits, labels = eval_pred
        if isinstance(logits, torch.Tensor):
            logits = logits.detach().cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels, preds, average="macro", zero_division=0, labels=label_ids
        )
        acc = accuracy_score(labels, preds)
        p_cls, r_cls, f1_cls, support_cls = precision_recall_fscore_support(
            labels,
            preds,
            average=None,
            zero_division=0,
            labels=label_ids
        )
        metrics = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "macro_precision": macro_p,
            "macro_recall": macro_r,
        }
        for idx, name in enumerate(class_names):
            metrics[f"{name}_precision"] = p_cls[idx] #type: ignore
            metrics[f"{name}_recall"]    = r_cls[idx]  #type: ignore
            metrics[f"{name}_f1"]        = f1_cls[idx]  #type: ignore
            metrics[f"{name}_support"]   = int(support_cls[idx])  #type: ignore
        return metrics
    return callback


# Training configuration read by `train_model`. Evaluation, checkpointing and
# logging all happen once per epoch so the best checkpoint (by macro F1) can be
# restored at the end.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="epoch",     # evaluate at the end of each epoch
    save_strategy="epoch",           # save checkpoint at the end of each epoch
    learning_rate=1e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    num_train_epochs=40,
    weight_decay=0.05,
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    warmup_steps=500,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # arguments can still be built on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
)

def train_model(
    model,
    train_dataset,
    eval_dataset,
    test_dataset,
    input_class_names,
    class_weights
):
    """Fine-tune `model` with the weighted-loss trainer and score it on the test set.

    Training uses the notebook-level `training_args`, evaluates on
    `eval_dataset`, and stops early after 3 evaluations without improvement.

    Args:
        model: Model to train.
        train_dataset: Dataset used for training.
        eval_dataset: Dataset evaluated during training.
        test_dataset: Dataset evaluated once after training finishes.
        input_class_names: Class names, forwarded to `compute_metrics`.
        class_weights: Per-class loss weights, forwarded to `WeightedLossTrainer`.

    Returns:
        The metrics dict returned by `trainer.evaluate` on `test_dataset`.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics(input_class_names),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        class_weights=class_weights,
    )
    trainer = WeightedLossTrainer(**trainer_kwargs)

    print("Starting training...")
    trainer.train()

    print("Training complete. Evaluating...")
    test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    return test_metrics

In [11]:
import os
# Per-class counts from the training split. The boolean keys compare and hash
# equal to the 0/1 label ids stored in `label_counts`.
num_false, num_true = (label_counts.get(flag, 0) for flag in (False, True))
total = num_false + num_true

# Each class is weighted by the *other* class's share, so the rarer class
# contributes more to the loss.
class_shares = [
    num_true / total,   # weight for class 0 (False)
    num_false / total,  # weight for class 1 (True)
]
my_weights = torch.tensor(class_shares, dtype=torch.float)
results = train_model(
    model,
    encoded_train_ds,
    encoded_eval_ds,
    encoded_test_ds,
    class_labels.names,
    my_weights,
)

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,False Precision,False Recall,False F1,False Support,True Precision,True Recall,True F1,True Support
1,0.6348,0.521502,0.78,0.779232,0.784345,0.780184,0.748439,0.841525,0.792257,1994,0.82025,0.718843,0.766206,2006
2,0.4544,0.406331,0.822,0.821986,0.822174,0.822034,0.813908,0.833501,0.823588,1994,0.830439,0.810568,0.820383,2006
3,0.3877,0.381464,0.834,0.833965,0.83439,0.834051,0.82219,0.851053,0.836373,1994,0.846591,0.817049,0.831558,2006
4,0.3508,0.388795,0.83325,0.832923,0.835575,0.833125,0.862767,0.791374,0.82553,1994,0.808383,0.874875,0.840316,2006
5,0.3184,0.370327,0.84525,0.845248,0.845249,0.845248,0.844957,0.844534,0.844745,1994,0.845541,0.845962,0.845751,2006
6,0.2833,0.374255,0.84125,0.841226,0.84155,0.841294,0.830657,0.856068,0.843171,1994,0.852442,0.82652,0.839281,2006
7,0.2454,0.410427,0.83475,0.834396,0.837318,0.834619,0.866008,0.790873,0.826737,1994,0.808628,0.878365,0.842055,2006
8,0.2113,0.412202,0.8375,0.837482,0.837582,0.837476,0.842159,0.829488,0.835776,1994,0.833006,0.845464,0.839189,2006


Training complete. Evaluating...


In [13]:
import pandas as pd
import numpy as np
import torch
from transformers import Trainer, TrainingArguments
from IPython.display import display, HTML

# We don't need a full training setup, just a place to output predictions.
# These inference-only arguments get their own name so the training
# configuration in `training_args` (read by `train_model`) is not overwritten;
# otherwise re-running training after this cell would silently use them.
predict_args = TrainingArguments(
    output_dir="./temp_trainer_results",
    per_device_eval_batch_size=64,  # batch size used by trainer.predict
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=predict_args,
)

print("Running predictions on test_ds...")

# --- 2. Run Predictions ---
# The .predict() method runs inference on test_ds and returns a PredictionOutput object
prediction_output = trainer.predict(encoded_test_ds)

print("Predictions complete.")

# The predictions are logits (raw model scores). We need the class index (0 or 1).
predicted_labels = np.argmax(prediction_output.predictions, axis=-1)

# The true labels are also in the output, just to be safe
true_labels = prediction_output.label_ids

# --- 3. Create DataFrame for Review ---

# Get the original text from the dataset
# This is why your test_ds *must* have the 'text' column
try:
    original_texts = encoded_test_ds['text']
except KeyError:
    print("="*50)
    print("ERROR: Your 'test_ds' does not have a 'text' column.")
    print("Please reload your dataset without removing the 'text' column.")
    print("="*50)
    # Stop execution if the text column is missing
    raise

# Create the main DataFrame: one row per test example
df = pd.DataFrame({
    'text': original_texts,
    'true_label': true_labels,
    'predicted_label': predicted_labels
})

# --- 4. Isolate and Categorize Errors ---

# Filter to get only the rows where the model was wrong
errors_df = df[df['true_label'] != df['predicted_label']].copy()

# Add a new column to categorize the error type
def get_error_type(row):
    """Name the kind of mistake in one misclassified row.

    Returns 'False Positive (FP)' for true 0 / predicted 1,
    'False Negative (FN)' for true 1 / predicted 0, and None for any
    other label combination.
    """
    if row['true_label'] == 0 and row['predicted_label'] == 1:
        # Model predicted 1 (Relevant), but it was 0 (Not Relevant)
        return 'False Positive (FP)'
    elif row['true_label'] == 1 and row['predicted_label'] == 0:
        # Model predicted 0 (Not Relevant), but it was 1 (Relevant)
        return 'False Negative (FN)'

errors_df['error_type'] = errors_df.apply(get_error_type, axis=1)

print(f"\nFound {len(errors_df)} misclassified samples out of {len(df)} total.")

# --- 5. Display the Wrong Answers for Review ---

# Set pandas display options for better text viewing in the notebook
pd.set_option('display.max_colwidth', 300) # Show more text
pd.set_option('display.max_rows', 100)     # Show more rows

print("\n--- 🔴 FALSE POSITIVES (Model said 'Relevant', but was 'Not Relevant') ---")
display(errors_df[errors_df['error_type'] == 'False Positive (FP)'])

print("\n--- 🔵 FALSE NEGATIVES (Model said 'Not Relevant', but was 'Relevant') ---")
display(errors_df[errors_df['error_type'] == 'False Negative (FN)'])

Running predictions on test_ds...


Predictions complete.

Found 234 misclassified samples out of 1999 total.

--- 🔴 FALSE POSITIVES (Model said 'Relevant', but was 'Not Relevant') ---


Unnamed: 0,text,true_label,predicted_label,error_type
2,"di ruu kesehatan yang membahas tembakau golongan narkotika tega bgt, sampai mereka unjuk rasa di depan gedung jadi bisa di membedakan dan kita harus dukung wakil demi rakyat #tegaklurus #omnibuskesehatan <url>",0,1,False Positive (FP)
5,setidaknya masih ada hiburan sewaktu indonesia bubar #indonesiagelap,0,1,False Positive (FP)
15,itu kenapa dihadang sih? kan lagi unjuk rasa!,0,1,False Positive (FP)
24,teruslah bersuara! #sampaimenang #indonesiagelap,0,1,False Positive (FP)
32,"lah wo yang bayar kan orang-orang lu, tiba-tiba ada kerumunan mendukung ruu tni sambil bagi-bagi makanan -,- berapa rate per aksi nya ? 75k ? 100k ?",0,1,False Positive (FP)
55,memang tuntutan demo nya dari awal untuk evaluasi. justru salah satu poin tuntutan demo mahasiswa beberapa hari lalu itu juga untuk mendorong efisiensi yang tepat! yaitu efisiensi kabinet gemuk wowo.,0,1,False Positive (FP)
71,hah?? ruu tni disahkan??,0,1,False Positive (FP)
89,waspada provokasi #indonesiagelap narasi indonesia gelap hanyalah isu propaganda yang bertujuan untuk menyebarkan ketakutan. masa depan indonesia adalah milik mereka yang optimis. #indonesiaterang #waspadahaoksdanprovokasi #lawanhoaks #lawanprovokasi,0,1,False Positive (FP)
90,"ruu tni di sahkan untuk menutup kasus sekar,hmm. kawal terus sekar(at)",0,1,False Positive (FP)
93,"sering, waktu itu bahas bobroknya coretax, warna cat yang bagus buat lebaran, bagaimana cara duit tambahan, mengomongkan dwifungsi abri, harga rumah, kontrakan, proyeksi masa depan, dan masih banyak lagi.",0,1,False Positive (FP)



--- 🔵 FALSE NEGATIVES (Model said 'Not Relevant', but was 'Relevant') ---


Unnamed: 0,text,true_label,predicted_label,error_type
17,perjuangan belum selesai #adilijokowi #indonesiagelap #peringatandarurat,1,0,False Negative (FN)
18,apanya yang memperbaiki citra institusi #indonesiagelap #kamibersamasukatani <url> .,1,0,False Negative (FN)
43,"buat kalian yang turun aksi besok, stay safe ya. semoga semuanya dilancarkan dan aman terkendali! #kawalputusanmk #peringatandarurat #prayforindonesia",1,0,False Negative (FN)
62,al dwifungsi? au sama ada enggak diajak?,1,0,False Negative (FN)
72,#kawanrupawan #presenter #cantik terlihat #menawan untuk suatu acara. #zackiarfan #rupawan #cabutuutni #turunkanprabowo #adilijokowi #indonesiagelap # arfan,1,0,False Negative (FN)
...,...,...,...,...
1898,#indonesiagelap credit tapi owner,1,0,False Negative (FN)
1935,kapolsek lengkong pimpin apel pengamanan aksi unjuk rasa di kantor bawaslu jabar,1,0,False Negative (FN)
1954,kombes pol sugianto marweki pimpin apel pagi: persiapan pengamanan unjuk rasa mengenang lima tahun peristiwa september berdarah #brimobuntukindonesia #brimob_id #poldasultra,1,0,False Negative (FN)
1957,ya ampun. we’re sok back tapi 90’s dictatorship guys. #indonesiagelap,1,0,False Negative (FN)


In [14]:
def calculate_weighted_metrics(eval_metrics, class_names=("False", "True"), prefix="eval_"):
    """
    Calculates the support-weighted average F1, precision, and recall
    from a Hugging Face eval metrics dictionary.

    Args:
        eval_metrics: Dict holding, for every class name, the keys
            `<prefix><name>_f1`, `<prefix><name>_precision`,
            `<prefix><name>_recall` and `<prefix><name>_support`.
        class_names: Class names to average over. Defaults to the
            ("False", "True") pair this notebook reports.
        prefix: Prefix in front of every metric key. Defaults to "eval_".

    Returns:
        Dict with `weighted_f1`, `weighted_precision`, `weighted_recall`
        and `total_support`. An empty dict is returned (after printing an
        error) when a required key is missing or the total support is zero.
    """

    def lookup(metric):
        # One value per class, in `class_names` order; None marks a missing key.
        return [eval_metrics.get(f"{prefix}{name}_{metric}") for name in class_names]

    f1_values = lookup("f1")
    support_values = lookup("support")
    precision_values = lookup("precision")
    recall_values = lookup("recall")

    # Check that we have the minimum required keys
    if any(value is None for value in f1_values + support_values):
        print("Error: Missing required keys for F1/support.")
        return {}

    # Precision and recall are multiplied below as well, so a missing key must
    # be reported here instead of surfacing as a TypeError.
    if any(value is None for value in precision_values + recall_values):
        print("Error: Missing required keys for precision/recall.")
        return {}

    # --- Calculate Total Support ---
    total_support = sum(support_values)
    if total_support == 0:
        print("Error: Total support is zero.")
        return {}

    # --- Calculate Weighted Averages ---
    def weighted_average(values):
        weighted_sum = sum(value * support for value, support in zip(values, support_values))
        return weighted_sum / total_support

    return {
        "weighted_f1": weighted_average(f1_values),
        "weighted_precision": weighted_average(precision_values),
        "weighted_recall": weighted_average(recall_values),
        "total_support": total_support
    }

In [15]:
# Print the support-weighted summary first, then the raw test metrics it was derived from.
weighted_summary = calculate_weighted_metrics(results)
print(weighted_summary)
print(results)

{'weighted_f1': 0.8831581153878554, 'weighted_precision': 0.8838883617584168, 'weighted_recall': 0.8829414707353677, 'total_support': 1999}
{'eval_loss': 0.2805786430835724, 'eval_accuracy': 0.8829414707353677, 'eval_macro_f1': 0.8815715119034598, 'eval_macro_precision': 0.8804210696298987, 'eval_macro_recall': 0.8832486019318759, 'eval_False_precision': 0.8525852585258525, 'eval_False_recall': 0.8857142857142857, 'eval_False_f1': 0.8688340807174888, 'eval_False_support': 875, 'eval_True_precision': 0.908256880733945, 'eval_True_recall': 0.8807829181494662, 'eval_True_f1': 0.8943089430894309, 'eval_True_support': 1124, 'eval_runtime': 0.8375, 'eval_samples_per_second': 2386.857, 'eval_steps_per_second': 9.552, 'epoch': 8.0}


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax # Or use torch.nn.functional.softmax
import numpy as np
import pandas as pd
from tqdm import tqdm
model.eval()     # Put model in evaluation mode (turns off dropout)

# 3. Prepare your new data
new_texts = ds["cleaned"]["content"]

batch_size = 32  # <-- Adjust this based on your GPU memory. Try 16, 32, or 64.

# Truncate exactly like the training tokenization (max_length=128) so the model
# never sees sequences longer than the ones it was fine-tuned on.
max_length = 128

# --- 5. Batch Prediction Loop ---
all_probabilities = []
all_predicted_indices = []

print(f"Starting prediction on {len(new_texts)} texts in batches of {batch_size}...")

# Use torch.no_grad() for the whole loop: no gradients are needed for inference
with torch.no_grad():
    # Loop over the new_texts in chunks
    for i in tqdm(range(0, len(new_texts), batch_size)):

        # Get the current batch of texts
        batch_texts = new_texts[i : i + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            batch_texts,
            padding=True,           # Pad to the longest text *in this batch*
            truncation=True,        # Truncate anything longer than max_length
            max_length=max_length,
            return_tensors="pt"
        )

        # Move batch to the same device the model was moved to
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run inference
        outputs = model(**inputs)

        # Get logits and convert to probabilities
        logits = outputs.logits
        probabilities = softmax(logits.cpu().numpy(), axis=1)

        # Get predicted indices
        predicted_indices = np.argmax(probabilities, axis=1)

        # Store results
        all_probabilities.append(probabilities)
        all_predicted_indices.append(predicted_indices)

print("Prediction complete.")

# --- 6. Consolidate Results ---
# Combine the results from all batches
all_probabilities = np.concatenate(all_probabilities, axis=0)
all_predicted_indices = np.concatenate(all_predicted_indices, axis=0)

# --- 7. Package into DataFrame ---
# Build the table column-wise from the consolidated arrays instead of
# appending one dict per row.
df = pd.DataFrame({
    'text': list(new_texts),
    'predicted_label': [class_labels.names[idx] for idx in all_predicted_indices],
    'confidence': all_probabilities.max(axis=1),
    'prob_False': all_probabilities[:, 0],
    'prob_True': all_probabilities[:, 1],
})
print(df)

Starting prediction on 201583 texts in batches of 32...


100%|██████████| 6300/6300 [01:13<00:00, 85.14it/s]


Prediction complete.
                                                                                                                                                                                                                                                            text  \
0                                          tokoh nu dan islam moderat cak menyentil kinerja kejaksaan. munkin kejaksaan kudu membuktikan kasus pertamina berjalan seperti yang awal digembor2kan, bukan cuma sekedar cari simpati aja! #tolakruutni #cabutruutni   
1                                                                                                                                                                                                                       kembalikan tentara ke barak #tolakruutni   
2                                                                                               inilah kenapa instansi yang memperbolehkan penggunaan senjata api itu tidak boleh dicampur adukan denga

In [17]:
cleaned_ds = ds["cleaned"]
predictions_list = df["predicted_label"].tolist()

# `df` is a name reused by more than one cell in this notebook. Fail early with
# a clear message if it is not the prediction table for this split, instead of
# hitting an IndexError (or silently ignoring extra rows) inside .map().
if len(predictions_list) != len(cleaned_ds):
    raise ValueError(
        f"Got {len(predictions_list)} predictions for {len(cleaned_ds)} rows; "
        "re-run the batch prediction cell so `df` matches ds['cleaned']."
    )

# 3. Define the function that .map() will run on each row
def update_row(example, idx):
    """
    Takes a single row (example) and its index (idx).
    It replaces the value in 'related' with the new prediction.
    """
    example['related'] = predictions_list[idx]
    return example

# 4. Apply the function to the entire dataset
# .map() returns a new dataset, so we must re-assign it
print("Starting to update 'related' column...")
cleaned_ds = cleaned_ds.map(update_row, with_indices=True)

print("Column updated successfully!")
print(cleaned_ds)

# Spot-check a single row (index 5) to see the updated 'related' value.
print(cleaned_ds[5])

Starting to update 'related' column...


Map:   0%|          | 0/201583 [00:00<?, ? examples/s]

Column updated successfully!
Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 201583
})
{'tweet_id': '1913036803143335954', 'time': '2025-04-18T01:08:11', 'author': '@hutanaksara', 'content': 'beberapa zine yang akan saya bawa saat melapak nanti #tolakruutni #supremasisipil', 'comment_count': 0, 'repost_count': 0, 'like_count': 2, 'view_count': 290, 'related': True, 'confidence': None}


In [18]:
view_ds = cleaned_ds.select_columns(["content", "related"])

# Only fetch the rows that are printed instead of converting the whole dataset
# to a Python list just to slice off its first ten entries.
preview_count = min(10, len(view_ds))
for row in view_ds.select(range(preview_count)):
    print(row)

{'content': 'tokoh nu dan islam moderat cak menyentil kinerja kejaksaan. munkin kejaksaan kudu membuktikan kasus pertamina berjalan seperti yang awal digembor2kan, bukan cuma sekedar cari simpati aja! #tolakruutni #cabutruutni', 'related': True}
{'content': 'kembalikan tentara ke barak #tolakruutni', 'related': True}
{'content': 'inilah kenapa instansi yang memperbolehkan penggunaan senjata api itu tidak boleh dicampur adukan dengan ranah sipil #tolakruutni #tolakruupolri #supremasisipil', 'related': True}
{'content': 'amazing elephants group - massai mara national park - kenya - مجموعة رائعة من الأفيال - محمية مساي مارا - كينيا - #tolakruutni #espressolab #notebook0616 #beckysangels #素のまんま #lacasadellosfamososcol #parogeneral #보이넥스트도어', 'related': False}
{'content': 'masih enggak menyangka suamiku tentara #tolakruutni', 'related': True}
{'content': 'beberapa zine yang akan saya bawa saat melapak nanti #tolakruutni #supremasisipil', 'related': True}
{'content': 'sebagai mahasiswa hukum

In [19]:
# Store the relabeled dataset in `ds` under a new "cleaned_labeled" key; the
# original "cleaned" entry is left as it was.
ds["cleaned_labeled"] = cleaned_ds

In [20]:
text_only = ds["cleaned_labeled"]["content"]
label_only = ds["cleaned_labeled"]["related"]

# Write a plain-text dump (one TEXT/LABEL pair per record) for manual reading.
# The tweets contain non-ASCII text (emoji and several scripts), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open("reader.txt", "w", encoding="utf-8") as file:
    for text, label in zip(text_only, label_only):
        file.write(f"TEXT: {text}\n")
        file.write(f"LABEL: {label}\n")
        file.write("========================================================================\n")


In [21]:
# Upload everything currently held in `ds` to the Hub dataset repository.
# Kept as the cell's last expression so the returned commit info is displayed.
ds.push_to_hub("tianharjuno/twitter-parse", commit_description="Labeled cleaned ds")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/9d881a2590e48e8dee2a39a96b86b98982908e53', commit_message='Upload dataset', commit_description='Labeled cleaned ds', oid='9d881a2590e48e8dee2a39a96b86b98982908e53', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)

In [22]:
# Publish the model and its tokenizer to the same Hub model repository.
# The tokenizer push stays last so its commit info is what the cell displays.
model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1", commit_description="Changed training pipeline")
tokenizer.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1", commit_description="Changed training pipeline")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification-p1/commit/faba550a3a6bda6841e4c5115ba87e4f652bb41c', commit_message='Upload tokenizer', commit_description='Changed training pipeline', oid='faba550a3a6bda6841e4c5115ba87e4f652bb41c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification-p1', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/ruu-tni-relevancy-classification-p1'), pr_revision=None, pr_num=None)