# Clean the text

In [1]:
from datasets import load_dataset
# Load the tweet corpus (downloaded files are kept under cache/) and keep
# handles to the two labelled sentiment splits used for training and testing.
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
train_ds, test_ds = dataset["train_sentiment"], dataset["test_sentiment"]

README.md: 0.00B [00:00, ?B/s]

data/test_sentiment-00000-of-00001.parqu(…):   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Generating source_stage_1 split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating source_stage_2 split:   0%|          | 0/247820 [00:00<?, ? examples/s]

Generating cleaned split:   0%|          | 0/195952 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/19999 [00:00<?, ? examples/s]

Generating source_labeled split:   0%|          | 0/247820 [00:00<?, ? examples/s]

Generating train_sentiment split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating test_sentiment split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [2]:
import re, unicodedata, jaconv, emoji

# Pre-compiled patterns used by the text-cleaning step.
# _URL:      an http(s) link, up to the next whitespace character.
# _MENTION:  an @handle mention.
# _WS:       any run of whitespace.
# _KUTI_CUT: the first "kutipan" (any letter case) and everything after it,
#            across newlines, up to the end of the string.
_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_WS       = re.compile(r'\s+')
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

# The three "dari" patterns below are defined but not applied by `cleantext`
# as written.
#
# Matches non-whitespace text glued to "dari", followed by whitespace and a
# bare domain name (e.g. "word!dari example.com"). Group 1 is the text before
# "dari"; \S+ is used rather than \w+ so punctuation such as '!' is included.
_DARI_URL_ATTACHED = re.compile(r'(\S+)dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Matches whitespace + "dari" + whitespace + a bare domain name
# (e.g. " dari example.com"); intended to be replaced with an empty string.
_DARI_URL_SPACED = re.compile(r'\s+dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Matches any token that ends in "dari" (e.g. "anarko!dari", "negaradari");
# group 1 is the part before "dari".
# NOTE(review): this also matches ordinary words that merely end in "dari"
# (e.g. "hindari"), so applying it would truncate them.
_DARI_STUCK = re.compile(r'(\S+)dari\b', re.I)

def cleantext(row: dict) -> dict:
    r"""Normalise the tweet text stored under ``row["content"]``.

    The steps run in this order:

    1. Unicode NFKC normalisation, then full-width -> half-width folding of
       digits and ASCII characters (kana are left untouched).
    2. Removal of the boilerplate phrases "tanya grok" and
       "grokproductivitypasang" (exact, case-sensitive matches).
    3. Replacement of the literal two-character sequences ``\n`` and ``\r``
       (backslash + letter) with a space.
    4. Replacement of http(s) links with `` <url> `` and removal of
       "ini tidak tersedia".
    5. Replacement of every @handle with ``@USER``.
    6. Removal of a leading "rt" (any case) followed by whitespace.
    7. Insertion of a space between a 4-digit number and a letter glued to
       it (e.g. "2024ruu" -> "2024 ruu").
    8. Truncation at the first "kutipan" (any case).
    9. Collapsing of whitespace runs to one space and trimming both ends.

    Args:
        row: Dict-like example with a string ``"content"`` entry.

    Returns:
        The same ``row`` object, with ``"content"`` replaced by the cleaned
        text.
    """
    text = row["content"]
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")
    # These are escaped sequences (a backslash followed by 'n' / 'r'), not
    # real control characters; real newlines are handled by _WS below.
    text = text.replace('\\n', ' ').replace('\\r', ' ')

    # Mask standard URLs before anything else touches their contents.
    text = _URL.sub(' <url> ', text)
    text = text.replace('ini tidak tersedia', ' ')

    text = _MENTION.sub('@USER', text)
    text = re.sub(r'^rt\s+', '', text, flags=re.I)
    text = re.sub(r'(\b\d{4})(?=[a-zA-Z])', r'\1 ', text)
    text = _KUTI_CUT.sub('', text)

    # NOTE: the _DARI_URL_ATTACHED / _DARI_URL_SPACED / _DARI_STUCK patterns
    # are currently not applied here.

    text = _WS.sub(' ', text).strip()
    row["content"] = text
    return row

In [3]:
# Clean both splits using 12 worker processes, then rename the target column
# from "sentiment" to "label".
train_ds = train_ds.map(cleantext, num_proc=12).rename_column("sentiment", "label")
test_ds = test_ds.map(cleantext, num_proc=12).rename_column("sentiment", "label")

Map (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/10000 [00:00<?, ? examples/s]

# Load the model and tokenize the data

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import ClassLabel
import torch
# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
# `torch.backends.mps.is_available()` is the documented MPS probe;
# `torch.mps.is_available()` is missing on older PyTorch releases and would
# raise AttributeError on machines without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Label ids follow the order of this list: 0=Negative, 1=Neutral, 2=Positive.
class_labels = ClassLabel(names=["Negative", "Neutral", "Positive"])

# Store the id <-> name mapping in the model config so that the saved model
# reports readable class names instead of the generic LABEL_0/1/2.
model = AutoModelForSequenceClassification.from_pretrained(
    'indolem/indobertweet-base-uncased',
    cache_dir="cache/",
    num_labels=len(class_labels.names),
    id2label=dict(enumerate(class_labels.names)),
    label2id={name: idx for idx, name in enumerate(class_labels.names)},
)
tokenizer = AutoTokenizer.from_pretrained('indolem/indobertweet-base-uncased', cache_dir="cache/")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
def tokenize(batch):
    """Tokenize the ``"content"`` entries of ``batch`` with the notebook's tokenizer.

    Every sequence is truncated and padded to a fixed length of 128 tokens,
    and the encoding is requested as PyTorch tensors.
    """
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(batch["content"], **encoding_options)
# Tokenize both splits in batches across 12 worker processes, then cast the
# "label" column to the ClassLabel feature held in `class_labels`.
train_ds = train_ds.map(tokenize, batched=True, num_proc=12).cast_column("label", class_labels)
test_ds = test_ds.map(tokenize, batched=True, num_proc=12).cast_column("label", class_labels)

Map (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
def compute_metrics(class_names):
    """Build a metrics callback for the given ordered class names.

    Args:
        class_names: Sequence of class names; position ``i`` names label id ``i``.

    Returns:
        A function taking an ``(logits, labels)`` pair and returning a flat
        dict with ``accuracy``, ``macro_f1``, ``macro_precision``,
        ``macro_recall`` and, for every class name, ``<name>_precision``,
        ``<name>_recall``, ``<name>_f1`` and ``<name>_support``.
    """
    # Imported here so the callback does not rely on a later notebook cell
    # having already imported these names into the global namespace.
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    num_classes = len(class_names)

    def callback(eval_pred):
        logits, labels = eval_pred
        # Accept torch tensors as well as numpy arrays.
        if isinstance(logits, torch.Tensor):
            logits = logits.detach().cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)

        # Macro averages; undefined ratios are reported as 0 instead of warning.
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels, preds, average="macro", zero_division=0
        )
        acc = accuracy_score(labels, preds)

        # Per-class scores over the full label set, so every class gets an
        # entry even when it does not occur in this evaluation set.
        p_cls, r_cls, f1_cls, support_cls = precision_recall_fscore_support(
            labels,
            preds,
            average=None,
            zero_division=0,
            labels=list(range(num_classes)),
        )
        metrics = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "macro_precision": macro_p,
            "macro_recall": macro_r,
        }
        for idx, name in enumerate(class_names):
            metrics[f"{name}_precision"] = p_cls[idx]  # type: ignore
            metrics[f"{name}_recall"] = r_cls[idx]  # type: ignore
            metrics[f"{name}_f1"] = f1_cls[idx]  # type: ignore
            metrics[f"{name}_support"] = int(support_cls[idx])  # type: ignore
        return metrics

    return callback

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

class FocalLoss(nn.Module):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * CE(inputs, targets)``.

    ``p_t`` is the softmax probability the model assigns to the true class.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        """
        Args:
            alpha (Tensor or sequence of float, optional): Weight for each
                class, indexed by class id. May live on any device; it is
                moved to the loss device inside ``forward``.
            gamma (float, optional): Focusing parameter (default=2.0).
                ``gamma=0`` reduces the loss to (weighted) cross entropy.
            reduction (str, optional): ``'mean'`` or ``'sum'``; any other
                value returns the unreduced per-sample loss.
        """
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Raw logits, as accepted by ``F.cross_entropy``.
            targets: Integer class ids, one per row of ``inputs``.

        Returns:
            A scalar tensor for ``'mean'``/``'sum'`` reduction, otherwise the
            per-sample loss tensor.
        """
        # Per-sample cross entropy, deliberately unweighted so that p_t below
        # is the true-class probability.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        # CE = -log(p_t)  =>  p_t = exp(-CE)
        pt = torch.exp(-ce_loss)

        # Down-weight easy examples (p_t close to 1).
        loss = (1 - pt) ** self.gamma * ce_loss

        # Apply the per-class weights, if any.
        if self.alpha is not None:
            alpha = self.alpha
            if not torch.is_tensor(alpha):
                # Accept plain Python sequences of weights as well.
                alpha = torch.as_tensor(alpha, dtype=loss.dtype)
            # Indexing a weight tensor with targets on another device fails,
            # so bring the weights to wherever the loss lives.
            alpha = alpha.to(loss.device)
            loss = loss * alpha[targets]

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

class FocalLossTrainer(Trainer):
    """``Trainer`` variant that computes its loss with ``FocalLoss``."""

    def __init__(self, *args, class_weights=None, gamma=2.0, **kwargs):
        """Forward the standard arguments to ``Trainer`` and keep the loss settings.

        Args:
            class_weights: Optional per-class weight tensor passed to the
                loss as ``alpha``.
            gamma: Focusing parameter passed to the loss.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.gamma = gamma

    def compute_loss(self, input_model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the focal loss.

        ``"labels"`` is removed from ``inputs`` before the forward pass, so
        the model does not compute its own loss. Returns ``(loss, outputs)``
        when ``return_outputs`` is true, otherwise just ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = input_model(**inputs)
        logits = outputs.get("logits")

        # Class weights must sit on the same device as the logits.
        alpha = None if self.class_weights is None else self.class_weights.to(logits.device)

        criterion = FocalLoss(alpha=alpha, gamma=self.gamma)
        n_labels = input_model.config.num_labels
        loss = criterion(logits.view(-1, n_labels), targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

In [10]:
import torch
from collections import Counter

# 1. Count how many training examples carry each label id.
true_labels = train_ds["label"]
label_counts = Counter(true_labels)
n_classes = len(class_labels.names)  # derived from the label set rather than hard-coded
total_samples = sum(label_counts.values())

# 2. Order the counts by class id (0, 1, 2, ...). The loss looks up a weight
# by indexing this vector with the target class id, so position must equal id.
counts = [label_counts[i] for i in range(n_classes)]

# A class without any examples would otherwise surface as a bare
# ZeroDivisionError in the formula below.
empty_classes = [i for i, c in enumerate(counts) if c == 0]
if empty_classes:
    raise ValueError(
        f"No training examples for class id(s) {empty_classes}; "
        "cannot compute balanced class weights."
    )

# 3. Balanced weights: total / (num_classes * count_of_class).
weights = [total_samples / (n_classes * c) for c in counts]

my_weights = torch.tensor(weights, dtype=torch.float)

print(f"Class Counts: {counts}")
print(f"Calculated Weights: {my_weights}")

Class Counts: [14500, 3820, 1680]
Calculated Weights: tensor([0.4598, 1.7452, 3.9683])


In [12]:
from transformers.training_args import TrainingArguments
from transformers.data.data_collator import default_data_collator
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
training_args = TrainingArguments(
    overwrite_output_dir=True,
    eval_strategy="epoch",           # evaluate at the end of each epoch
    save_strategy="epoch",           # save a checkpoint at the end of each epoch
    learning_rate=1e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_strategy="epoch",
    load_best_model_at_end=True,     # reload the checkpoint with the best metric below
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    warmup_ratio=0.01,
    # fp16 mixed precision needs a CUDA device. The notebook also allows
    # running on MPS or CPU, where an unconditional fp16=True is rejected,
    # so only enable it when CUDA is actually present.
    fp16=torch.cuda.is_available(),
)
compute_callback = compute_metrics(class_labels.names)

# Hold out 20% of the training split for per-epoch validation, keeping the
# class proportions of the "label" column in both parts.
train_ds_split = train_ds.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")
trainer = FocalLossTrainer(
    model = model,
    args = training_args,
    train_dataset = train_ds_split["train"],
    eval_dataset=train_ds_split["test"],
    data_collator=default_data_collator,
    compute_metrics=compute_callback,
    class_weights=my_weights
)
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,Negative Precision,Negative Recall,Negative F1,Negative Support,Neutral Precision,Neutral Recall,Neutral F1,Neutral Support,Positive Precision,Positive Recall,Positive F1,Positive Support
1,0.2735,0.19266,0.763,0.71347,0.680481,0.798037,0.961504,0.74069,0.836774,2900,0.479938,0.814136,0.603883,764,0.6,0.839286,0.699752,336
2,0.1711,0.172374,0.80725,0.757026,0.72244,0.821569,0.956737,0.80069,0.871785,2900,0.542358,0.812827,0.650602,764,0.668224,0.85119,0.748691,336
3,0.1382,0.165579,0.819,0.769977,0.737196,0.830124,0.960816,0.811724,0.88,2900,0.560526,0.836387,0.671218,764,0.690244,0.842262,0.758713,336
4,0.117,0.169035,0.83975,0.783381,0.747637,0.836954,0.954246,0.848621,0.898339,2900,0.61274,0.793194,0.691386,764,0.675926,0.869048,0.760417,336
5,0.0973,0.192012,0.828,0.784357,0.756352,0.839899,0.968124,0.816897,0.886104,2900,0.565329,0.866492,0.684238,764,0.735602,0.83631,0.78273,336
6,0.0821,0.198228,0.84375,0.797144,0.772081,0.840653,0.961871,0.843793,0.898971,2900,0.596878,0.850785,0.701565,764,0.757493,0.827381,0.790896,336
7,0.0734,0.208628,0.848,0.798954,0.771053,0.841851,0.957817,0.853448,0.902626,2900,0.61256,0.829843,0.704836,764,0.742782,0.842262,0.7894,336
8,0.0678,0.226386,0.84225,0.801372,0.780922,0.844315,0.966122,0.835862,0.896284,2900,0.585814,0.875654,0.701994,764,0.790831,0.821429,0.805839,336
9,0.062,0.226026,0.851,0.803649,0.780515,0.840773,0.957258,0.857241,0.904493,2900,0.614203,0.837696,0.708749,764,0.770083,0.827381,0.797704,336
10,0.0582,0.22909,0.851,0.804875,0.781972,0.842701,0.958639,0.855172,0.903955,2900,0.612903,0.84555,0.710671,764,0.774373,0.827381,0.8,336


TrainOutput(global_step=1250, training_loss=0.11406148338317872, metrics={'train_runtime': 252.8152, 'train_samples_per_second': 632.873, 'train_steps_per_second': 4.944, 'total_flos': 1.052453670912e+16, 'train_loss': 0.11406148338317872, 'epoch': 10.0})

In [13]:
# Score the trained model on the separate test split; the returned dict of
# metrics is displayed as the cell output.
trainer.evaluate(test_ds)

{'eval_loss': 0.21958070993423462,
 'eval_accuracy': 0.8436,
 'eval_macro_f1': 0.8018332281076606,
 'eval_macro_precision': 0.7843743480557018,
 'eval_macro_recall': 0.8280194155243551,
 'eval_Negative_precision': 0.9400246685168054,
 'eval_Negative_recall': 0.8574040219378428,
 'eval_Negative_f1': 0.896815474001618,
 'eval_Negative_support': 7111,
 'eval_Neutral_precision': 0.6296433878157504,
 'eval_Neutral_recall': 0.8052256532066508,
 'eval_Neutral_f1': 0.7066916823014384,
 'eval_Neutral_support': 2105,
 'eval_Positive_precision': 0.7834549878345499,
 'eval_Positive_recall': 0.8214285714285714,
 'eval_Positive_f1': 0.8019925280199253,
 'eval_Positive_support': 784,
 'eval_runtime': 4.8029,
 'eval_samples_per_second': 2082.058,
 'eval_steps_per_second': 8.328,
 'epoch': 10.0}

In [14]:
# Upload the fine-tuned model weights and config to the Hugging Face Hub.
trainer.model.push_to_hub("tianharjuno/ruu-tni-sentiment-classification", commit_message="Initial Commit")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpkzhsgd63/model.safetensors    :   0%|          |  557kB /  442MB            

CommitInfo(commit_url='https://huggingface.co/tianharjuno/ruu-tni-sentiment-classification/commit/104bbd3485e036bd1d891174910cdd472f9b4497', commit_message='Initial Commit', commit_description='', oid='104bbd3485e036bd1d891174910cdd472f9b4497', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/ruu-tni-sentiment-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/ruu-tni-sentiment-classification'), pr_revision=None, pr_num=None)

In [15]:
# Upload the tokenizer files to the same Hub repository as the model.
tokenizer.push_to_hub("tianharjuno/ruu-tni-sentiment-classification", commit_message="Initial Commit")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tianharjuno/ruu-tni-sentiment-classification/commit/104bbd3485e036bd1d891174910cdd472f9b4497', commit_message='Initial Commit', commit_description='', oid='104bbd3485e036bd1d891174910cdd472f9b4497', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/ruu-tni-sentiment-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/ruu-tni-sentiment-classification'), pr_revision=None, pr_num=None)

In [21]:
# Keep only the rows flagged as relevant, then run them through the same
# cleaning and tokenization steps that were applied to the training data.
whole_label_ds = dataset["source_labeled"]
related_ds = (
    whole_label_ds
    .filter(lambda x: x["relevant"] == True)
    .map(cleantext, num_proc=20)
    .map(tokenize, num_proc=20, batched=True, batch_size=128)
)

Map (num_proc=20):   0%|          | 0/147701 [00:00<?, ? examples/s]

In [22]:
# Run inference over every relevant tweet with the trained model.
predictions = trainer.predict(related_ds)

In [23]:
# Predicted class id per row: the index of the largest value along axis 1.
predicted_labels = predictions.predictions.argmax(axis=1)

In [29]:
# 1. Build the lookup {tweet id (as string) -> predicted class id}.
# Ids are stringified on both sides of the lookup so that a match does not
# depend on the dtype of the id column. The whole column is read at once
# instead of iterating over rows.
relevant_ids = list(map(str, related_ds['tweet_id']))
prediction_map = {
    tweet_id: label
    for tweet_id, label in zip(relevant_ids, predicted_labels)
}

print(f"Mapped {len(prediction_map)} predictions.")

# 2. Define the merging function
def apply_predictions(example):
    """Write the predicted sentiment id into ``example['sentiment']``.

    Examples whose tweet id has no entry in ``prediction_map`` get ``-1``.
    Returns the same ``example`` object.
    """
    lookup_key = str(example['tweet_id'])

    if lookup_key in prediction_map:
        sentiment = int(prediction_map[lookup_key])
    else:
        sentiment = -1

    example['sentiment'] = sentiment
    return example

# 3. Apply to the WHOLE labelled dataset, not just the relevant subset.
# Every row gets a 'sentiment' value: the predicted class id when the tweet id
# is in the lookup, and -1 otherwise.
final_ds = whole_label_ds.map(apply_predictions)

# Verification: show how many rows ended up with each sentiment value.
print("Label counts in final dataset:")
from collections import Counter
print(Counter(final_ds['sentiment']))

Mapped 147701 predictions.


Map:   0%|          | 0/247820 [00:00<?, ? examples/s]

Label counts in final dataset:
Counter({0: 104955, -1: 100119, 1: 31335, 2: 11411})


In [31]:
# Swap in the version of "source_labeled" that carries the predicted
# sentiment, then upload the dataset object to the Hugging Face Hub.
dataset["source_labeled"] = final_ds
dataset.push_to_hub("tianharjuno/twitter-parse", commit_message="Labeled sentiment")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/10a15a58ddb21f02087ecd00dfda85978e3fcc46', commit_message='Labeled sentiment', commit_description='', oid='10a15a58ddb21f02087ecd00dfda85978e3fcc46', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)