# Fine-Tuning Sentiment and Emotion Classifiers

## Dependencies and Credentials

In [None]:
#@title Install Dependencies
!pip install torch transformers datasets accelerate wandb evaluate huggingface_hub -q

In [None]:
#@title WandB Login
# Imported here because this cell runs before the main import cell on a fresh kernel.
import wandb

wandb.login()

In [None]:
#@title Hugging Face Login
# Authenticate through the Python API instead of shelling out to the CLI;
# with no token argument, login() prompts for one interactively.
from huggingface_hub import login

login()

In [None]:
#@title Import Dependencies
import pandas as pd
import torch
import numpy as np
import wandb
from datasets import Dataset, load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from huggingface_hub import HfApi, create_repo
import evaluate

## Prepare Data

In [None]:
#@title Handle Noise and Class Imbalance

def undersample(df, label_col, limit):
    """Cap every class at ``limit`` rows and return a shuffled two-column frame.

    Args:
        df: DataFrame containing a ``'text'`` column and the column ``label_col``.
        label_col: Name of the label column; it is renamed to ``'label'`` in the
            result. When it is ``'emotion_llm'``, rows labelled ``'Neutral'``
            are dropped before balancing.
        limit: Maximum number of rows kept per class.

    Returns:
        DataFrame with columns ``['text', 'label']``, at most ``limit`` rows per
        class, shuffled with a fixed seed and re-indexed from 0.
    """
    df = df[['text', label_col]].rename(columns={label_col: 'label'})
    if label_col == 'emotion_llm':
        df = df[df['label'] != 'Neutral']

    # Sample group by group rather than through groupby.apply: apply operating
    # on the grouping column is deprecated in recent pandas, and iterating the
    # groups keeps the 'label' column in every piece.
    parts = [
        group.sample(min(len(group), limit), random_state=42)
        for _, group in df.groupby('label')
    ]
    if not parts:
        # Nothing to balance (empty input or every row filtered out).
        return df.reset_index(drop=True)

    # Second shuffle interleaves the classes, which are concatenated group by group.
    return pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#@title Convert Data into Hugging Face Format

def get_dataset_info(dataset):
    """Encode string labels as integer ids and convert the frame to a ``Dataset``.

    Args:
        dataset: pandas DataFrame with a ``"label"`` column.

    Returns:
        Tuple ``(hf_dataset, label_list, id2label, label2id)`` where
        ``hf_dataset`` is a Hugging Face ``Dataset`` whose ``"label"`` column
        holds integer ids, ``label_list`` is the ordered list of distinct
        labels, and the two dicts map between ids and labels.
    """
    # Sort so the id assignment is the same on every run; iterating a bare set
    # gives an arbitrary order, which would make saved id2label mappings
    # differ between sessions.
    label_list = sorted(set(dataset["label"]), key=str)
    id2label = dict(enumerate(label_list))
    label2id = {label: idx for idx, label in id2label.items()}

    # Encode in pandas (vectorized) before handing the frame to `datasets`.
    encoded = dataset.assign(label=dataset["label"].map(label2id))
    hf_dataset = Dataset.from_pandas(encoded, preserve_index=False)
    return hf_dataset, label_list, id2label, label2id

## Load Model and Training

In [None]:
#@title Load Pretrained Model

def load_model_and_tokenizer(model_name, label_list, id2label, label2id):
    """Load a sequence-classification checkpoint and its tokenizer.

    Args:
        model_name: Hub id or local path passed to ``from_pretrained``.
        label_list: Distinct labels; its length sets ``num_labels``.
        id2label: Mapping from integer id to label, stored in the model config.
        label2id: Mapping from label to integer id, stored in the model config.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        # The checkpoint may ship a classification head with a different number
        # of classes; let it be re-initialised instead of failing on the shape
        # mismatch.
        ignore_mismatched_sizes=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [None]:
#@title Define Training Function

def setup_trainer(model, tokenizer, dataset, run_name):
    """Tokenize ``dataset``, split it 80/20 and build a ``Trainer``.

    Args:
        model: Sequence-classification model to fine-tune.
        tokenizer: Tokenizer matching ``model``.
        dataset: Hugging Face ``Dataset`` with a ``"text"`` column.
        run_name: Used as the run name and for the ``./{run_name}_politics``
            output directory.

    Returns:
        A configured, not yet trained, ``Trainer``.
    """
    import inspect

    def tokenize_function(examples):
        # Truncate only; DataCollatorWithPadding pads each batch to its own
        # longest sequence, so padding everything to 512 here would only waste
        # compute.
        return tokenizer(examples["text"], truncation=True, max_length=512)

    tokenized = dataset.map(tokenize_function, batched=True)
    # Fixed seed so the train/eval split is reproducible across runs.
    splits = tokenized.train_test_split(test_size=0.2, seed=42)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=f"./{run_name}_politics",
        run_name=run_name,
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=10,
        report_to="wandb",
        **{strategy_key: "epoch"},
    )

    # Same for Trainer: `tokenizer=` was superseded by `processing_class=`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )
    return trainer

In [None]:
#@title Define Metrics

def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        ``{"accuracy": float}`` — the fraction of predictions equal to ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so the metric is not re-loaded on every
    # evaluation call.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
#@title Training Execution

def train_model(model_name, dataset, run_name):
    """Fine-tune ``model_name`` on ``dataset`` and save the result locally.

    Args:
        model_name: Checkpoint id or path handed to ``load_model_and_tokenizer``.
        dataset: pandas DataFrame handed to ``get_dataset_info``.
        run_name: W&B run name; the model and tokenizer are saved to
            ``./{run_name}_politics``.

    Raises:
        Re-raises anything raised during preparation, training or saving, after
        closing the W&B run with a non-zero exit code.
    """
    output_dir = f"./{run_name}_politics"
    wandb.init(project="politics-finetuning", name=run_name)
    try:
        dataset, label_list, id2label, label2id = get_dataset_info(dataset)
        model, tokenizer = load_model_and_tokenizer(model_name, label_list, id2label, label2id)
        trainer = setup_trainer(model, tokenizer, dataset, run_name)
        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
    except BaseException:
        # Includes KeyboardInterrupt (a manually stopped cell): close the run
        # as failed so it is not left open for the next call.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()

## Upload Model to Hugging Face Hub

In [None]:
#@title Upload Model to Hugging Face Hub
def push_to_huggingface(local_model_path, repo_name, private=True, namespace="rizkydata"):
    """Create the Hub repository if needed and upload a local model folder.

    Args:
        local_model_path: Folder whose contents are uploaded.
        repo_name: Repository name without the namespace.
        private: Whether the repository is created as private.
        namespace: User or organisation that owns the repository.
    """
    # Use one fully-qualified id for both calls so the repository that gets
    # created is the same one that receives the upload.
    repo_id = f"{namespace}/{repo_name}"
    create_repo(repo_id, private=private, exist_ok=True)
    api = HfApi()
    api.upload_folder(
        folder_path=local_model_path,
        repo_id=repo_id,
        commit_message="Uploading fine-tuned model"
    )
    print(f"Model pushed to Hugging Face: https://huggingface.co/{repo_id}")

## Runs

In [None]:
#@title Load Data
id_danantara = "16F7tUU4vJSX4MojgCKkpB6fACOoRJQCv"
id_ihsg = "1-66qhsf_15aT5deplITZxX9V197P8qJ7"

# Fetch both source tables from Google Drive by file id.
# NOTE(review): unpickling executes code embedded in the file — only load
# pickles from a source you trust.
df_danantara = pd.read_pickle("https://drive.google.com/uc?id=" + id_danantara)
df_ihsg = pd.read_csv("https://drive.google.com/uc?id=" + id_ihsg)

# Stack the shared text/label columns of both tables under a fresh index.
df = pd.concat(
    (frame[['text', 'sentiment_llm', 'emotion_llm']] for frame in (df_danantara, df_ihsg)),
    ignore_index=True,
)

In [None]:
#@title Handle Class Imbalance
# Per-class row caps for the two tasks.
sentiment_limit, emotion_limit = 800, 250

df_sentiment_balanced = undersample(df, label_col='sentiment_llm', limit=sentiment_limit)
df_emotion_balanced = undersample(df, label_col='emotion_llm', limit=emotion_limit)

In [None]:
#@title Train Sentiment Model
# Base checkpoint fine-tuned on the balanced sentiment data.
sentiment_model_name = "Aardiiiiy/indobertweet-base-Indonesian-sentiment-analysis"
train_model(
    model_name=sentiment_model_name,
    dataset=df_sentiment_balanced,
    run_name="indobertweet_sentiment",
)

In [None]:
#@title Train Emotion Model
# Base checkpoint fine-tuned on the balanced emotion data.
emotion_model_name = "Aardiiiiy/NusaBERT-base-Indonesian-Plutchik-emotion-analysis-v2"
train_model(
    model_name=emotion_model_name,
    dataset=df_emotion_balanced,
    run_name="NusaBERT_emotion",
)

In [None]:
#@title Push Model to Hugging Face Hub
# Publish both fine-tuned models as public repositories.
push_to_huggingface(
    local_model_path="./indobertweet_sentiment_politics",
    repo_name="indobertweet-sentiment-politics",
    private=False,
)
push_to_huggingface(
    local_model_path="./NusaBERT_emotion_politics",
    repo_name="nusabert-emotion-politics",
    private=False,
)