In [1]:
!pip install -q transformers datasets sentencepiece accelerate
!pip install --upgrade huggingface_hub datasets
from google.colab import userdata


HF_TOKEN = userdata.get('HF_TOKEN')


!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `HF_TOKEN` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `HF_TOKEN`


In [2]:
#ENV
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
from collections import defaultdict
import os


os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
MODEL_NAME = "google-t5/t5-base"
DATASET_NAME = "rajpurkar/squad_v2"
OUTPUT_DIR = "t5-base-qg-squadv2-finetuned"

HUB_MODEL_ID = "PlasticTr33s/t5-base-multi-qg-squadv2"

# Parameter
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 5e-5
NUM_TRAIN_EPOCHS = 4

In [4]:
#Load Data
print("Memuat dataset mentah dan tokenizer...")
dataset = load_dataset(DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Filter karena ada pertanyaan yang tidak memiliki jawaban
print(f"Ukuran dataset 'train' sebelum filter: {len(dataset['train'])}")
dataset = dataset.filter(lambda example: len(example['answers']['text']) > 0, num_proc=2)
print(f"Ukuran dataset 'train' setelah filter: {len(dataset['train'])}")

# 3.2. Fungsi untuk mengelompokkan semua pertanyaan berdasarkan konteks
def group_questions_by_context(dataset_split):
    print(f"Mengelompokkan pertanyaan untuk split data...")
    context_map = defaultdict(list)
    for example in dataset_split:
        context_map[example['context']].append(example['question'])

    new_examples = {'context': [], 'questions': []}
    for context, questions in context_map.items():
        new_examples['context'].append(context)
        new_examples['questions'].append(" <sep> ".join(questions))

    return Dataset.from_dict(new_examples)

train_dataset_multi = group_questions_by_context(dataset['train'])
validation_dataset_multi = group_questions_by_context(dataset['validation'])

# Preprosesing
prefix = "generate questions: "

def preprocess_function_multi_qg(examples):
    inputs = [prefix + context for context in examples["context"]]
    targets = examples["questions"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

processed_datasets = DatasetDict({
    'train': train_dataset_multi.map(preprocess_function_multi_qg, batched=True, remove_columns=train_dataset_multi.column_names, num_proc=2),
    'validation': validation_dataset_multi.map(preprocess_function_multi_qg, batched=True, remove_columns=validation_dataset_multi.column_names, num_proc=2)
})

print(f"\nContoh input setelah tokenisasi:\n{tokenizer.decode(processed_datasets['train'][0]['input_ids'], skip_special_tokens=True)}")
print(f"\nContoh label (gabungan) setelah tokenisasi:\n{tokenizer.decode(processed_datasets['train'][0]['labels'], skip_special_tokens=True)}")




Memuat dataset mentah dan tokenizer...


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Ukuran dataset 'train' sebelum filter: 130319


Filter (num_proc=2):   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter (num_proc=2):   0%|          | 0/11873 [00:00<?, ? examples/s]

Ukuran dataset 'train' setelah filter: 86821
Mengelompokkan pertanyaan untuk split data...
Mengelompokkan pertanyaan untuk split data...


Map (num_proc=2):   0%|          | 0/18877 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1204 [00:00<?, ? examples/s]


Contoh input setelah tokenisasi:
generate questions: Beyoncé Giselle Knowles-Carter (/bijnse/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

Contoh label (gabungan) setelah tokenisasi:
When did Beyonce start becoming popular? sep> What areas did Beyonce compete in when she was growing up? sep> When did Beyonce leave Destiny's Child and become a solo singer? sep> In what city and state did Bey

In [5]:
#Setup training

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to="none",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    logging_dir='./logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:

#Start Training
print("\nStart")
trainer.train()
print("Training selesai!")


Start


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,1.7508
200,0.55
300,0.5199
400,0.5221
500,0.5042
600,0.4847
700,0.4894
800,0.4747
900,0.4792
1000,0.4775


Step,Training Loss
100,1.7508
200,0.55
300,0.5199
400,0.5221
500,0.5042
600,0.4847
700,0.4894
800,0.4747
900,0.4792
1000,0.4775


No files have been modified since last commit. Skipping to prevent empty commit.


Training selesai!


In [7]:
#Save Model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Uploading...:   0%|          | 0.00/892M [00:00<?, ?B/s]

('t5-base-qg-squadv2-finetuned/tokenizer_config.json',
 't5-base-qg-squadv2-finetuned/special_tokens_map.json',
 't5-base-qg-squadv2-finetuned/spiece.model',
 't5-base-qg-squadv2-finetuned/added_tokens.json',
 't5-base-qg-squadv2-finetuned/tokenizer.json')