In [None]:
# STEP 1: Uninstall lama & install dependensi terbaru yang stabil
!pip uninstall -y transformers -q
!pip uninstall -y peft -q
!pip install -q transformers==4.41.1 datasets==2.14.6 evaluate==0.4.1 sentencepiece==0.2.0
!pip install -q rouge-score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver d

In [None]:
# STEP 2: Imports and setup
import os

import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Trainer,
)

# Colab and memory configuration: switch off Weights & Biases logging and
# configure the CUDA caching allocator, then release any cached GPU memory.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)
torch.cuda.empty_cache()

# Mount Google Drive so the dataset and model directories are reachable.
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the parquet splits from Drive, clean them and convert them to
# Hugging Face datasets.
DATA_DIR = "/content/drive/MyDrive/dataset"
TRAIN_SAMPLE_SIZE = 40_000  # upper bound on the number of training rows used


def _load_split(split_name):
    """Read one parquet split and return a cleaned copy.

    Rows containing any missing value are dropped, the `source`/`target`
    columns are renamed to `article`/`summary`, and the index is reset so it
    is contiguous again after the row drop.
    """
    frame = pd.read_parquet(f"{DATA_DIR}/{split_name}-00000-of-00001.parquet")
    return (
        frame.dropna()
        .rename(columns={'source': 'article', 'target': 'summary'})
        .reset_index(drop=True)
    )


df_train = _load_split("train")
df_val = _load_split("validation")
df_test = _load_split("test")

# Cap the training set. `min` keeps this from raising when fewer than
# TRAIN_SAMPLE_SIZE rows survive the cleaning step.
df_train = df_train.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(df_train)), random_state=42
).reset_index(drop=True)

# Convert to Hugging Face datasets; the pandas index carries no information,
# so it is not stored as an extra column.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
val_dataset = Dataset.from_pandas(df_val, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)


In [None]:
tokenizer = T5TokenizerFast.from_pretrained("/content/drive/MyDrive/model")
# Standard T5 tokenizers already define a dedicated pad token, and T5 models
# use the pad id as the decoder start token. Only fall back to EOS when the
# loaded tokenizer has no pad token at all, instead of always overwriting it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(examples, max_input_length=1024, max_target_length=256):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "article" and a "summary" list of
            strings (the form `Dataset.map(..., batched=True)` passes in).
        max_input_length: Articles are truncated to this many tokens.
        max_target_length: Summaries are truncated to this many tokens.

    Returns:
        The tokenizer output for the prefixed articles with an added
        "labels" entry holding the summary token ids.

    Sequences are deliberately NOT padded here. Padding labels with a real
    token id makes the loss count every padding position as a prediction
    target; leaving the sequences unpadded lets the seq2seq data collator pad
    each batch to its own longest sequence and fill label padding with -100,
    which the loss ignores.
    """
    input_texts = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(
        input_texts,
        max_length=max_input_length,
        truncation=True,
        return_attention_mask=True,
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def _tokenize_split(split):
    """Run `preprocess` over one split in 4 worker processes.

    All original columns are removed so only the tokenizer outputs remain.
    """
    return split.map(
        preprocess,
        batched=True,
        num_proc=4,
        remove_columns=split.column_names,
    )


# Tokenize every split with parallel processing.
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_val_dataset = _tokenize_split(val_dataset)
tokenized_test_dataset = _tokenize_split(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=4):   0%|          | 0/10972 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=4):   0%|          | 0/10972 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# Training configuration for the T5 fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="/content/t5-liputan6",
    evaluation_strategy="epoch",          # evaluate once per epoch
    save_strategy="epoch",                # checkpoint once per epoch; must match the
                                          # evaluation strategy for load_best_model_at_end
    learning_rate=3e-5,
    per_device_train_batch_size=8,        # small batch to limit GPU memory use
    num_train_epochs=5,
    warmup_steps=250,
    generation_num_beams=3,               # only used when evaluation generates text
                                          # (predict_with_generate), which is not enabled here
    load_best_model_at_end=True,          # restore the checkpoint with the lowest eval loss
    metric_for_best_model="eval_loss",
    fp16=True,                            # half precision; requires a GPU that supports it
    report_to="none",                     # supported replacement for the deprecated
                                          # WANDB_DISABLED environment variable
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Load the checkpoint to fine-tune.
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/model")

# Seq2SeqTrainer is the trainer class that pairs with Seq2SeqTrainingArguments;
# the plain Trainer never reads the generation-related settings in `args`.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    # Pads each batch to its longest sequence; passing the model lets the
    # collator prepare decoder inputs from the labels.
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.3453,0.378888
2,0.3453,0.376787
3,0.3386,0.374615
4,0.3365,0.373343
5,0.3372,0.373674


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=25000, training_loss=0.3420798614501953, metrics={'train_runtime': 14457.2557, 'train_samples_per_second': 13.834, 'train_steps_per_second': 1.729, 'total_flos': 5.41367205888e+16, 'train_loss': 0.3420798614501953, 'epoch': 5.0})

In [None]:
# STEP 7: Persist the fine-tuned model and its tokenizer to Google Drive.
# NOTE(review): this is the same directory the checkpoint was loaded from
# before training, so the files there are overwritten — confirm that is intended.
output_path = "/content/drive/MyDrive/model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_path)
print("✅ Model berhasil disimpan ke Google Drive:", output_path)



✅ Model berhasil disimpan ke Google Drive: /content/drive/MyDrive/model


In [None]:
# Draw a fixed random sample of the test split for evaluation. `min` keeps
# this from raising when the test split has fewer rows than requested.
EVAL_SAMPLE_SIZE = 200
df_test_sample = df_test.sample(
    n=min(EVAL_SAMPLE_SIZE, len(df_test)), random_state=42
).reset_index(drop=True)
test_dataset = Dataset.from_pandas(df_test_sample)

# Reload the saved model and tokenizer, using the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained(output_path).to(device)
tokenizer = T5TokenizerFast.from_pretrained(output_path)

def generate_summary(text, max_input_length=1024, max_summary_length=128):
    """Generate a summary for one article with the loaded model.

    Args:
        text: Article text; the "summarize: " task prefix is added here.
        max_input_length: Token limit for the article. The default matches
            the truncation length used when tokenizing the training data, so
            the model sees as much of the article as it did during training.
        max_summary_length: Maximum length of the generated sequence.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = "summarize: " + text
    # A single sequence needs no padding; padding it to the maximum length
    # would only add masked positions for the encoder to process.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(model.device)  # follow the model rather than hard-coding "cuda"
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_length,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate a prediction for every sampled test article.
preds, refs = [], []
for item in tqdm(test_dataset):
    preds.append(generate_summary(item["article"]))
    refs.append(item["summary"])

# Compute ROUGE recall, precision and F1, averaged over all samples.
# NOTE(review): use_stemmer=True applies rouge_score's English (Porter)
# stemmer; the summaries appear to be Indonesian — confirm this is intended.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

ROUGE_LABELS = {'rouge1': "ROUGE-1", 'rouge2': "ROUGE-2", 'rougeL': "ROUGE-L"}
SCORE_FIELDS = ("recall", "precision", "fmeasure")

# One running sum per (rouge type, score field).
totals = {
    rouge_type: dict.fromkeys(SCORE_FIELDS, 0.0) for rouge_type in ROUGE_LABELS
}
for ref, pred in zip(refs, preds):
    score = scorer.score(ref, pred)
    for rouge_type, sums in totals.items():
        for field in SCORE_FIELDS:
            sums[field] += getattr(score[rouge_type], field)

n = len(refs)
if n == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise ValueError("No test samples to evaluate; cannot average ROUGE scores.")

print(f"ROUGE Scores (Average over {n} samples):\n")

for rouge_type, label in ROUGE_LABELS.items():
    sums = totals[rouge_type]
    print(
        f"{label} -> Recall: {sums['recall']/n:.4f}, "
        f"Precision: {sums['precision']/n:.4f}, "
        f"F1-score: {sums['fmeasure']/n:.4f}"
    )

100%|██████████| 200/200 [03:07<00:00,  1.06it/s]

ROUGE Scores (Average over 200 samples):

ROUGE-1 -> Recall: 0.4301, Precision: 0.3623, F1-score: 0.3888
ROUGE-2 -> Recall: 0.2413, Precision: 0.2035, F1-score: 0.2180
ROUGE-L -> Recall: 0.3529, Precision: 0.2972, F1-score: 0.3189



