Reference: [Hugging Face Transformers – Encoder Decoder Models](https://huggingface.co/docs/transformers/en/model_doc/encoder-decoder)

In [None]:
# !pip install accelerate -U



In [None]:
# !pip install evaluate rouge_score transformers[torch]

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=06cdd1e5b6d584356538eff102361597cc2a6fa316700a42288d4295c259d035
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2


# Model

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from datasets import Dataset, DatasetDict, load_metric
import evaluate
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd

In [None]:
torch.cuda.is_available()

True

In [None]:
# # max_split_size_mb can be lowered if the GPU runs out of memory (OOM)
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [None]:
# Prefix for the CSV paths; the empty string resolves them against the working directory.
folder_data = ""
# Read the train / validation / test splits, in that order.
train_set, valid_set, test_set = (
    pd.read_csv(folder_data + csv_name)
    for csv_name in ('summ_train.csv', 'summ_val.csv', 'summ_test.csv')
)

# train_set = train_set.sample(n=3000, random_state=88).reset_index(drop=True)
# valid_set = valid_set.sample(n=1000, random_state=88).reset_index(drop=True)
# test_set = test_set.sample(n=1000, random_state=88).reset_index(drop=True)

In [None]:
train_set.shape, valid_set.shape, test_set.shape

((1488, 3), (318, 3), (320, 3))

In [None]:
train_set.duplicated().sum(), valid_set.duplicated().sum(), test_set.duplicated().sum()

(0, 0, 0)

In [None]:
train_set.head()

Unnamed: 0,File_path,Articles,Summaries
0,entertainment,Spears seeks aborted tour payment..Singer Brit...,The pop star cancelled her Onyx Hotel tour las...
1,sport,Finnan says Irish can win group..Steve Finnan ...,"Switzerland, Ireland, France and Israel are al..."
2,politics,BNP leader Nick Griffin arrested..The leader o...,Mr Griffin is the twelfth man to be arrested f...
3,politics,Boris opposes mayor apology..Ken Livingstone s...,"However, Mr Johnson, who was forced to apologi..."
4,business,Absa and Barclays talks continue..South Africa...,South Africa biggest retail bank Absa has said...


In [None]:
valid_set.head()

Unnamed: 0,File_path,Articles,Summaries
0,tech,California sets fines for spyware..The makers ...,"From 1 January, a new law is being introduced ..."
1,sport,Scots suffer another injury blow..Scotland's b...,"Another potential option, Glasgow flanker Andr..."
2,entertainment,Ethnic producers 'face barriers'..Minority eth...,Minority ethnic led (Mel) production companies...
3,politics,BAA support ahead of court battle..UK airport ...,"""We do not underestimate the scale of the chal..."
4,politics,'Hitler' row over Welsh arts cash..An artist c...,She said the assembly government was not best ...


In [None]:
test_set.head()

Unnamed: 0,File_path,Articles,Summaries
0,entertainment,France set for new Da Vinci novel..French book...,Angels and Demons was written before The Da Vi...
1,politics,Pakistani women 'must not hide'..Hiding women ...,"Speaking in London on Monday, Gen Musharraf sa..."
2,sport,Moya clinches Cup for Spain..Spain won the Dav...,Spain won the Davis Cup for the second time in...
3,business,BBC poll indicates economic gloom..Citizens in...,"In percentage terms, an average of 44% of resp..."
4,sport,Radcliffe will compete in London..Paula Radcli...,"""Boston is definitely a race I want to do at s..."


In [None]:
# Wrap each pandas split as a Hugging Face Dataset.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_set, valid_set, test_set)
)

# Bundle the splits under the keys that the tokenization and training cells look up.
dataset_dict = DatasetDict(
    dict(train=train_dataset, validation=valid_dataset, test=test_dataset)
)

## Define Model

In [None]:
# The tokenizer and the seq2seq model are both loaded from the same checkpoint id.
checkpoint_name = "minhtoan/t5-finetune-cnndaily-news"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
# tokenizer.bos_token = tokenizer.cls_token
# tokenizer.eos_token = tokenizer.sep_token
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Batch collator handed to the trainer. preprocess_function tokenizes without any
# padding argument, so per-batch padding is left to this collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Text prepended to every article before tokenization (currently nothing).
prefix = ""

def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: Batch mapping with "Articles" and "Summaries" columns. It is
            applied with ``batched=True``, so each column holds several rows.

    Returns:
        The tokenizer output for the prefixed articles (truncated to 512
        tokens) with an extra "labels" entry containing the token ids of the
        summaries (truncated to 128 tokens).
    """
    prefixed_articles = []
    for article in examples["Articles"]:
        prefixed_articles.append(prefix + article)

    batch = tokenizer(prefixed_articles, truncation=True, max_length=512)
    target_encoding = tokenizer(
        text_target=examples["Summaries"],
        truncation=True,
        max_length=128,
    )

    batch["labels"] = target_encoding["input_ids"]
    return batch

In [None]:
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Map:   0%|          | 0/318 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

## Training

In [None]:
# Fine-tuning configuration passed to the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # 8 samples x 2 accumulation steps = 16 samples per optimizer step per device.
    gradient_accumulation_steps=2,
    weight_decay=0.03,
    save_total_limit=3,
    num_train_epochs=5,
    # Evaluate on generated sequences so compute_metrics can decode them for ROUGE.
    predict_with_generate=True,
    # logging_dir='./logs',
    logging_steps=50,
    # Model selection: reload the best checkpoint at the end, ranked by ROUGE-2 (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)

In [None]:
rouge = evaluate.load('rouge')

def _one_sentence_per_line(text):
    """Return `text` with each sentence on its own line.

    Sentences are split heuristically after '.', '!' or '?' followed by
    whitespace. rougeLsum treats every line as one sentence.
    """
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    return "\n".join(sentence for sentence in sentences if sentence)

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. When
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE metric name to its score rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Bug fix: this branch previously read an undefined name (`preds`).
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so map it to
    # the pad token id in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Directly decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects one sentence per line. The previous word-level split
    # put every token on its own line, which made rougeLsum mirror ROUGE-1.
    decoded_preds = [_one_sentence_per_line(decoded_pred) for decoded_pred in decoded_preds]
    decoded_labels = [_one_sentence_per_line(decoded_label) for decoded_label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, arguments, tokenized train/validation splits, metric function
# and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # NOTE(review): patience (5) equals num_train_epochs (5) with one evaluation
    # per epoch, so early stopping looks unable to trigger — confirm whether a
    # smaller patience was intended.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Score the held-out test split. This cell sits before the trainer.train() cell,
# so it serves as the pre-fine-tuning reference point.
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)



{'eval_loss': 1.149860143661499, 'eval_rouge1': 0.1749, 'eval_rouge2': 0.0995, 'eval_rougeL': 0.1508, 'eval_rougeLsum': 0.175, 'eval_runtime': 19.8118, 'eval_samples_per_second': 16.152, 'eval_steps_per_second': 2.019}


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.8789,0.569273,0.1904,0.1313,0.1733,0.1906
2,0.6424,0.541738,0.2015,0.1474,0.1861,0.2013
3,0.6227,0.532246,0.2018,0.1482,0.1867,0.2017
4,0.6222,0.527705,0.2034,0.1505,0.1886,0.2031
5,0.5939,0.525248,0.2027,0.1505,0.1879,0.2024


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=465, training_loss=0.6691635152345062, metrics={'train_runtime': 469.6418, 'train_samples_per_second': 15.842, 'train_steps_per_second': 0.99, 'total_flos': 1006604648448000.0, 'train_loss': 0.6691635152345062, 'epoch': 5.0})

In [None]:
# Score the held-out test split again with the trainer's current model, for
# comparison with the evaluation run that precedes trainer.train().
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)



{'eval_loss': 0.6050890684127808, 'eval_rouge1': 0.1969, 'eval_rouge2': 0.1412, 'eval_rougeL': 0.18, 'eval_rougeLsum': 0.1972, 'eval_runtime': 19.84, 'eval_samples_per_second': 16.129, 'eval_steps_per_second': 2.016, 'epoch': 5.0}


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Hub repository that receives both the tokenizer and the fine-tuned weights.
hub_repo_name = "t5_small_news_summ"
# Security: do not embed the access token in the notebook. Read it from the
# HF_TOKEN environment variable instead. If it is unset, None is passed and the
# library falls back to the locally stored login credentials.
hub_token = os.environ.get("HF_TOKEN")
tokenizer.push_to_hub(hub_repo_name, token=hub_token)
model.push_to_hub(hub_repo_name, token=hub_token)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/andreanstev/t5_small_news_summ/commit/d5e089c6098ddbb80556fda41ae0bc338ae299fb', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='d5e089c6098ddbb80556fda41ae0bc338ae299fb', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Export locations: the model goes in the base folder, the tokenizer in a nested folder.
model_dir = "/kaggle/working/t5_indo_sum"
tokenizer_dir = "/kaggle/working/t5_indo_sum/tokenizer"

# Write the model weights and configuration.
model.save_pretrained(model_dir)

# Write the tokenizer files; kept as the last expression so the saved paths are displayed.
tokenizer.save_pretrained(tokenizer_dir)

('/kaggle/working/t5_indo_sum/tokenizer/tokenizer_config.json',
 '/kaggle/working/t5_indo_sum/tokenizer/special_tokens_map.json',
 '/kaggle/working/t5_indo_sum/tokenizer/spiece.model',
 '/kaggle/working/t5_indo_sum/tokenizer/added_tokens.json')

In [None]:
!zip -r t5_indo_sum.zip /kaggle/working/t5_indo_sum

  pid, fd = os.forkpty()


updating: kaggle/working/t5_indo_sum/ (stored 0%)
updating: kaggle/working/t5_indo_sum/tokenizer/ (stored 0%)
updating: kaggle/working/t5_indo_sum/tokenizer/spiece.model (deflated 49%)
updating: kaggle/working/t5_indo_sum/tokenizer/special_tokens_map.json (deflated 85%)
updating: kaggle/working/t5_indo_sum/tokenizer/added_tokens.json (deflated 83%)
updating: kaggle/working/t5_indo_sum/tokenizer/tokenizer_config.json (deflated 94%)
updating: kaggle/working/t5_indo_sum/model.safetensors (deflated 7%)
updating: kaggle/working/t5_indo_sum/config.json (deflated 48%)
updating: kaggle/working/t5_indo_sum/generation_config.json (deflated 29%)


## Inference

In [None]:
# # Load the tokenizer
# tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/model/bert2gpt_indo_sum/tokenizer")

# # Load the model
# model = EncoderDecoderModel.from_pretrained("/content/drive/MyDrive/model/bert2gpt_indo_sum")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

In [None]:
def clean_article(text):
    """Strip news-wire boilerplate and URLs from an article string.

    The substitutions run in a fixed order and each one deletes its match:

    1. Datelines such as "Liputan6. com, [City Name]:".
    2. A word followed by a date in parentheses, e.g. "Senin (12/3)".
    3. Credit tags such as "(UPI/Reporter Name)." with an optional
       " dan <name>" part.
    4. Square-bracket cross references that start with "baca: ".
    5. URLs that start with http or https.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    removal_patterns = (
        r'Liputan6\. com, [A-Za-z\s]+:',
        r'\w+\s*\(\d{1,2}/\d{1,2}\)',
        r'\([A-Z]+/[A-Za-z\s]+(?: dan [A-Za-z\s]+)?\)\.',
        r'\[baca: .*?\]',
        r'https?://\S+',
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()

In [None]:
%%time
ARTICLE_TO_SUMMARIZE = """Bank Mandiri, sebagaimana bank umumnya, menyediakan layanan kartu debit bagi nasabahnya. Kartu debit Mandiri dapat digunakan oleh nasabah untuk melakukan berbagai transaksi di mesin ATM atau mesin EDC. Fungsi dari Kartu Debit Mandiri ini sangat beragam, mulai dari tarik tunai, setor tunai, transfer uang, cek saldo rekening, hingga membayar berbagai tagihan melalui mesin ATM.
Penting bagi Anda yang ingin membuka rekening tabungan di Bank Mandiri untuk memahami jenis Kartu Debit Mandiri agar tidak salah memilih. Setiap kartu debit Mandiri memiliki kelebihan dan kekurangannya masing-masing, sehingga penting bagi nasabah untuk memilih yang sesuai dengan kebutuhan dan preferensi mereka.

Dalam memilih jenis Kartu Debit Mandiri, nasabah perlu mempertimbangkan kebutuhan dan gaya hidup mereka. Apakah mereka membutuhkan manfaat tambahan seperti asuransi atau akses ke airport lounge, ataukah mereka menginginkan kartu debit yang sederhana namun praktis. Dengan mengetahui jenis Kartu Debit Mandiri yang sesuai, nasabah dapat memaksimalkan manfaat yang mereka dapatkan dari penggunaan kartu debit tersebut.
"""

# generate summary
# Cap the encoder input at 512 tokens, the same limit preprocess_function
# applies to articles during training; previously the input was unbounded.
input_ids = tokenizer.encode(
    clean_article(ARTICLE_TO_SUMMARIZE),
    return_tensors='pt',
    truncation=True,
    max_length=512,
)
summary_ids = model.generate(input_ids.to(model.device),
            min_length=20,
            max_length=128,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample = True,
            temperature = 0.1,
            top_k = 50,
            top_p = 0.95)
# Timing is reported by the %%time magic at the top of this cell.
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

Kartu debit Mandiri dapat digunakan oleh nasabah untuk melakukan berbagai transaksi di mesin ATM atau mesin EDC.
CPU times: user 884 ms, sys: 36.5 ms, total: 920 ms
Wall time: 1.06 s


In [None]:
# parent_folder = "/content/" # Use the copy in the "clean_data" folder on Google Drive
# test_set = pd.read_csv(parent_folder+'final_test_set.csv')

In [None]:
df_sample = test_set.sample(1)

In [None]:
df_sample["Articles"].values

array(['Blair sets date for Africa report..The Commission for Africa\'s report will be released on 11 March - Comic Relief day, Tony Blair has said...July\'s G8 summit in Gleneagles in Perthshire - chaired by the prime minister - will use the report as the basis for talks on Africa. The announcement followed the final meeting of the commission - which includes singer Bob Geldof - in London. As well as more aid, fairer trade and less debt, the commission is likely to demand action on corruption in Africa. Mr Blair told a news conference: "It will be a report that\'s brutally frank about the reality, but I hope idealistic about what can be done if the will is there. "It\'s an ambitious project we have set ourselves and you will have to judge on its outcome when we publish it."..Mr Blair has vowed to put Africa at the top of his agenda during his time at the helm of the G8. He acknowledged he would have a "a job of persuading to do" on other nations to get the necessary commitment to debt

In [None]:
df_sample["Summaries"].values

array(["The Commission for Africa's report will be released on 11 March - Comic Relief day, Tony Blair has said.As well as more aid, fairer trade and less debt, the commission is likely to demand action on corruption in Africa.Bob Geldof, in characteristically blunt style, promised that the commission would not just be a talking shop but would deliver radical new thinking to change direction for Africa.Mr Blair has vowed to put Africa at the top of his agenda during his time at the helm of the G8.The former rock star's presence on the commission has been interpreted as a sign that it will be uncompromising in its demands."],
      dtype=object)

In [None]:
%%time
ARTICLE_TO_SUMMARIZE = """Blair sets date for Africa report..The Commission for Africa\'s report will be released on 11 March - Comic Relief day, Tony Blair has said...July\'s G8 summit in Gleneagles in Perthshire - chaired by the prime minister - will use the report as the basis for talks on Africa. The announcement followed the final meeting of the commission - which includes singer Bob Geldof - in London. As well as more aid, fairer trade and less debt, the commission is likely to demand action on corruption in Africa. Mr Blair told a news conference: "It will be a report that\'s brutally frank about the reality, but I hope idealistic about what can be done if the will is there. "It\'s an ambitious project we have set ourselves and you will have to judge on its outcome when we publish it."..Mr Blair has vowed to put Africa at the top of his agenda during his time at the helm of the G8. He acknowledged he would have a "a job of persuading to do" on other nations to get the necessary commitment to debt relief. Bob Geldof, in characteristically blunt style, promised that the commission would not just be a talking shop but would deliver radical new thinking to change direction for Africa. The former rock star\'s presence on the commission has been interpreted as a sign that it will be uncompromising in its demands. The people involved include two African government leaders and a range of other African politicians, as well as experts from some other developed countries."""

# generate summary
# Cap the encoder input at 512 tokens, the same limit preprocess_function
# applies to articles during training; previously the input was unbounded.
input_ids = tokenizer.encode(
    ARTICLE_TO_SUMMARIZE,
    return_tensors='pt',
    truncation=True,
    max_length=512,
)
summary_ids = model.generate(input_ids.to(model.device),
            min_length=20,
            max_length=256,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample = True,
            temperature = 0.01,
            top_k = 50,
            top_p = 0.95)
# Timing is reported by the %%time magic at the top of this cell.
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

The Commission for Africa's report will be released on 11 March - Comic Relief day, Tony Blair has said.Mr Blair told a news conference: "It will become... an ambitious project we have set ourselves and you will have to judge on its outcome when we publish it."The announcement followed the final meeting
CPU times: total: 1.47 s
Wall time: 1.47 s
