In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from tqdm import tqdm
import pandas as pd
import os
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
! pip install gensim








In [3]:
# Path dataset (sesuaikan dengan lokasi dataset Anda di Colab)
DATASET_ROOT = './indosum'

# Buat folder jika dataset belum ada
if not os.path.exists(DATASET_ROOT):
    os.makedirs(DATASET_ROOT)

# Pastikan file dataset diunggah ke folder ini sebelum menjalankan kode
files_id_dir = os.listdir(DATASET_ROOT)
train_files = []

for filename in files_id_dir:
    if 'train' in filename:
        train_files.append(filename)
        
# Fungsi untuk memuat data JSON Lines
def load_file_to_json_list(filename):
    file = os.path.join(DATASET_ROOT, filename)

    data = []
    with open(file, 'r') as f:
        # Read the entire file content
        file_content = f.read()
        
        # Split the content into individual JSON objects
        json_list = file_content.splitlines() 
        
        for json_str in tqdm(json_list, desc=f'Loading data {filename}'):
            # Skip empty lines
            if json_str.strip(): 
                try:
                    d = json.loads(json_str)
                    data.append(d)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    print(f"Problematic JSON string: {json_str}")
                    # You might want to handle the error, e.g., skip the line or try to fix the JSON
                    
    return data

# Fungsi untuk memproses label menjadi string JSON
def label_to_dict_str(label_list):
    label_dict = {}  # key = paragraph_id : value = label list 
    for i, label in enumerate(label_list[:]):
        label_dict[i] = label

    json_str = json.dumps(label_dict)
    num = len(label_dict)
    return json_str, num

# Fungsi untuk memproses paragraph menjadi string JSON
def paragraph_to_dict_str(paragraph_list):
    paragraph_dict = {}  # key = paragraph_id : value = paragraph list 
    for i, paragraph in enumerate(paragraph_list):
        new_paragraph = []
        for sentence in paragraph:
            sentence = ' '.join(sentence)
            new_paragraph.append(sentence)
        paragraph_dict[i] = new_paragraph

    json_str = json.dumps(paragraph_dict)
    num = len(paragraph_dict)
    return json_str, num
    
# Fungsi untuk mengubah paragraf menjadi string teks
def paragraph_to_text(raw_paragraph_list):
    new_paragraph_list = []
    for i, paragraph in enumerate(raw_paragraph_list):
        paragraph_list = []
        for sentence in paragraph:
            sentence = ' '.join(sentence)
            paragraph_list.append(sentence)

        new_paragraph = ' '.join(paragraph_list)
        new_paragraph_list.append(new_paragraph)

    paragraph_str = ' '.join(new_paragraph_list)
    return paragraph_str

# Fungsi untuk memproses summary menjadi string JSON
def summary_to_dict_str(summary_list):
    summary_dict = {}  # key = summary_id : value = summary sentence 
    for i, summary in enumerate(summary_list):
        summary_dict[i] = ' '.join(summary)

    json_str = json.dumps(summary_dict)
    num = len(summary_dict)
    return json_str, num
# Fungsi untuk mengubah summary menjadi string teks
def summary_to_text(raw_summary_list):
    summary_list = []
    for i, summary in enumerate(raw_summary_list):
        summary_list.append(' '.join(summary))

    summary_str = ' '.join(summary_list)
    return summary_str

# Fungsi untuk mengubah data JSON
def alter_json_data(json_list_data, filename=''):
    new_json_list = []
    for json_data in tqdm(json_list_data, desc=f'Altering json data {filename}'):
        json_data = json_data.copy()
        json_data['gold_labels'], _ = label_to_dict_str(json_data['gold_labels'])
        json_data['news_text'] = paragraph_to_text(json_data['paragraphs'])
        json_data['paragraphs'], num_paragraph = paragraph_to_dict_str(json_data['paragraphs'])
        json_data['num_of_paragraphs'] = num_paragraph
        json_data['summary_text'] = summary_to_text(json_data['summary'])
        json_data['summary'], num_summary = summary_to_dict_str(json_data['summary'])
        json_data['num_of_summary'] = num_summary

        new_json_list.append(json_data)
    
    return new_json_list

# Fungsi untuk membuat dataset dari JSON Lines
def create_dataset(jsonl):
    header = list(jsonl[0].keys())
    dataset_list = []
    for json_data in jsonl:
        row = []
        for h in header:
            row.append(json_data[h])
        dataset_list.append(row)
    
    return header, dataset_list

# Fungsi untuk membuat dataset dari file JSON Lines
def create_dataset_from_files(file_list):
    df_header = None
    dataset_list = []
    for filename in file_list:
        json_l = load_file_to_json_list(filename)
        new_json_l = alter_json_data(json_l, filename)
        header, dataset_part = create_dataset(new_json_l)
        
        if not df_header: df_header = header
        dataset_list.extend(dataset_part)
        
    df_full = pd.DataFrame().from_records(dataset_list)
    df_full = df_full.rename(columns=dict(enumerate(header)))
    return df_full

# Proses hanya data train
df_train = create_dataset_from_files(train_files)

# Tampilkan hasil
df_train.head()

Loading data train.01.jsonl: 100%|██████████| 14262/14262 [00:01<00:00, 7311.82it/s]
Altering json data train.01.jsonl: 100%|██████████| 14262/14262 [00:00<00:00, 23534.53it/s]
Loading data train.02.jsonl: 100%|██████████| 14263/14263 [00:01<00:00, 7761.50it/s] 
Altering json data train.02.jsonl: 100%|██████████| 14263/14263 [00:00<00:00, 24869.09it/s]
Loading data train.03.jsonl: 100%|██████████| 14290/14290 [00:01<00:00, 8943.40it/s] 
Altering json data train.03.jsonl: 100%|██████████| 14290/14290 [00:00<00:00, 25154.42it/s]
Loading data train.04.jsonl: 100%|██████████| 14272/14272 [00:01<00:00, 14234.76it/s]
Altering json data train.04.jsonl: 100%|██████████| 14272/14272 [00:00<00:00, 24599.56it/s]
Loading data train.05.jsonl: 100%|██████████| 14266/14266 [00:01<00:00, 8248.40it/s] 
Altering json data train.05.jsonl: 100%|██████████| 14266/14266 [00:00<00:00, 24495.40it/s]


Unnamed: 0,category,gold_labels,id,paragraphs,source,source_url,summary,news_text,num_of_paragraphs,summary_text,num_of_summary
0,tajuk utama,"{""0"": [false, true], ""1"": [true, true], ""2"": [...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,"{""0"": [""Jakarta , CNN Indonesia - - Dokter Rya...",cnn indonesia,https://www.cnnindonesia.com/hiburan/201708041...,"{""0"": ""Dokter Lula Kamal yang merupakan selebr...","Jakarta , CNN Indonesia - - Dokter Ryan Thamri...",9,Dokter Lula Kamal yang merupakan selebriti sek...,3
1,teknologi,"{""0"": [false, false, false, false], ""1"": [fals...",1509072914-dua-smartphone-zenfone-baru-tawarka...,"{""0"": [""Selfie ialah salah satu tema terpanas ...",dailysocial.id,https://dailysocial.id/post/dua-smartphone-zen...,"{""0"": ""Asus memperkenalkan \u00a0 ZenFone gene...",Selfie ialah salah satu tema terpanas di kalan...,14,Asus memperkenalkan ZenFone generasi keempat...,3
2,hiburan,"{""0"": [true], ""1"": [true], ""2"": [false, false]...",1510613677-songsong-visit-2020-bengkulu-perkua...,"{""0"": [""Jakarta , CNN Indonesia - - Dinas Pari...",cnn indonesia,https://www.cnnindonesia.com/gaya-hidup/201711...,"{""0"": ""Dinas Pariwisata Provinsi Bengkulu kemb...","Jakarta , CNN Indonesia - - Dinas Pariwisata P...",21,Dinas Pariwisata Provinsi Bengkulu kembali men...,2
3,tajuk utama,"{""0"": [true, true], ""1"": [false, false, false]...",1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,"{""0"": [""Merdeka.com - Indonesia Corruption Wat...",merdeka,https://www.merdeka.com/peristiwa/icw-merasa-a...,"{""0"": ""Indonesia Corruption Watch ( ICW ) memi...",Merdeka.com - Indonesia Corruption Watch ( ICW...,5,Indonesia Corruption Watch ( ICW ) meminta Kom...,2
4,tajuk utama,"{""0"": [false, true], ""1"": [true, true, true], ...",1503039338-pembagian-sepeda-usai-upacara-penur...,"{""0"": [""Merdeka.com - Presiden Joko Widodo ( J...",merdeka,https://www.merdeka.com/peristiwa/usai-upacara...,"{""0"": ""Jokowi memimpin upacara penurunan bende...",Merdeka.com - Presiden Joko Widodo ( Jokowi ) ...,7,Jokowi memimpin upacara penurunan bendera . Us...,5


In [4]:
! pip install sumy






In [5]:
# Inisialisasi T5
t5_tokenizer = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
t5_model = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = t5_model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Fungsi untuk rangkuman dengan LexRank
def summarize_with_lexrank(article, sentence_count=10):
    try:
        parser = PlaintextParser.from_string(article, Tokenizer("indonesian"))
        summarizer = LexRankSummarizer()
        summary = summarizer(parser.document, sentence_count)
        return " ".join([str(sentence) for sentence in summary])
    except Exception as e:
        return article  # Jika gagal, kembalikan artikel asli.

# Fungsi untuk rangkuman dengan T5
def summarize_with_t5(article, max_length, tokenizer, model):
    input_ids = tokenizer.encode(article, return_tensors="pt", truncation=True, max_length=512)
    input_ids = input_ids.to(device)
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=1,
        repetition_penalty=1,
        length_penalty=1.2,
        early_stopping=True,
        no_repeat_ngram_size=10,
        use_cache=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Gabungan LexRank + T5
def summarize_combined(article, lexrank_sentences=7, t5_max_length=50):
    lexrank_summary = summarize_with_lexrank(article, sentence_count=lexrank_sentences)
    t5_summary = summarize_with_t5(lexrank_summary, max_length=t5_max_length, tokenizer=t5_tokenizer, model=t5_model)
    return t5_summary

# Iterasi untuk dataset
max_steps = 10  # Ubah sesuai kebutuhan
summary_generated = []

for i, row in tqdm(df_train[['id', 'news_text']].head(max_steps).iterrows(), total=max_steps):
    sg = summarize_combined(row['news_text'], lexrank_sentences=7, t5_max_length=100)
    summary_generated.append([row['id'], sg])

# Konversi hasil menjadi DataFrame
df_summary_generated = pd.DataFrame(summary_generated, columns=['id', 'summary_generated'])

# Gabungkan dengan dataset asli
df_train_result = df_train.head(max_steps).merge(df_summary_generated, on='id')

# Evaluasi dengan ROUGE
from evaluate import load
rouge = load('rouge')

results = rouge.compute(
    references=df_train_result['summary_text'].values,
    predictions=df_train_result['summary_generated'].values
)
print(results)

100%|██████████| 10/10 [00:17<00:00,  1.79s/it]


{'rouge1': 0.810521943959986, 'rouge2': 0.7838667383762701, 'rougeL': 0.7964652822394349, 'rougeLsum': 0.7933830466176524}


In [29]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from evaluate import load

# Tentukan perangkat
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load tokenizer dan model
t5_tokenizer = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
t5_model = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
t5_model = t5_model.to(device)

Device: cuda


In [30]:
# **1. Memuat Dataset dari Excel**
file_path = "Dataset Artikel Berita.xlsx"  # Ganti dengan path file Anda
df = pd.read_excel(file_path)
print("Kolom dataset:", df.columns)

# Konversi DataFrame ke dataset Hugging Face
hf_dataset = Dataset.from_pandas(df)

# Split dataset menjadi train dan validation set
hf_dataset = hf_dataset.train_test_split(test_size=0.2)
print(hf_dataset)


Kolom dataset: Index(['No.', 'News_Text', 'Summary_GPT', 'Summary_Website Berita',
       'Summary_Ground Truth', 'Unnamed: 5'],
      dtype='object')
DatasetDict({
    train: Dataset({
        features: ['No.', 'News_Text', 'Summary_GPT', 'Summary_Website Berita', 'Summary_Ground Truth', 'Unnamed: 5'],
        num_rows: 40
    })
    test: Dataset({
        features: ['No.', 'News_Text', 'Summary_GPT', 'Summary_Website Berita', 'Summary_Ground Truth', 'Unnamed: 5'],
        num_rows: 10
    })
})


In [32]:
# **2. Preprocessing Dataset**
def preprocess_data(examples):
    inputs = examples["News_Text"]  # Pastikan nama kolom sesuai
    targets = examples["Summary_Ground Truth"]  # Pastikan nama kolom sesuai
    model_inputs = t5_tokenizer(inputs, max_length=256, truncation=True)
    labels = t5_tokenizer(targets, max_length=100, truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Tokenisasi dataset
tokenized_datasets = hf_dataset.map(preprocess_data, batched=True)

Map: 100%|██████████| 40/40 [00:00<00:00, 473.03 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 422.32 examples/s]


In [10]:
! pip install "accelerate>=0.26.0"






In [11]:
! pip install transformers[torch]






In [12]:
! pip install 'accelerate>={ACCELERATE_MIN_VERSION}'

ERROR: Invalid requirement: "'accelerate": Expected package name at the start of dependency specifier
    'accelerate
    ^


In [13]:
! pip show accelerate
! pip show transformers


Name: accelerate
Version: 1.1.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: C:\Users\Arief M\Documents\belajar\Semester 5\NLP\Project1\project1\Lib\site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 




Name: transformers




Version: 4.46.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\Arief M\Documents\belajar\Semester 5\NLP\Project1\project1\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [50]:
# **3. Parameter Pelatihan**
training_args = Seq2SeqTrainingArguments(
    output_dir="./panggi-finetuned-model",
    per_device_train_batch_size=2,  # Batch kecil
    num_train_epochs=3,  # Tambahkan epoch karena dataset kecil
    evaluation_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5  # Learning rate standar
)
# Data collator untuk padding otomatis
data_collator = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=t5_model)

# **4. Trainer**
trainer = Seq2SeqTrainer(
    model=t5_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=t5_tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [51]:
# **5. Mulai Pelatihan**
trainer.train()

# Simpan model hasil fine-tuning
t5_model.save_pretrained("./t5-finetuned-model")
t5_tokenizer.save_pretrained("./t5-finetuned-model")

# **6. Fungsi untuk Membuat Ringkasan**
def summarize_with_finetuned_t5(article):
    input_ids = t5_tokenizer.encode(article, return_tensors="pt", truncation=True, max_length=512).to(device)
    summary_ids = t5_model.generate(
        input_ids,
        max_length=150,
        num_beams=8,
        early_stopping=True
    )
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


100%|██████████| 20/20 [01:28<00:00,  1.48s/it]

{'loss': 0.0504, 'grad_norm': 0.11433077603578568, 'learning_rate': 4.25e-05, 'epoch': 0.5}



100%|██████████| 20/20 [01:45<00:00,  1.48s/it]

{'loss': 0.0498, 'grad_norm': 1.0343600511550903, 'learning_rate': 3.4166666666666666e-05, 'epoch': 1.0}



[A
[A
                                               

100%|██████████| 20/20 [01:47<00:00,  1.48s/it]
[A
[A

{'eval_loss': 2.8733692169189453, 'eval_runtime': 1.2317, 'eval_samples_per_second': 8.119, 'eval_steps_per_second': 1.624, 'epoch': 1.0}



100%|██████████| 20/20 [02:05<00:00,  1.48s/it]

{'loss': 0.0268, 'grad_norm': 1.5743052959442139, 'learning_rate': 2.5833333333333336e-05, 'epoch': 1.5}



100%|██████████| 20/20 [02:23<00:00,  1.48s/it]

{'loss': 0.1041, 'grad_norm': 2.114365816116333, 'learning_rate': 1.75e-05, 'epoch': 2.0}



[A
[A
                                               

100%|██████████| 20/20 [02:24<00:00,  1.48s/it]
[A
[A

{'eval_loss': 2.9598140716552734, 'eval_runtime': 1.2024, 'eval_samples_per_second': 8.317, 'eval_steps_per_second': 1.663, 'epoch': 2.0}



100%|██████████| 20/20 [02:42<00:00,  1.48s/it]

{'loss': 0.1188, 'grad_norm': 2.0441184043884277, 'learning_rate': 9.166666666666666e-06, 'epoch': 2.5}



100%|██████████| 20/20 [02:58<00:00,  1.48s/it]

{'loss': 0.1185, 'grad_norm': 1.9061323404312134, 'learning_rate': 8.333333333333333e-07, 'epoch': 3.0}



[A
[A

[A[A                                       
                                               
100%|██████████| 20/20 [03:11<00:00,  1.48s/it]
[A
100%|██████████| 60/60 [01:59<00:00,  2.00s/it]


{'eval_loss': 2.9469668865203857, 'eval_runtime': 1.4446, 'eval_samples_per_second': 6.922, 'eval_steps_per_second': 1.384, 'epoch': 3.0}
{'train_runtime': 119.8265, 'train_samples_per_second': 1.001, 'train_steps_per_second': 0.501, 'train_loss': 0.07805097450812658, 'epoch': 3.0}


In [52]:
# **7. Evaluasi dengan ROUGE**
rouge = load('rouge')

# Buat prediksi pada dataset test
references = tokenized_datasets["test"]["Summary_Website Berita"]
predictions = [summarize_with_finetuned_t5(text) for text in tokenized_datasets["test"]["News_Text"]]

# Hitung metrik ROUGE
results = rouge.compute(references=references, predictions=predictions)
print("Hasil evaluasi ROUGE:", results)

Hasil evaluasi ROUGE: {'rouge1': 0.6230124083561563, 'rouge2': 0.5534934899246532, 'rougeL': 0.5855098638270305, 'rougeLsum': 0.5912560926723185}
