In [1]:
! pip install pandas scikit-learn numpy sastrawi nltk evaluate tqdm torch nltk gensim





In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import sent_tokenize
import evaluate
from tqdm import tqdm
import torch
import os
import json
import re
from sklearn.cluster import KMeans
from transformers import T5Tokenizer, T5Model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the NLTK 'punkt' resource into the local nltk_data directory.
# NOTE(review): presumably required by `sent_tokenize`, which is imported above — confirm it is still used.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Arief
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Dataset location (adjust to wherever the dataset files live, e.g. on Colab)
DATASET_ROOT = './indosum'

# Create the folder when it is missing; exist_ok avoids the check-then-create race
os.makedirs(DATASET_ROOT, exist_ok=True)

# The dataset files must already be uploaded to this folder before running.
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the combined frame (and of any later `.head(n)` subset) is then reproducible.
files_id_dir = sorted(os.listdir(DATASET_ROOT))

# Keep only the training splits (file names containing 'train')
train_files = [filename for filename in files_id_dir if 'train' in filename]
        
# Load a JSON Lines file from DATASET_ROOT
def load_file_to_json_list(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Name of the file, relative to ``DATASET_ROOT``.

    Returns:
        list: One decoded JSON value per non-blank line, in file order. Lines
        that fail to decode are reported with ``print`` and skipped.
    """
    file = os.path.join(DATASET_ROOT, filename)

    # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on Windows)
    # can mis-decode or reject non-ASCII text.
    with open(file, 'r', encoding='utf-8') as f:
        # Split on '\n' only. str.splitlines() also breaks on characters such as
        # U+2028 or U+0085, which may appear unescaped inside a JSON string and
        # would cut a record in half. Blank lines are dropped up front.
        json_list = [line for line in f.read().split('\n') if line.strip()]

    data = []
    for json_str in tqdm(json_list, desc=f'Loading data {filename}'):
        try:
            data.append(json.loads(json_str))
        except json.JSONDecodeError as e:
            # Best effort: report the malformed line and keep loading the rest
            print(f"Error decoding JSON: {e}")
            print(f"Problematic JSON string: {json_str}")

    return data



In [5]:
# Serialise the label list into a JSON string
def label_to_dict_str(label_list):
    """Index the entries of ``label_list`` by position and dump them as JSON.

    Returns:
        tuple: ``(json_str, num)`` where ``json_str`` is the JSON encoding of
        ``{position: entry}`` and ``num`` is the number of entries.
    """
    indexed_labels = dict(enumerate(label_list))
    return json.dumps(indexed_labels), len(indexed_labels)


In [6]:
# Serialise the paragraphs into a JSON string
def paragraph_to_dict_str(paragraph_list):
    """Join each sentence's tokens with spaces and dump the paragraphs as JSON.

    Every paragraph is iterated as a sequence of sentences, and every sentence
    is joined with ``' '``. The paragraphs are keyed by their position.

    Returns:
        tuple: ``(json_str, num)`` where ``json_str`` encodes
        ``{position: [sentence, ...]}`` and ``num`` is the number of paragraphs.
    """
    indexed_paragraphs = {
        position: [' '.join(tokens) for tokens in paragraph]
        for position, paragraph in enumerate(paragraph_list)
    }
    return json.dumps(indexed_paragraphs), len(indexed_paragraphs)
    
# Flatten the paragraphs into a single text string
def paragraph_to_text(raw_paragraph_list):
    """Join tokens into sentences, sentences into paragraphs, paragraphs into one string.

    Each level is joined with a single space, so an empty paragraph still
    contributes an (empty) element to the outer join.
    """
    return ' '.join(
        ' '.join(' '.join(tokens) for tokens in paragraph)
        for paragraph in raw_paragraph_list
    )

In [7]:
# Serialise the summary into a JSON string
def summary_to_dict_str(summary_list):
    """Join each summary sentence's tokens with spaces and dump them as JSON.

    Returns:
        tuple: ``(json_str, num)`` where ``json_str`` encodes
        ``{position: sentence}`` and ``num`` is the number of sentences.
    """
    indexed_sentences = {
        position: ' '.join(tokens)
        for position, tokens in enumerate(summary_list)
    }
    return json.dumps(indexed_sentences), len(indexed_sentences)
# Flatten the summary into a single text string
def summary_to_text(raw_summary_list):
    """Join tokens into sentences and sentences into one space-separated string."""
    return ' '.join(' '.join(tokens) for tokens in raw_summary_list)

In [8]:

# Convert the raw records into the flattened form used for the dataset
def alter_json_data(json_list_data, filename=''):
    """Return transformed shallow copies of the given records.

    In each copy, 'gold_labels', 'paragraphs' and 'summary' are replaced by
    JSON strings, and the keys 'news_text', 'num_of_paragraphs',
    'summary_text' and 'num_of_summary' are added, in that order.

    Args:
        json_list_data: Iterable of record dicts.
        filename: Only used in the progress-bar description.

    Returns:
        list: The transformed copies, in input order.
    """
    altered_records = []
    progress = tqdm(json_list_data, desc=f'Altering json data {filename}')
    for original in progress:
        # Shallow copy: keys are reassigned on the copy, not on the input record
        record = original.copy()

        labels_json, _ = label_to_dict_str(record['gold_labels'])
        record['gold_labels'] = labels_json

        # The plain text must be derived before 'paragraphs' is overwritten
        raw_paragraphs = record['paragraphs']
        record['news_text'] = paragraph_to_text(raw_paragraphs)
        paragraphs_json, paragraph_count = paragraph_to_dict_str(raw_paragraphs)
        record['paragraphs'] = paragraphs_json
        record['num_of_paragraphs'] = paragraph_count

        # Same for the summary: text first, then the JSON replacement
        raw_summary = record['summary']
        record['summary_text'] = summary_to_text(raw_summary)
        summary_json, summary_count = summary_to_dict_str(raw_summary)
        record['summary'] = summary_json
        record['num_of_summary'] = summary_count

        altered_records.append(record)

    return altered_records

In [9]:
# Build a header and row list from a list of JSON records
def create_dataset(jsonl):
    """Convert a list of record dicts into a header and a list of rows.

    The header is taken from the keys of the first record; every row lists the
    values of one record in header order.

    Args:
        jsonl: List of dicts. May be empty.

    Returns:
        tuple: ``(header, dataset_list)``. Both are empty lists when ``jsonl``
        is empty (previously this raised IndexError on ``jsonl[0]``).

    Raises:
        KeyError: If a record lacks one of the header keys.
    """
    if not jsonl:
        return [], []

    header = list(jsonl[0].keys())
    dataset_list = [[json_data[h] for h in header] for json_data in jsonl]
    return header, dataset_list

In [10]:
# Build one DataFrame from several JSON Lines files
def create_dataset_from_files(file_list):
    """Load, transform and concatenate the given files into a single DataFrame.

    Each file is read with ``load_file_to_json_list``, transformed with
    ``alter_json_data`` and turned into rows with ``create_dataset``. Column
    names come from the first file that yields a non-empty header.

    Args:
        file_list: File names to load, in the order the rows should appear.

    Returns:
        pandas.DataFrame: All rows of all files. An empty frame when
        ``file_list`` is empty (previously this failed on the unbound
        ``header`` variable).
    """
    df_header = None
    dataset_list = []
    for filename in file_list:
        json_l = load_file_to_json_list(filename)
        new_json_l = alter_json_data(json_l, filename)
        header, dataset_part = create_dataset(new_json_l)

        # Remember the first usable header; the original computed it but then
        # named the columns from the loop variable of the last file instead.
        if not df_header:
            df_header = header
        dataset_list.extend(dataset_part)

    # `or None` lets pandas fall back to default columns when no header was found
    return pd.DataFrame(dataset_list, columns=df_header or None)

# Build the training DataFrame from the train split files only
df_train = create_dataset_from_files(train_files)

# Display the first rows of the result
df_train.head()

Loading data train.01.jsonl: 100%|██████████| 14262/14262 [00:01<00:00, 7805.04it/s] 
Altering json data train.01.jsonl: 100%|██████████| 14262/14262 [00:00<00:00, 22208.83it/s]
Loading data train.02.jsonl: 100%|██████████| 14263/14263 [00:01<00:00, 7860.62it/s]
Altering json data train.02.jsonl: 100%|██████████| 14263/14263 [00:00<00:00, 23872.36it/s]
Loading data train.03.jsonl: 100%|██████████| 14290/14290 [00:01<00:00, 9653.09it/s]
Altering json data train.03.jsonl: 100%|██████████| 14290/14290 [00:00<00:00, 24526.75it/s]
Loading data train.04.jsonl: 100%|██████████| 14272/14272 [00:01<00:00, 9097.43it/s]
Altering json data train.04.jsonl: 100%|██████████| 14272/14272 [00:00<00:00, 23867.61it/s]
Loading data train.05.jsonl: 100%|██████████| 14266/14266 [00:01<00:00, 8652.25it/s]
Altering json data train.05.jsonl: 100%|██████████| 14266/14266 [00:00<00:00, 23261.00it/s]


Unnamed: 0,category,gold_labels,id,paragraphs,source,source_url,summary,news_text,num_of_paragraphs,summary_text,num_of_summary
0,tajuk utama,"{""0"": [false, true], ""1"": [true, true], ""2"": [...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,"{""0"": [""Jakarta , CNN Indonesia - - Dokter Rya...",cnn indonesia,https://www.cnnindonesia.com/hiburan/201708041...,"{""0"": ""Dokter Lula Kamal yang merupakan selebr...","Jakarta , CNN Indonesia - - Dokter Ryan Thamri...",9,Dokter Lula Kamal yang merupakan selebriti sek...,3
1,teknologi,"{""0"": [false, false, false, false], ""1"": [fals...",1509072914-dua-smartphone-zenfone-baru-tawarka...,"{""0"": [""Selfie ialah salah satu tema terpanas ...",dailysocial.id,https://dailysocial.id/post/dua-smartphone-zen...,"{""0"": ""Asus memperkenalkan \u00a0 ZenFone gene...",Selfie ialah salah satu tema terpanas di kalan...,14,Asus memperkenalkan ZenFone generasi keempat...,3
2,hiburan,"{""0"": [true], ""1"": [true], ""2"": [false, false]...",1510613677-songsong-visit-2020-bengkulu-perkua...,"{""0"": [""Jakarta , CNN Indonesia - - Dinas Pari...",cnn indonesia,https://www.cnnindonesia.com/gaya-hidup/201711...,"{""0"": ""Dinas Pariwisata Provinsi Bengkulu kemb...","Jakarta , CNN Indonesia - - Dinas Pariwisata P...",21,Dinas Pariwisata Provinsi Bengkulu kembali men...,2
3,tajuk utama,"{""0"": [true, true], ""1"": [false, false, false]...",1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,"{""0"": [""Merdeka.com - Indonesia Corruption Wat...",merdeka,https://www.merdeka.com/peristiwa/icw-merasa-a...,"{""0"": ""Indonesia Corruption Watch ( ICW ) memi...",Merdeka.com - Indonesia Corruption Watch ( ICW...,5,Indonesia Corruption Watch ( ICW ) meminta Kom...,2
4,tajuk utama,"{""0"": [false, true], ""1"": [true, true, true], ...",1503039338-pembagian-sepeda-usai-upacara-penur...,"{""0"": [""Merdeka.com - Presiden Joko Widodo ( J...",merdeka,https://www.merdeka.com/peristiwa/usai-upacara...,"{""0"": ""Jokowi memimpin upacara penurunan bende...",Merdeka.com - Presiden Joko Widodo ( Jokowi ) ...,7,Jokowi memimpin upacara penurunan bendera . Us...,5


In [11]:
# Load the tokenizer and model from the pretrained checkpoint
# "panggi/t5-base-indonesian-summarization-cased"; .eval() puts the model in evaluation mode.
# NOTE(review): neither object is referenced by the LexRank cells visible in this
# part of the notebook — confirm they are used further down.
t5_tokenizer = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
t5_model = T5Model.from_pretrained("panggi/t5-base-indonesian-summarization-cased").eval()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# Sentence splitting without NLTK; stop words are kept
def preprocess_text(text):
    """Split ``text`` into sentences on the literal separator '. '.

    Newlines and carriage returns are first replaced by spaces. Each piece is
    stripped of surrounding whitespace and empty pieces are dropped. No words
    are removed.
    """
    flattened = text.replace('\n', ' ').replace('\r', ' ').strip()
    stripped_pieces = (piece.strip() for piece in flattened.split('. '))
    return [piece for piece in stripped_pieces if piece]

# Build similarity matrix using TF-IDF
def build_similarity_matrix(sentences):
    """Return the pairwise cosine-similarity matrix of TF-IDF sentence vectors.

    The vectorizer is used with its default settings, so no stop-word list is
    applied.

    Args:
        sentences: List of sentence strings.

    Returns:
        A square matrix with one row and column per sentence. When the
        vectorizer cannot build a vocabulary (no sentence yields a token, e.g.
        the text is only punctuation or single characters), an identity matrix
        is returned so that each sentence is similar only to itself.

    Raises:
        ValueError: Any vectorizer error other than the empty-vocabulary case.
    """
    vectorizer = TfidfVectorizer()  # Do not use stop_words='english'
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError as exc:
        # Only the degenerate "empty vocabulary" case is recoverable; one such
        # document should not abort a whole batch of summaries.
        if 'empty vocabulary' not in str(exc):
            raise
        return np.eye(len(sentences))
    return cosine_similarity(tfidf_matrix)

def lexrank(sentences, similarity_matrix, threshold=0.05, iterations=150):
    """Score sentences by power iteration on a thresholded similarity graph.

    Two sentences are connected when their similarity is strictly greater than
    ``threshold``. Each row of the resulting 0/1 matrix is normalised to sum to
    one (rows without any connection are left as zeros), and the scores are
    updated ``iterations`` times with a damping factor of 0.9.

    Args:
        sentences: The sentences; only their count is used.
        similarity_matrix: Square array-like of pairwise similarities, one
            row/column per sentence.
        threshold: Minimum similarity (exclusive) for an edge.
        iterations: Number of power-iteration steps (default 150).

    Returns:
        numpy.ndarray: One score per sentence; empty when there are no
        sentences (previously this raised ZeroDivisionError).
    """
    n = len(sentences)
    if n == 0:
        return np.zeros(0)

    # Accept plain nested lists as well as ndarrays
    similarity = np.asarray(similarity_matrix, dtype=float)
    adjacency_matrix = (similarity > threshold).astype(float)

    row_sums = adjacency_matrix.sum(axis=1)
    row_sums[row_sums == 0] = 1  # Avoid division by zero
    transition = adjacency_matrix / row_sums[:, np.newaxis]

    scores = np.ones(n) / n
    teleport = 0.1 / n  # loop-invariant share every sentence receives per step
    for _ in range(iterations):
        scores = 0.9 * transition.T.dot(scores) + teleport
    return scores

def summarize_text(text, max_words=50):
    """Build an extractive summary of ``text`` with LexRank.

    Sentences are ranked by their LexRank score (highest first) and added in
    that order until adding the next one would exceed ``max_words``.

    Args:
        text: The document to summarise.
        max_words: Word budget for the summary.

    Returns:
        str: The selected sentences joined with single spaces. Empty when the
        text contains no sentences. When even the top-ranked sentence exceeds
        the budget, its first ``max_words`` words are returned instead of an
        empty summary.
    """
    sentences = preprocess_text(text)
    if len(sentences) == 0:
        return ""  # Return empty summary if no valid sentences

    similarity_matrix = build_similarity_matrix(sentences)
    scores = lexrank(sentences, similarity_matrix)
    ranked_sentences = [sentences[i] for i in np.argsort(scores)[::-1]]

    summary = []
    word_count = 0
    for sentence in ranked_sentences:
        word_count += len(sentence.split())
        if word_count > max_words:
            break
        summary.append(sentence)

    if not summary:
        # The best sentence alone is over budget: keep its leading words rather
        # than returning nothing for a non-empty document.
        return " ".join(ranked_sentences[0].split()[:max(max_words, 0)])
    return " ".join(summary)

In [13]:
# Run configuration for the summarisation loop
max_steps = 1000 # Limit number of rows to process
# Collects one [id, generated summary] pair per processed row
summary_generated = []


In [14]:
# Select the torch device: CUDA when torch reports it as available, otherwise CPU
# NOTE(review): `device` is not referenced by the TF-IDF/LexRank cells visible here — confirm it is needed later.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [15]:
# Summarise the first `max_steps` training articles.
# The result list is rebuilt here so that re-running this cell does not append
# duplicates (duplicated ids would multiply rows in the merge below).
df_train_subset = df_train.head(max_steps)
summary_generated = [
    [article_id, summarize_text(news_text, max_words=75)]
    for article_id, news_text in tqdm(
        zip(df_train_subset['id'], df_train_subset['news_text']),
        # head() may return fewer than max_steps rows, so size the bar from the data
        total=len(df_train_subset),
    )
]

# Convert the results into a DataFrame
df_summary_generated = pd.DataFrame(summary_generated, columns=['id', 'summary_generated'])

# Join the generated summaries back onto the original rows
df_train_result = df_train_subset.merge(df_summary_generated, on='id')


100%|██████████| 1000/1000 [00:01<00:00, 640.24it/s]


In [16]:
# Evaluate with ROUGE: the generated summaries are the predictions and the
# dataset's 'summary_text' column provides the references
rouge = evaluate.load('rouge')
results = rouge.compute(
    references=df_train_result['summary_text'].values,
    predictions=df_train_result['summary_generated'].values
)

print(results)

{'rouge1': 0.41267042289142525, 'rouge2': 0.2668166875418545, 'rougeL': 0.3096750581462644, 'rougeLsum': 0.3100984763102359}


In [17]:
# Display a sample of the results
df_train_result.head()


Unnamed: 0,category,gold_labels,id,paragraphs,source,source_url,summary,news_text,num_of_paragraphs,summary_text,num_of_summary,summary_generated
0,tajuk utama,"{""0"": [false, true], ""1"": [true, true], ""2"": [...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,"{""0"": [""Jakarta , CNN Indonesia - - Dokter Rya...",cnn indonesia,https://www.cnnindonesia.com/hiburan/201708041...,"{""0"": ""Dokter Lula Kamal yang merupakan selebr...","Jakarta , CNN Indonesia - - Dokter Ryan Thamri...",9,Dokter Lula Kamal yang merupakan selebriti sek...,3,Dia juga tak tahu penyakit apa yang diderita R...
1,teknologi,"{""0"": [false, false, false, false], ""1"": [fals...",1509072914-dua-smartphone-zenfone-baru-tawarka...,"{""0"": [""Selfie ialah salah satu tema terpanas ...",dailysocial.id,https://dailysocial.id/post/dua-smartphone-zen...,"{""0"": ""Asus memperkenalkan \u00a0 ZenFone gene...",Selfie ialah salah satu tema terpanas di kalan...,14,Asus memperkenalkan ZenFone generasi keempat...,3,Mereka adalah Asus ZenFone 4 Selfie Pro ZD552K...
2,hiburan,"{""0"": [true], ""1"": [true], ""2"": [false, false]...",1510613677-songsong-visit-2020-bengkulu-perkua...,"{""0"": [""Jakarta , CNN Indonesia - - Dinas Pari...",cnn indonesia,https://www.cnnindonesia.com/gaya-hidup/201711...,"{""0"": ""Dinas Pariwisata Provinsi Bengkulu kemb...","Jakarta , CNN Indonesia - - Dinas Pariwisata P...",21,Dinas Pariwisata Provinsi Bengkulu kembali men...,2,Deputi Pengembangan Pemasaran Pariwisata Nusan...
3,tajuk utama,"{""0"": [true, true], ""1"": [false, false, false]...",1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,"{""0"": [""Merdeka.com - Indonesia Corruption Wat...",merdeka,https://www.merdeka.com/peristiwa/icw-merasa-a...,"{""0"": ""Indonesia Corruption Watch ( ICW ) memi...",Merdeka.com - Indonesia Corruption Watch ( ICW...,5,Indonesia Corruption Watch ( ICW ) meminta Kom...,2,"Kenapa momentum meninggalnya , saat kasus e - ..."
4,tajuk utama,"{""0"": [false, true], ""1"": [true, true, true], ...",1503039338-pembagian-sepeda-usai-upacara-penur...,"{""0"": [""Merdeka.com - Presiden Joko Widodo ( J...",merdeka,https://www.merdeka.com/peristiwa/usai-upacara...,"{""0"": ""Jokowi memimpin upacara penurunan bende...",Merdeka.com - Presiden Joko Widodo ( Jokowi ) ...,7,Jokowi memimpin upacara penurunan bendera . Us...,5,"Usai prosesi penurunan bendera dilakukan , Jok..."
