In [5]:
import pandas as pd
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import random
import torch
from tqdm import tqdm
import evaluate

sns.set_style('ticks')

# Dataset path (adjust to where the dataset lives in your Colab session)
DATASET_ROOT = './indosum'

# Create the folder when it is missing; exist_ok avoids the separate
# exists-check and the race between checking and creating.
os.makedirs(DATASET_ROOT, exist_ok=True)

# The dataset files must be uploaded into this folder before running the code.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# order of the training files (and of the rows built from them) reproducible.
files_id_dir = sorted(os.listdir(DATASET_ROOT))

# Keep every entry whose name contains the substring 'train'
train_files = [filename for filename in files_id_dir if 'train' in filename]
        
# Load a JSON Lines file
def load_file_to_json_list(filename):
    """Parse the JSON Lines file *filename* located under DATASET_ROOT.

    Args:
        filename: File name, joined onto DATASET_ROOT.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order. Lines that fail to decode are reported on stdout and skipped.
    """
    file = os.path.join(DATASET_ROOT, filename)

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(file, 'r', encoding='utf-8') as f:
        # Iterating the file splits on newlines only. str.splitlines() also
        # breaks on characters such as U+2028 or U+0085, which may appear
        # unescaped inside a JSON string and would cut a valid record in two.
        json_list = [line.rstrip('\n') for line in f]

    data = []
    for json_str in tqdm(json_list, desc=f'Loading data {filename}'):
        # Skip empty lines
        if not json_str.strip():
            continue
        try:
            data.append(json.loads(json_str))
        except json.JSONDecodeError as e:
            # Best effort: report the bad line and keep loading the rest
            print(f"Error decoding JSON: {e}")
            print(f"Problematic JSON string: {json_str}")

    return data

# Serialise the label list as a JSON string
def label_to_dict_str(label_list):
    """Index the entries of *label_list* by position and dump them as JSON.

    Returns:
        A ``(json_str, num)`` tuple: the JSON text of the position -> entry
        mapping, and the number of entries in it.
    """
    # The slice takes a shallow copy first, exactly as the original loop did.
    indexed_labels = dict(enumerate(label_list[:]))
    return json.dumps(indexed_labels), len(indexed_labels)

# Serialise the paragraphs as a JSON string
def paragraph_to_dict_str(paragraph_list):
    """Join each sentence's items with spaces and dump paragraphs as JSON.

    Every paragraph is an iterable of sentences and every sentence an
    iterable of strings; the result maps the paragraph position to its list
    of space-joined sentences.

    Returns:
        A ``(json_str, num)`` tuple: the JSON text of that mapping and the
        number of paragraphs.
    """
    joined_paragraphs = {
        position: [' '.join(tokens) for tokens in paragraph]
        for position, paragraph in enumerate(paragraph_list)
    }
    return json.dumps(joined_paragraphs), len(joined_paragraphs)
    
# Flatten the paragraphs into one text string
def paragraph_to_text(raw_paragraph_list):
    """Return all paragraphs as a single space-separated string.

    Items are joined into sentences, sentences into paragraphs and
    paragraphs into the final text, each level with a single space. The
    three-level join is kept on purpose: an empty paragraph still
    contributes its separator.
    """
    return ' '.join(
        ' '.join(' '.join(tokens) for tokens in paragraph)
        for paragraph in raw_paragraph_list
    )

# Serialise the summary as a JSON string
def summary_to_dict_str(summary_list):
    """Join each summary entry with spaces and dump the result as JSON.

    Returns:
        A ``(json_str, num)`` tuple: the JSON text mapping each position to
        its space-joined entry, and the number of entries.
    """
    joined_summary = {
        position: ' '.join(tokens)
        for position, tokens in enumerate(summary_list)
    }
    return json.dumps(joined_summary), len(joined_summary)
# Flatten the summary into one text string
def summary_to_text(raw_summary_list):
    """Return the summary as a single space-separated string.

    Each entry is space-joined first, then the joined entries are
    space-joined with each other.
    """
    return ' '.join(' '.join(tokens) for tokens in raw_summary_list)

# Reshape the loaded JSON records
def alter_json_data(json_list_data, filename=''):
    """Return reshaped shallow copies of the records in *json_list_data*.

    For every record, 'gold_labels', 'paragraphs' and 'summary' are replaced
    by their JSON-string forms, and 'news_text', 'num_of_paragraphs',
    'summary_text' and 'num_of_summary' are added. The input records are not
    modified. *filename* only appears in the progress-bar description.
    """
    def _convert(record):
        altered = record.copy()
        # Keys are assigned in this exact order because a dict keeps
        # insertion order and the header is later read from the keys.
        labels_json, _count = label_to_dict_str(altered['gold_labels'])
        altered['gold_labels'] = labels_json
        # The plain text must be built before 'paragraphs' is overwritten.
        altered['news_text'] = paragraph_to_text(altered['paragraphs'])
        paragraphs_json, paragraph_count = paragraph_to_dict_str(altered['paragraphs'])
        altered['paragraphs'] = paragraphs_json
        altered['num_of_paragraphs'] = paragraph_count
        # Same for the summary: text first, then the JSON replacement.
        altered['summary_text'] = summary_to_text(altered['summary'])
        summary_json, summary_count = summary_to_dict_str(altered['summary'])
        altered['summary'] = summary_json
        altered['num_of_summary'] = summary_count
        return altered

    progress = tqdm(json_list_data, desc=f'Altering json data {filename}')
    return [_convert(record) for record in progress]

# Build a header and row lists from JSON Lines records
def create_dataset(jsonl):
    """Turn a list of dict records into a header and a list of rows.

    Args:
        jsonl: Sequence of dicts. The keys of the first record define the
            header and the column order.

    Returns:
        A ``(header, dataset_list)`` tuple; ``dataset_list`` holds one list
        of values per record, ordered like ``header``. An empty input yields
        ``([], [])`` instead of failing on ``jsonl[0]``.

    Raises:
        KeyError: If a record lacks one of the header keys.
    """
    if not jsonl:
        return [], []

    header = list(jsonl[0].keys())
    dataset_list = [[json_data[h] for h in header] for json_data in jsonl]
    return header, dataset_list

# Build one DataFrame from several JSON Lines files
def create_dataset_from_files(file_list):
    """Load, reshape and concatenate the given files into one DataFrame.

    Args:
        file_list: File names, each passed to load_file_to_json_list.

    Returns:
        A DataFrame with one row per record. Columns are named after the
        first non-empty header seen. An empty *file_list* (or files without
        any record) yields an empty DataFrame.
    """
    df_header = None
    dataset_list = []
    for filename in file_list:
        json_l = load_file_to_json_list(filename)
        new_json_l = alter_json_data(json_l, filename)
        header, dataset_part = create_dataset(new_json_l)

        # Remember the first usable header; it names the columns below.
        if not df_header:
            df_header = header
        dataset_list.extend(dataset_part)

    if not dataset_list:
        # Nothing was loaded: return an empty frame rather than failing.
        return pd.DataFrame(columns=df_header)

    # Name the columns from df_header, not from whatever header the last
    # loop iteration happened to leave behind.
    return pd.DataFrame.from_records(dataset_list, columns=df_header)

# Process only the training files
df_train = create_dataset_from_files(train_files)

# Show the first rows of the result
df_train.head()

Loading data train.01.jsonl: 100%|██████████| 14262/14262 [00:01<00:00, 8763.24it/s]
Altering json data train.01.jsonl: 100%|██████████| 14262/14262 [00:00<00:00, 24804.02it/s]
Loading data train.02.jsonl: 100%|██████████| 14263/14263 [00:01<00:00, 9224.46it/s]
Altering json data train.02.jsonl: 100%|██████████| 14263/14263 [00:00<00:00, 24467.11it/s]
Loading data train.03.jsonl: 100%|██████████| 14290/14290 [00:01<00:00, 9384.29it/s] 
Altering json data train.03.jsonl: 100%|██████████| 14290/14290 [00:00<00:00, 24222.53it/s]
Loading data train.04.jsonl: 100%|██████████| 14272/14272 [00:01<00:00, 9010.06it/s] 
Altering json data train.04.jsonl: 100%|██████████| 14272/14272 [00:00<00:00, 24827.76it/s]
Loading data train.05.jsonl: 100%|██████████| 14266/14266 [00:01<00:00, 8442.11it/s]
Altering json data train.05.jsonl: 100%|██████████| 14266/14266 [00:00<00:00, 24156.20it/s]


Unnamed: 0,category,gold_labels,id,paragraphs,source,source_url,summary,news_text,num_of_paragraphs,summary_text,num_of_summary
0,tajuk utama,"{""0"": [false, true], ""1"": [true, true], ""2"": [...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,"{""0"": [""Jakarta , CNN Indonesia - - Dokter Rya...",cnn indonesia,https://www.cnnindonesia.com/hiburan/201708041...,"{""0"": ""Dokter Lula Kamal yang merupakan selebr...","Jakarta , CNN Indonesia - - Dokter Ryan Thamri...",9,Dokter Lula Kamal yang merupakan selebriti sek...,3
1,teknologi,"{""0"": [false, false, false, false], ""1"": [fals...",1509072914-dua-smartphone-zenfone-baru-tawarka...,"{""0"": [""Selfie ialah salah satu tema terpanas ...",dailysocial.id,https://dailysocial.id/post/dua-smartphone-zen...,"{""0"": ""Asus memperkenalkan \u00a0 ZenFone gene...",Selfie ialah salah satu tema terpanas di kalan...,14,Asus memperkenalkan ZenFone generasi keempat...,3
2,hiburan,"{""0"": [true], ""1"": [true], ""2"": [false, false]...",1510613677-songsong-visit-2020-bengkulu-perkua...,"{""0"": [""Jakarta , CNN Indonesia - - Dinas Pari...",cnn indonesia,https://www.cnnindonesia.com/gaya-hidup/201711...,"{""0"": ""Dinas Pariwisata Provinsi Bengkulu kemb...","Jakarta , CNN Indonesia - - Dinas Pariwisata P...",21,Dinas Pariwisata Provinsi Bengkulu kembali men...,2
3,tajuk utama,"{""0"": [true, true], ""1"": [false, false, false]...",1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,"{""0"": [""Merdeka.com - Indonesia Corruption Wat...",merdeka,https://www.merdeka.com/peristiwa/icw-merasa-a...,"{""0"": ""Indonesia Corruption Watch ( ICW ) memi...",Merdeka.com - Indonesia Corruption Watch ( ICW...,5,Indonesia Corruption Watch ( ICW ) meminta Kom...,2
4,tajuk utama,"{""0"": [false, true], ""1"": [true, true, true], ...",1503039338-pembagian-sepeda-usai-upacara-penur...,"{""0"": [""Merdeka.com - Presiden Joko Widodo ( J...",merdeka,https://www.merdeka.com/peristiwa/usai-upacara...,"{""0"": ""Jokowi memimpin upacara penurunan bende...",Merdeka.com - Presiden Joko Widodo ( Jokowi ) ...,7,Jokowi memimpin upacara penurunan bendera . Us...,5


In [2]:
! pip install deap


Collecting deap
  Downloading deap-1.4.1.tar.gz (1.1 MB)
     ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
     ----------------------------- ---------- 0.8/1.1 MB 6.7 MB/s eta 0:00:01
     ---------------------------------------- 1.1/1.1 MB 5.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: deap
  Building wheel for deap (setup.py): started
  Building wheel for deap (setup.py): finished with status 'done'
  Created wheel for deap: filename=deap-1.4.1-cp311-cp311-win_amd64.whl size=108753 sha256=71b1428c01dc0b139acee59269509663b40943a28ebd551fae1e9fe92e008871
  Stored in directory: c:\users\arief m\appdata\local\pip\cache\wheels\f8\64\b8\65eacfbff3024ae2e2beb22e691d5c8abb89fbd863b8049b5f
Successfully built deap
Installing collected packages: deap
Successfully installed deap-1.4.1




In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import torch
import pandas as pd
from deap import base, creator, tools, algorithms
import numpy as np
from evaluate import load

# Load the tokenizer and the model from the same checkpoint
CHECKPOINT = "panggi/t5-base-indonesian-summarization-cased"
t5_tokenizer = T5Tokenizer.from_pretrained(CHECKPOINT)

# Use CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT).to(device)

# Generate a summary for one article
def generate_summary(article, max_length=150, num_beams=4, repetition_penalty=2.5, length_penalty=1.0):
    """Summarise *article* with the module-level T5 model and tokenizer.

    The article is encoded with truncation at 512 tokens and moved to
    ``device``. ``max_length`` and ``num_beams`` are converted with ``int()``
    before being handed to ``generate``; the two penalties are passed on
    unchanged.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = t5_tokenizer.encode(article, return_tensors='pt', truncation=True, max_length=512)
    input_ids = encoded.to(device)

    generation_options = {
        'max_length': int(max_length),
        'num_beams': int(num_beams),
        'repetition_penalty': repetition_penalty,
        'length_penalty': length_penalty,
        'early_stopping': True,
        'no_repeat_ngram_size': 3,
        'use_cache': True,
    }
    summary_ids = t5_model.generate(input_ids, **generation_options)

    first_sequence = summary_ids[0]
    return t5_tokenizer.decode(first_sequence, skip_special_tokens=True)


# **Dataset carried over from the first cell**
# Largest character length among the reference summaries.
# NOTE(review): this module-level value is not read by the code visible
# here; the functions take their own `max_length` argument. Confirm whether
# it is still needed.
max_length = df_train['summary_text'].str.len().max()
max_steps = 100  # Limit how many rows are used
df_train_sample = df_train.head(max_steps)

# ROUGE metric used by the evaluation function
rouge = load('rouge')

def evaluate_summary(params):
    """Score one set of generation hyperparameters with ROUGE-1.

    Args:
        params: Sequence ``(max_length, num_beams, repetition_penalty,
            length_penalty)`` forwarded to generate_summary.

    Returns:
        The ``'rouge1'`` entry of the ROUGE result computed between the
        generated summaries and ``df_train_sample['summary_text']``. It is
        used as the fitness value.
    """
    max_length, num_beams, repetition_penalty, length_penalty = params
    # Hand plain Python lists to the metric rather than a numpy object array.
    references = df_train_sample['summary_text'].tolist()

    predictions = []
    failures = 0
    last_error = None
    # Only the article text is needed, so iterate that column directly.
    for article in df_train_sample['news_text']:
        try:
            pred = generate_summary(
                article,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty
            )
        except Exception as e:
            # Best effort: a failed generation counts as an empty summary so
            # the candidate is penalised instead of aborting the search.
            pred = ""
            failures += 1
            last_error = e
        predictions.append(pred)

    if failures:
        # Report failures once per evaluation instead of hiding them.
        print(
            f"Warning: {failures}/{len(predictions)} generations failed for "
            f"params {list(params)}; last error: {last_error!r}"
        )

    # ROUGE evaluation
    results = rouge.compute(references=references, predictions=predictions)
    return results['rouge1']  # ROUGE-1 is the fitness value

# **Genetic Algorithm setup**
# Search range of each gene, in the order evaluate_summary unpacks them.
GENE_BOUNDS = (
    (50.0, 150.0),  # max_length
    (2.0, 7.0),     # num_beams (np.random.randint(2, 8) draws 2..7)
    (1.0, 3.0),     # repetition_penalty
    (0.5, 2.0),     # length_penalty
)


def clip_to_bounds(individual):
    """Clamp every gene of *individual* into GENE_BOUNDS, in place."""
    for position, (low, high) in enumerate(GENE_BOUNDS):
        individual[position] = min(max(individual[position], low), high)
    return individual


def evaluate_invalid(individuals):
    """Score only the individuals without a valid fitness; return the count."""
    pending = [ind for ind in individuals if not ind.fitness.valid]
    for ind in pending:
        ind.fitness.values = (toolbox.evaluate(ind),)
    return len(pending)


# Guarded so that re-running this notebook cell does not redefine the classes.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximise ROUGE
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_max_length", np.random.uniform, 50, 150)  # Maximum summary length
toolbox.register("attr_num_beams", np.random.randint, 2, 8)             # Beam search width
toolbox.register("attr_repetition_penalty", np.random.uniform, 1.0, 3.0)  # Repetition penalty
toolbox.register("attr_length_penalty", np.random.uniform, 0.5, 2.0)      # Length penalty

toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_max_length, toolbox.attr_num_beams,
                  toolbox.attr_repetition_penalty, toolbox.attr_length_penalty), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate_summary)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
# NOTE(review): one sigma is shared by genes with very different ranges
# (50-150 versus 0.5-2.0) — consider a per-gene sigma.
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.5, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Run the GA
population = toolbox.population(n=10)  # Initial population
NGEN = 5  # Number of generations
CXPB, MUTPB = 0.5, 0.2  # Crossover and mutation probabilities

# Keeps the best individual ever evaluated; selection and replacement alone
# do not guarantee that it survives into the final population.
hall_of_fame = tools.HallOfFame(1)

print("Mulai Genetic Algorithm...")
# Score the initial population once. Afterwards only individuals changed by
# crossover or mutation are re-scored, because every evaluation runs the
# model over the whole sample.
evaluate_invalid(population)
hall_of_fame.update(population)

for gen in range(NGEN):
    print(f"-- Generasi {gen + 1} --")

    # Selection
    offspring = toolbox.select(population, len(population))
    offspring = list(map(toolbox.clone, offspring))

    # Crossover and mutation. Both are unbounded, so genes are clamped back
    # into the search range to avoid invalid settings such as num_beams < 1.
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if np.random.rand() < CXPB:
            toolbox.mate(child1, child2)
            clip_to_bounds(child1)
            clip_to_bounds(child2)
            del child1.fitness.values, child2.fitness.values

    for mutant in offspring:
        if np.random.rand() < MUTPB:
            toolbox.mutate(mutant)
            clip_to_bounds(mutant)
            del mutant.fitness.values

    # Re-score the individuals whose fitness was invalidated
    evaluate_invalid(offspring)

    # Replace the population with the new offspring
    population[:] = offspring
    hall_of_fame.update(population)

# Report the best result seen across all generations
best_ind = hall_of_fame[0]
print(f"Individu terbaik: {best_ind}")
print(f"Skor ROUGE terbaik: {best_ind.fitness.values[0]}")




Mulai Genetic Algorithm...
-- Generasi 1 --
-- Generasi 2 --
-- Generasi 3 --
-- Generasi 4 --
-- Generasi 5 --
Individu terbaik: [108.42348340677029, 4.0942440229413855, 1.8424340179116698, 1.6747165278030955]
Skor ROUGE terbaik: 0.7556036429596136
