In [1]:
# Install every third-party package this notebook imports, not just the model stack:
#   sentencepiece - backend required by T5Tokenizer
#   rouge-score   - provides the `rouge_score` module used for evaluation
#   openpyxl      - engine pandas uses to read .xlsx files
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install transformers torch sentencepiece rouge-score pandas openpyxl

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [18]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
from rouge_score import rouge_scorer

# Checkpoint shared by the tokenizer and the seq2seq model below.
model_name = "t5-small"

# Tokenizer first, then the weights; both are read as module-level globals
# by summarize_text().
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def summarize_text(text, max_length=150):
    """Summarize ``text`` using the module-level ``model`` and ``tokenizer``.

    Args:
        text: The text to summarize. ``None``, NaN (what pandas yields for an
            empty spreadsheet cell) and blank strings produce an empty
            summary; any other non-string value is converted with ``str()``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarize.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Nothing to summarize; skip the (expensive) generation call.
    if not text.strip():
        return ""

    # Prepare the text for the model: prepend the "summarize: " prefix and
    # encode it, truncating over-long inputs.
    input_text = "summarize: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

    # Generate the summary with beam search and decode the first sequence.
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def calculate_rouge_scores(references, summaries):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired texts.

    Args:
        references: Iterable of reference texts.
        summaries: Iterable of generated texts, paired with ``references``
            by position.

    Returns:
        Dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to
        the mean F-measure over all scored pairs. When there are no pairs to
        score, every value is ``0.0``.

    Warns:
        UserWarning: if the two inputs differ in length. Only the leading
        pairs that exist in both are scored, so the average would otherwise
        silently cover fewer items than the caller expects.
    """
    import warnings

    metrics = ['rouge1', 'rouge2', 'rougeL']
    references = list(references)
    summaries = list(summaries)

    if len(references) != len(summaries):
        warnings.warn(
            f"calculate_rouge_scores: got {len(references)} references but "
            f"{len(summaries)} summaries; only the first "
            f"{min(len(references), len(summaries))} pairs are scored",
            stacklevel=2,
        )

    # No pairs to score: avoid dividing by zero when averaging.
    if not references or not summaries:
        return {metric: 0.0 for metric in metrics}

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    scores = {metric: [] for metric in metrics}

    for ref, summ in zip(references, summaries):
        score = scorer.score(ref, summ)
        for metric in metrics:
            scores[metric].append(score[metric].fmeasure)

    # Calculate average scores
    return {metric: sum(values) / len(values) for metric, values in scores.items()}

# Read the Excel file
excel_file_path = 'text-test.xlsx'  # replace with the path to your file
df_excel = pd.read_excel(excel_file_path)

# Validate that the required columns exist
if 'article' not in df_excel.columns or 'highlights' not in df_excel.columns:
    raise ValueError("Kolom 'article' atau 'highlights' tidak ditemukan dalam dataset")

# Validate that the columns are not entirely empty
if df_excel['article'].isnull().all() or df_excel['highlights'].isnull().all():
    raise ValueError("Kolom 'article' atau 'highlights' tidak memiliki data yang valid")

# Upper bound on how many articles to summarize. The same row selection is
# used for summaries, references and the write-back, so they cannot drift apart.
MAX_ARTICLES = 998

# Keep only rows where both texts are present (a single empty cell would
# otherwise crash summarization or scoring), then cap at MAX_ARTICLES; this
# also copes with sheets shorter than the cap.
eval_rows = df_excel.dropna(subset=['article', 'highlights']).head(MAX_ARTICLES)

# Summarize the selected articles
summaries = []
for article in eval_rows['article'].astype(str):
    summary = summarize_text(article)
    summaries.append(summary)
    print(f"Original text:\n{article}\n")
    print(f"Summary:\n{summary}\n\n")

# Compute ROUGE against the references of exactly the rows that were summarized
references = eval_rows['highlights'].astype(str).tolist()
rouge_scores = calculate_rouge_scores(references, summaries)

print("ROUGE scores:", rouge_scores)

# Save the results to a new file. A single .loc assignment is used instead of
# chained `df[col].iloc[...] = ...`, which may write to a temporary copy.
# NOTE(review): only the summaries are written to the CSV; the ROUGE scores
# are printed above but not persisted, despite the file name and message below.
df_excel['generated_summary'] = ""
df_excel.loc[eval_rows.index, 'generated_summary'] = summaries
df_excel.to_csv('summarized_text_with_scores.csv', index=False)

print("Ringkasan teks dan skor ROUGE telah disimpan ke summarized_text_with_scores.csv")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Original text:
During a recent episode of Divorce Court, a disgruntled man said the members of Wu-Tang Clan did something with his girlfriend - and it wasn't unleashing their famed Killa Beez. Nathan Sellers went on the show with his ex, Lia Palmquist, and accused her of sleeping with every member of the Wu-Tang Clan during one night as she partied with them at a hotel after a show. That would mean Palmquist had sex with the RZA, the GZA, Ghostface Killah, Masta Killa, U-God, Inspectah Deck, Raekwon, Cappadonna and Method Man. The tenth member of the Wu, Ol' Dirty Bastard, is dead so he wasn't involved no matter what. Nathan Sellers (right) went on Divorce Court with Lia Palmquist (left) and said she slept with the Wu-Tang Clan . American rap group Wu-Tang Clan (L - R) Ghostface Killah, Masta Killa, Raekwon, RZA, Ol' Dirty Bastard, GZA, U-God and Method Man pose for a April 1997 portrait in New York City, New York. ODB is