In [1]:
import numpy as np
import networkx as nx
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_text(text):
    """Split an article into sentences and build a normalised copy of each.

    Args:
        text: Raw article text.

    Returns:
        A pair ``(original_sentences, processed_sentences)`` of equal length.
        ``original_sentences`` holds the sentences produced by
        ``sent_tokenize`` after marker removal; ``processed_sentences`` holds,
        for each of them, a space-joined string of its lower-cased
        alphanumeric tokens with English stop words removed.
    """
    # Strip dataset-specific markers and turn doubled apostrophes into a quote.
    # The substitutions are applied one after another, in this order.
    for marker, replacement in (("(CNN)", ""), ("--", ""), ("''", '"')):
        text = text.replace(marker, replacement)

    # Sentence segmentation.
    original_sentences = sent_tokenize(text)

    english_stop_words = set(stopwords.words("english"))

    def _normalise(sentence):
        # Keep purely alphanumeric tokens that are not stop words, lower-cased.
        kept = []
        for token in word_tokenize(sentence):
            lowered = token.lower()
            if token.isalnum() and lowered not in english_stop_words:
                kept.append(lowered)
        return " ".join(kept)

    processed_sentences = [_normalise(sentence) for sentence in original_sentences]

    return original_sentences, processed_sentences

In [3]:
def _sentence_similarity(words_a, words_b):
    """Return the TextRank overlap similarity of two sets of words.

    The score is ``|A & B| / (log|A| + log|B|)``. It is 0 when the sets share
    no word, or when the normaliser is not positive (both sets hold a single
    word, so ``log 1 + log 1 == 0``) and the ratio would be undefined.
    """
    shared = len(words_a & words_b)
    if shared == 0:
        # Also covers empty sets, so the logarithms below are always defined.
        return 0.0
    denominator = np.log(len(words_a)) + np.log(len(words_b))
    if denominator <= 0:
        return 0.0
    return shared / denominator


def textrank_summarize(article, top_n=5, damping_factor=0.85, similarity_threshold=0.1):
    """Build an extractive summary of ``article`` with a TextRank-style ranking.

    Sentences are graph nodes, weighted edges connect sentences that share
    words, edges lighter than ``similarity_threshold`` are dropped, and
    PageRank scores the nodes. The best ``top_n`` sentences are returned in
    their original order.

    Args:
        article: Raw article text; it is passed through ``preprocess_text``.
        top_n: Maximum number of sentences to keep.
        damping_factor: Passed to ``nx.pagerank`` as ``alpha``.
        similarity_threshold: Edges with a weight below this value are removed.

    Returns:
        The selected original sentences joined with single spaces, or ``""``
        when the article yields no sentences.
    """
    # Process the text once.
    original_sentences, processed_sentences = preprocess_text(article)
    n = len(original_sentences)

    if n == 0:
        return ""

    # Build each sentence's word set once rather than inside the pair loop.
    token_sets = [set(sentence.split()) for sentence in processed_sentences]

    # The measure is symmetric, so compute the upper triangle and mirror it.
    similarity_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            similarity = _sentence_similarity(token_sets[i], token_sets[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity

    # Graph and PageRank.
    graph = nx.from_numpy_array(similarity_matrix)
    weak_edges = [
        (u, v)
        for u, v, weight in graph.edges(data="weight")
        if weight < similarity_threshold
    ]
    graph.remove_edges_from(weak_edges)
    scores = nx.pagerank(graph, alpha=damping_factor)

    # Pick the highest-scoring sentences, then restore document order.
    ranked_indices = sorted(
        heapq.nlargest(top_n, range(n), key=lambda index: scores[index])
    )

    return " ".join(original_sentences[index] for index in ranked_indices)

In [5]:
# Load only the first ten training examples of CNN/DailyMail (config "3.0.0").
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:10]",
)

In [6]:
# Summarise every article in batches and store the result in a new
# "generated_summary" column.
dataset_with_summaries = dataset.map(
    lambda batch: {
        "generated_summary": list(map(textrank_summarize, batch["article"]))
    },
    batched=True,
    batch_size=8,  # increase for better throughput
)

# Print the reference highlights next to the generated summary.
for article_no, example in enumerate(dataset_with_summaries, start=1):
    print(f"\nArtykuł {article_no}:")
    print("Oryginalne podsumowanie:", example["highlights"])
    print("Wygenerowane podsumowanie:", example["generated_summary"])
    print("=" * 100)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:00<00:00, 72.45 examples/s]


Artykuł 1:
Oryginalne podsumowanie: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .
Wygenerowane podsumowanie: LONDON, England (Reuters)  Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. At 18, Radcliffe will be able to gamble in a casino, buy a drink i


