In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import heapq
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Download the NLTK data packages that the tokenizing, stop-word, lemmatizing
# and POS-tagging calls in this notebook rely on.
nltk.download(['punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger_eng'])
# Only needs to be run once

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first character of the tag is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        treebank_tag: Tag string such as ``'NN'`` or ``'VBD'``. An empty
            string or ``None`` is accepted.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``.
        Unknown, empty or missing tags fall back to ``wordnet.NOUN``.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slice instead of indexing so an empty tag cannot raise IndexError.
    prefix = treebank_tag[:1] if treebank_tag else ''
    return prefix_to_pos.get(prefix, wordnet.NOUN)  # noun is the default

def preprocess_text(text):
    """Split an article into sentences and reduce each one to content lemmas.

    Dataset boilerplate ("(CNN)", "--", em/en dashes) is replaced by spaces,
    the text is split into sentences, and every sentence is stripped down to
    letters and apostrophes, lower-cased, tokenized, POS-tagged and
    lemmatized. English stop words, single characters and contraction
    fragments produced by the tokenizer are discarded.

    Args:
        text: Raw article text.

    Returns:
        Tuple ``(sentences, processed_sentences)``: the sentence strings as
        returned by ``sent_tokenize`` and, aligned by index, one list of
        lemmas per sentence (a list may be empty).
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Extra cleaning specific to CNN/DailyMail: drop markers typical for this dataset
    text = re.sub(r'\(CNN\)|--|\u2014|\u2013', ' ', text)

    sentences = sent_tokenize(text)
    processed_sentences = []

    for sentence in sentences:
        # Keep letters and apostrophes only (apostrophes preserve contractions)
        sentence = re.sub(r'[^a-zA-Z\']', ' ', sentence)
        sentence = re.sub(r'\s+', ' ', sentence).strip()

        # Tokenize and POS-tag the lower-cased sentence
        words = word_tokenize(sentence.lower())
        pos_tags = nltk.pos_tag(words)

        # POS-aware lemmatization
        lemmatized_words = []
        for word, tag in pos_tags:
            # The tokenizer splits contractions into fragments such as "'s",
            # "'ve" or "n't". Left as-is they are neither stop words nor
            # single characters, so they would count as shared "words"
            # between unrelated sentences. Strip the apostrophes so the
            # stop-word and length filters can catch them.
            token = word.strip("'")
            if token == "n't" or len(token) <= 1 or token in stop_words:
                continue
            lemmatized_words.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

        processed_sentences.append(lemmatized_words)

    return sentences, processed_sentences


In [None]:
from datasets import load_dataset

# Load configuration "3.0.0" of the CNN/DailyMail dataset. No split is requested
# here; later cells pick one by name (e.g. ds['train']).
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [None]:
# Batched processing for efficiency
def process_examples(batch):
    """Run ``preprocess_text`` on every article of a batch.

    Args:
        batch: Mapping with an ``'article'`` entry holding the article texts.

    Returns:
        Dict with two lists aligned with the input articles:
        ``'original_sentences'`` (first element of each ``preprocess_text``
        result) and ``'processed_sentences'`` (second element).
    """
    per_article = [preprocess_text(article) for article in batch['article']]
    return {
        'original_sentences': [raw for raw, _ in per_article],
        'processed_sentences': [tokens for _, tokens in per_article],
    }

# Process the dataset (batch_size can be adjusted)
# NOTE(review): the recorded output shows this run over 287113 examples was
# interrupted (KeyboardInterrupt). In the cells visible here the added columns
# are never read, since textrank_summarize calls preprocess_text itself.
# Confirm this step is still needed.
ds = ds.map(process_examples, batched=True, batch_size=32)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
def textrank_summarize(text, top_n=5, damping_factor=0.85, similarity_threshold=0.1):
    """Build an extractive summary of ``text`` with TextRank.

    Sentences are the nodes of a weighted graph. The weight between two
    sentences is the number of distinct lemmas they share divided by
    ``log(len_i) + log(len_j)``, where ``len`` is the lemma count of each
    sentence. Weights below ``similarity_threshold`` are discarded, PageRank
    scores the sentences, and the ``top_n`` best ones are returned in their
    original order.

    Args:
        text: Raw article text; it is passed through ``preprocess_text``.
        top_n: Maximum number of sentences in the summary.
        damping_factor: PageRank damping parameter (``alpha``).
        similarity_threshold: Minimum weight for an edge to be kept.

    Returns:
        The selected sentences joined by single spaces, or ``''`` when the
        text yields no sentences.
    """
    original_sentences, processed_sentences = preprocess_text(text)
    n = len(original_sentences)
    if n == 0:
        return ''

    # Build each sentence's token set and log-length once rather than for every pair.
    token_sets = [set(tokens) for tokens in processed_sentences]
    # Sentences without tokens get None: log(0) is -inf and emits a RuntimeWarning.
    log_lengths = [np.log(len(tokens)) if tokens else None for tokens in processed_sentences]

    similarity_matrix = np.zeros((n, n))
    for i in range(n):
        if log_lengths[i] is None:
            continue
        # The measure is symmetric, so fill only the upper triangle and mirror it.
        for j in range(i + 1, n):
            if log_lengths[j] is None:
                continue
            denominator = log_lengths[i] + log_lengths[j]
            if denominator == 0:  # both sentences consist of a single lemma
                continue
            overlap = len(token_sets[i] & token_sets[j])
            similarity_matrix[i, j] = similarity_matrix[j, i] = overlap / denominator

    # Drop weak links before the graph is built. Removing edges while iterating
    # over graph.edges() mutates the adjacency dicts being traversed and raises
    # RuntimeError as soon as one edge is below the threshold.
    similarity_matrix[similarity_matrix < similarity_threshold] = 0.0

    # Build the graph (zero entries produce no edge) and compute PageRank scores
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph, alpha=damping_factor)

    # Pick the best sentences, then restore their order in the original text
    ranked_sentences = heapq.nlargest(top_n, ((scores[i], i) for i in range(n)))
    selected_indices = sorted(idx for _, idx in ranked_sentences)

    return ' '.join(original_sentences[idx] for idx in selected_indices)

In [None]:
# Take the first 50 records of the training split.
# NOTE(review): `examples` is not referenced by any other cell visible here;
# confirm it is still needed.
examples=ds['train'].select(range(50))

In [None]:
# Load the data: only the slice "train[:50]" of the training split is requested
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:50]")

In [None]:
# Display a single record.
# NOTE(review): in the cells visible here `example` is only bound by the
# display loop over dataset_with_summaries in a later cell, so this cell
# presumably fails on a fresh top-to-bottom run. Confirm and reorder.
example

{'highlights': "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund .",
 'generated_summary': 'LONDON, England (Reuters)   Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. Radcliffe\'s earnings from the first five Potter films have been held in a trust fun

In [None]:
def generate_summaries(batch):
    """Summarize every article of a batch with ``textrank_summarize``.

    Args:
        batch: Mapping with an ``"article"`` entry holding the article texts.

    Returns:
        Dict with one key, ``"generated_summary"``, holding one summary per
        input article in the same order.
    """
    return {
        "generated_summary": [textrank_summarize(text) for text in batch["article"]]
    }

# Generate the summaries: generate_summaries is applied in batches of 8 and
# the "id" column is dropped from the result.
dataset_with_summaries = dataset.map(
    generate_summaries,
    batched=True,
    batch_size=8,
    remove_columns=["id"]
)

# Display the results: for every article print the character count of the
# article, then the reference highlights and the generated summary, each
# followed by its own character count. The printed labels are in Polish.
for i, example in enumerate(dataset_with_summaries):
    print(f"\nArtykuł {i+1}:")
    print("\n Ilość znaków w oryginale: ", len(example["article"]))
    print("Oryginalne podsumowanie:", example["highlights"])
    print("\n Ilość znaków: ", len(example["highlights"]))
    print("Wygenerowane podsumowanie:", example["generated_summary"])
    print("\n Ilość znaków: ", len(example["generated_summary"]))

    # Separator line between articles
    print("="*100)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  denominator = np.log(len(processed_sentences[i])) + np.log(len(processed_sentences[j]))



Artykuł 1:

 Ilość znaków w oryginale:  2527
Oryginalne podsumowanie: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .

 Ilość znaków:  217
Wygenerowane podsumowanie: LONDON, England (Reuters)   Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. Radcliffe'