In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import heapq
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
nltk.download(['punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger_eng'])
# 1 raz

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [3]:
def get_wordnet_pos(treebank_tag):
    """Mapowanie tagów POS z formatu Treebank do WordNet"""
    return {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }.get(treebank_tag[0], wordnet.NOUN)  # Domyślnie rzeczownik

def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Dodatkowe czyszczenie specyficzne dla CNN/DailyMail
    text = re.sub(r'\(CNN\)|--|\u2014|\u2013', ' ', text)  # Usuwanie elementów typowych dla tego datasetu

    sentences = sent_tokenize(text)
    processed_sentences = []

    for sentence in sentences:
        # Czyszczenie tekstu
        sentence = re.sub(r'[^a-zA-Z\']', ' ', sentence)  # Zachowaj apostrofy w kontrakcjach
        sentence = re.sub(r'\s+', ' ', sentence).strip()

        # Tokenizacja i tagging POS
        words = word_tokenize(sentence.lower())
        pos_tags = nltk.pos_tag(words)

        # Lematyzacja z uwzględnieniem POS
        lemmatized_words = []
        for word, tag in pos_tags:
            if word not in stop_words and len(word) > 1:  # Ignoruj pojedyncze litery
                wn_tag = get_wordnet_pos(tag)
                lemma = lemmatizer.lemmatize(word, wn_tag)
                lemmatized_words.append(lemma)

        processed_sentences.append(lemmatized_words)

    return sentences, processed_sentences


In [4]:
from datasets import load_dataset

ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 287113/287113 [00:01<00:00, 147321.21 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 151115.54 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 172377.29 examples/s]


In [5]:
# Przetwarzanie batchowe dla optymalizacji
def process_examples(batch):
    original, processed = [], []
    for text in batch['article']:
        orig_sent, proc_sent = preprocess_text(text)
        original.append(orig_sent)
        processed.append(proc_sent)
    return {'original_sentences': original, 'processed_sentences': processed}

# Przetwórz dataset (można dostosować batch_size)
ds = ds.map(process_examples, batched=True, batch_size=32)

Map: 100%|██████████| 287113/287113 [1:41:56<00:00, 46.94 examples/s]
Map: 100%|██████████| 13368/13368 [04:40<00:00, 47.61 examples/s]
Map: 100%|██████████| 11490/11490 [04:02<00:00, 47.39 examples/s]


In [9]:
def textrank_summarize(text, top_n=5, damping_factor=0.85, similarity_threshold=0.1):
    original_sentences, processed_sentences = preprocess_text(text)
    n = len(original_sentences)

    similarity_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if i != j:
                intersection = set(processed_sentences[i]) & set(processed_sentences[j])
                denominator = np.log(len(processed_sentences[i])) + np.log(len(processed_sentences[j]))
                similarity_matrix[i][j] = len(intersection) / denominator if denominator != 0 else 0

    # Budowa grafu
    graph = nx.from_numpy_array(similarity_matrix)

    # Usuwanie słabych połączeń
    for u, v, w in graph.edges(data='weight'):
        if w < similarity_threshold:
            graph.remove_edge(u, v)

    # Obliczanie wyników PageRank
    scores = nx.pagerank(graph, alpha=damping_factor)

    # Wybór najlepszych zdań
    ranked_sentences = heapq.nlargest(top_n, ((scores[i], i) for i in range(n)))

    # Sortowanie według kolejności w oryginalnym tekście
    selected_indices = sorted([idx for _, idx in ranked_sentences])
    summary = [original_sentences[idx] for idx in selected_indices]

    return ' '.join(summary)

In [10]:
examples=ds['train'].select(range(50))

In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'original_sentences', 'processed_sentences'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'original_sentences', 'processed_sentences'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'original_sentences', 'processed_sentences'],
        num_rows: 11490
    })
})

In [13]:
def generate_summaries(batch):
    summaries = []
    for article in batch["article"]:
        summary = textrank_summarize(article)
        summaries.append(summary)
    return {"generated_summary": summaries}

# Generowanie streszczeń
dataset_with_summaries = ds.map(
    generate_summaries,
    batched=True,
    batch_size=8,
    remove_columns=["id"]
)

# Wyświetlanie wyników
for i, example in enumerate(dataset_with_summaries):
    print(f"\nArtykuł {i+1}:")
    print("\n Ilość znaków w oryginale: ", len(example["article"]))
    print("Oryginalne podsumowanie:", example["highlights"])
    print("\n Ilość znaków: ", len(example["highlights"]))
    print("Wygenerowane podsumowanie:", example["generated_summary"])
    print("\n Ilość znaków: ", len(example["generated_summary"]))

    print("="*100)

  denominator = np.log(len(processed_sentences[i])) + np.log(len(processed_sentences[j]))
Map:   0%|          | 472/287113 [00:10<1:50:05, 43.39 examples/s]


KeyboardInterrupt: 