In [None]:
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from rouge import Rouge
import spacy 



In [None]:
# Load the "3.0.0" variant of the cnn_dailymail dataset; later cells read
# articles and highlights from its "train" split.
dataset = load_dataset("cnn_dailymail", "3.0.0")
# Load spaCy's "en_core_web_sm" pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible here; confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess_text(text):
    """Split an article into sentences and build a cleaned copy of each one.

    The markers "(CNN)" and "--" are stripped from the text and every pair of
    single quotes ('') is turned into a double quote before the text is split
    into sentences.

    Args:
        text: Raw article text.

    Returns:
        A tuple ``(preprocessed, sentences)``. ``sentences`` holds the
        sentences of the marker-stripped text. ``preprocessed`` holds, at the
        same positions, the lower-cased alphanumeric non-stop-word tokens of
        each sentence joined by single spaces.
    """
    # Apply the substitutions in the same order the markers are listed.
    for marker, replacement in (("(CNN)", ""), ("--", ""), ("''", '"')):
        text = text.replace(marker, replacement)

    # Sentence segmentation.
    sentences = sent_tokenize(text)

    english_stop_words = set(stopwords.words("english"))

    def _content_words(sentence):
        # Keep only alphanumeric tokens that are not English stop words.
        return " ".join(
            token
            for token in word_tokenize(sentence.lower())
            if token.isalnum() and token not in english_stop_words
        )

    preprocessed = [_content_words(sentence) for sentence in sentences]

    return preprocessed, sentences

In [None]:
def textrank_cnn_dailymail(article, num_sentences=5):
    """Build an extractive summary of ``article`` with TextRank.

    The cleaned sentences returned by ``preprocess_text`` are vectorised with
    TF-IDF, connected in a graph whose edge weights are their pairwise cosine
    similarities, and scored with PageRank. The ``num_sentences``
    highest-scoring sentences are returned in document order, joined by
    single spaces.

    Args:
        article: Raw article text.
        num_sentences: Maximum number of sentences to keep in the summary.

    Returns:
        The summary string. It is empty when ``num_sentences`` is not positive
        or the article yields no sentences. When the article has at most
        ``num_sentences`` sentences, all of them are returned. When no
        sentence contains a token usable for TF-IDF, the leading
        ``num_sentences`` sentences are returned instead of raising.
    """
    preprocessed, original_sentences = preprocess_text(article)

    if num_sentences <= 0:
        return ""

    # Every sentence would be selected anyway, so skip the ranking. This also
    # covers an empty article, for which the vectorizer would raise.
    if len(original_sentences) <= num_sentences:
        return " ".join(original_sentences)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed)
    except ValueError:
        # The vectorizer found an empty vocabulary (e.g. only stop words or
        # single-character tokens survived preprocessing), so there is nothing
        # to rank on; fall back to the leading sentences.
        return " ".join(original_sentences[:num_sentences])

    sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # A sentence must not vote for itself: drop self-similarity before
    # building the graph.
    np.fill_diagonal(sim_matrix, 0)
    graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(graph, max_iter=1000, tol=0.0001)

    # Highest score first; ties are broken in favour of the later sentence,
    # which matches sorting (score, index) pairs in reverse.
    ranked_indices = sorted(
        range(len(original_sentences)),
        key=lambda idx: (scores[idx], idx),
        reverse=True,
    )
    # Restore document order so the summary reads naturally.
    selected_indices = sorted(ranked_indices[:num_sentences])

    return " ".join(original_sentences[idx] for idx in selected_indices)

In [18]:
# The example article and its reference highlight were not defined in any
# earlier cell, so this cell failed with NameError on a fresh kernel.
# NOTE(review): training record 0 is used because the saved output of this cell
# matches the summary printed for that record by the batch loop; confirm.
przyklad = dataset["train"][0]
przykladowy_artykul = przyklad["article"]
przykladowy_highlight = przyklad["highlights"]

# Summarise the example article and show it next to the reference highlight.
podsumowanie = textrank_cnn_dailymail(przykladowy_artykul, num_sentences=5)
print("Wygenerowane podsumowanie:\n", podsumowanie)
print("\nDocelowy highlight:\n", przykladowy_highlight)

Wygenerowane podsumowanie:
 LONDON, England (Reuters)  Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair 

In [19]:
# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Summarise the first 10 training articles and print each summary next to
# its reference highlights.
train_split = dataset["train"]
separator = "\n" + "-" * 50 + "\n"
for i in range(10):
    record = train_split[i]
    article, highlight = record["article"], record["highlights"]
    # `summary` keeps its name: a later cell displays the value left by the last iteration.
    summary = textrank_cnn_dailymail(article, num_sentences=5)
    print(f"Artykuł {i+1}:")
    print("Podsumowanie:", summary)
    print("Highlight:", highlight)
    print(separator)

Artykuł 1:
Podsumowanie: LONDON, England (Reuters)  Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair gam

In [20]:
# `summary` was last assigned inside the loop of the preceding cell, so this
# displays the summary of the final article processed there.
summary

'  Police and FBI agents are investigating the discovery of an empty rocket launcher tube on the front lawn of a Jersey City, New Jersey, home, FBI spokesman Sean Quinn said. Niranjan Desai discovered the 20-year-old AT4 anti-tank rocket launcher tube, a one-time-use device, lying on her lawn Friday morning, police said. The launcher has been turned over to U.S. Army officials at the 754th Ordnance Company, an explosive ordnance disposal unit, at Fort Monmouth, New Jersey, Army officials said. Army officials said they could not determine if the launcher had been fired, but indicated they should know once they find out where it came from. An Army official said the device is basically a shoulder-fired, direct-fire weapon used against ground targets  a modern-day bazooka  and it is not wire-guided.'

In [21]:
# Score the generated summary against the reference highlight.
rouge = Rouge()
wyniki = rouge.get_scores(podsumowanie, przykladowy_highlight)

# Report the "f" value stored under "rouge-l" in the first score entry.
rouge_l = wyniki[0]["rouge-l"]
print("ROUGE-L:", rouge_l["f"])

ROUGE-L: 0.25766870821483684
