## 1. Librerías e inicialización

In [2]:
# Importar las librerías necesarias
import pandas as pd
import re
import nltk
import numpy as np
import networkx as nx
import fitz  # PyMuPDF para lectura de PDF
from typing import List
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from summarizer import Summarizer  # bert-extractive-summarizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by this notebook: the 'punkt' tokenizer
# data, the 'stopwords' word lists and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Overglitch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Overglitch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Overglitch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 2. Clase: DocumentReader

In [3]:
class DocumentReader:
    """Read a document from disk and return its plain text.

    The file is opened with PyMuPDF (``fitz``) and the text of every page is
    concatenated in page order.
    """

    def __init__(self, file_path: str):
        """Remember which file to read.

        Args:
            file_path: Path handed to ``fitz.open`` by ``read_document``.
        """
        self.file_path = file_path

    def read_document(self) -> str:
        """Extract the text of every page of the document.

        Returns:
            The text of all pages concatenated in page order; no separator
            is inserted between pages.

        Raises:
            ValueError: If opening or reading the file fails for any reason.
                The underlying exception is chained as ``__cause__`` so the
                root cause stays visible in the traceback.
        """
        try:
            with fitz.open(self.file_path) as doc:
                # join() avoids rebuilding the string once per page.
                return "".join(page.get_text() for page in doc)
        except Exception as e:
            raise ValueError(f"Error leyendo el archivo PDF: {e}") from e


## 3. Clase: Preprocessor

In [4]:
class Preprocessor:
    """Prepare text for the summarizers: sentence splitting, cleaning, filtering."""

    # Matches every character that is not an ASCII letter, a digit or whitespace.
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')
    # A sentence is discarded when more than this share of its characters are digits.
    _MAX_DIGIT_RATIO = 0.3

    def __init__(self, language: str = 'english'):
        """Load the stopword list and build the stemmer and lemmatizer.

        Args:
            language: Language name passed to the NLTK stopword corpus and to
                ``SnowballStemmer``.
        """
        self.stopwords = nltk.corpus.stopwords.words(language)
        self.stemmer = SnowballStemmer(language)
        # NOTE(review): the lemmatizer does not receive ``language`` -- confirm
        # this is intended when a non-default language is requested.
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def tokenize_sentences(text: str) -> List[str]:
        """Split ``text`` into sentences with NLTK's ``sent_tokenize``."""
        return sent_tokenize(text)

    def preprocess_sentences(self, sentences: List[str]) -> List[str]:
        """Clean each sentence and drop the ones that carry no usable words.

        For every sentence: skip it when digits make up more than 30% of its
        characters; otherwise lowercase it, strip every character that is not
        an ASCII letter, digit or whitespace, tokenize it, remove stopwords,
        and stem then lemmatize each remaining word.

        Args:
            sentences: Raw sentences.

        Returns:
            The cleaned sentences, each a space-joined string of processed
            words. Sentences that were skipped or ended up with no words are
            omitted, so the result can be shorter than ``sentences`` and its
            indices do not necessarily line up with the input.
        """
        # Build the set once per call: membership tests are then O(1) instead
        # of scanning the stopword list for every word.
        stopword_set = set(self.stopwords)

        preprocessed = []
        for sentence in sentences:
            digit_count = sum(char.isdigit() for char in sentence)
            # max(..., 1) guards against division by zero on an empty sentence.
            if digit_count / max(len(sentence), 1) > self._MAX_DIGIT_RATIO:
                continue

            cleaned = self._NON_ALNUM.sub('', sentence.lower())
            # NOTE(review): each token is stemmed first and the stem is then
            # lemmatized -- confirm that applying both is intended.
            words = [
                self.lemmatizer.lemmatize(self.stemmer.stem(word))
                for word in word_tokenize(cleaned)
                if word not in stopword_set
            ]
            if words:  # skip sentences left with no words
                preprocessed.append(' '.join(words))
        return preprocessed


## 4. Clases de Resumen (TF-IDF, TextRank, TF-IDF+TextRank, BERT)

In [5]:
class TFIDFSummarizer:
    """Extractive summarizer that ranks sentences by their total TF-IDF weight."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """Return the highest-scoring sentences joined by single spaces.

        Each preprocessed sentence is scored as the sum of the TF-IDF weights
        of its terms; the top ``num_sentences`` are emitted in descending
        score order (not document order).

        Args:
            sentences: Original sentences, used only to build the output.
            preprocessed_sentences: Cleaned sentences used for scoring.
                Position ``i`` here is used to index ``sentences``, so the
                two lists must be aligned.
            num_sentences: How many sentences to include.

        Returns:
            The selected original sentences joined by spaces, or ``''`` when
            there is nothing to score or ``num_sentences`` is not positive.
        """
        # TfidfVectorizer raises on an empty corpus, and a negative count would
        # slice from the wrong end; both cases simply yield an empty summary.
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
        # One row per sentence: summing a row gives that sentence's score.
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)
        ranked_indices = np.argsort(sentence_scores)[::-1]  # best first
        selected = [sentences[i] for i in ranked_indices[:num_sentences]]
        return ' '.join(selected)


class TextRankSummarizer:
    """Extractive summarizer that ranks sentences with PageRank over a similarity graph."""

    @staticmethod
    def summarize(sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """Return the highest-ranked sentences joined by single spaces.

        The preprocessed sentences are turned into TF-IDF vectors, their
        pairwise cosine similarities become the weighted adjacency matrix of
        a graph, and PageRank on that graph scores each sentence. The top
        ``num_sentences`` are emitted in descending score order.

        Args:
            sentences: Original sentences, used only to build the output.
            preprocessed_sentences: Cleaned sentences used for scoring.
                Position ``i`` here is used to index ``sentences``, so the
                two lists must be aligned.
            num_sentences: How many sentences to include.

        Returns:
            The selected original sentences joined by spaces, or ``''`` when
            there is nothing to score or ``num_sentences`` is not positive.
        """
        # TfidfVectorizer raises on an empty corpus, and a negative count would
        # slice from the wrong end; both cases simply yield an empty summary.
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix)
        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)
        # Sort (score, node) pairs so the best-scoring node comes first.
        ranked_indices = sorted(((scores[node], node) for node in nx_graph.nodes), reverse=True)
        selected = [sentences[i] for _, i in ranked_indices[:num_sentences]]
        return ' '.join(selected)


class CombinedSummarizer:
    """Extractive summarizer that scores sentences by shared TF-IDF/TextRank keywords."""

    def __init__(self, top_n_keywords: int = 10):
        """Configure how many keywords each method contributes.

        Args:
            top_n_keywords: Number of top-ranked words kept from each of the
                two keyword extractors.
        """
        self.top_n_keywords = top_n_keywords

    def extract_keywords_tfidf(self, preprocessed_sentences: List[str]) -> List[str]:
        """Return the ``top_n_keywords`` terms with the highest summed TF-IDF weight."""
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
        # Summing over axis 0 totals each term's weight across all sentences.
        tfidf_scores = zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray().sum(axis=0))
        sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
        return [word for word, _ in sorted_scores[:self.top_n_keywords]]

    def extract_keywords_textrank(self, preprocessed_sentences: List[str]) -> List[str]:
        """Return the ``top_n_keywords`` words ranked by PageRank on a co-occurrence graph.

        All sentences are joined into one word sequence; every pair of
        adjacent words adds 1 to the weight of the undirected edge between
        them (so pairs spanning a sentence boundary are counted too).
        """
        words = ' '.join(preprocessed_sentences).split()
        co_occurrence_graph = nx.Graph()
        for first, second in zip(words, words[1:]):
            if co_occurrence_graph.has_edge(first, second):
                co_occurrence_graph[first][second]['weight'] += 1
            else:
                co_occurrence_graph.add_edge(first, second, weight=1)
        ranks = nx.pagerank(co_occurrence_graph, weight='weight')
        sorted_ranks = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
        return [word for word, _ in sorted_ranks[:self.top_n_keywords]]

    def combined_keywords(self, preprocessed_sentences: List[str]) -> List[str]:
        """Return the words selected by both extractors (order is unspecified)."""
        tfidf_keywords = self.extract_keywords_tfidf(preprocessed_sentences)
        textrank_keywords = self.extract_keywords_textrank(preprocessed_sentences)
        return list(set(tfidf_keywords) & set(textrank_keywords))

    def summarize(self, sentences: List[str], preprocessed_sentences: List[str], num_sentences: int = 1) -> str:
        """Return the sentences containing the most combined keywords.

        A sentence's score is the number of its words (repeats included) that
        belong to the combined keyword set. Ties keep document order.

        Args:
            sentences: Original sentences, used only to build the output.
            preprocessed_sentences: Cleaned sentences used for scoring.
                Position ``i`` here is used to index ``sentences``, so the
                two lists must be aligned.
            num_sentences: How many sentences to include.

        Returns:
            The selected original sentences joined by spaces, or ``''`` when
            there is nothing to score or ``num_sentences`` is not positive.
        """
        # TfidfVectorizer raises on an empty corpus, and a negative count would
        # slice from the wrong end; both cases simply yield an empty summary.
        if not preprocessed_sentences or num_sentences <= 0:
            return ''

        # A set makes the per-word membership test O(1).
        keywords = set(self.combined_keywords(preprocessed_sentences))
        sentence_scores = [
            (sum(1 for word in sentence.split() if word in keywords), i)
            for i, sentence in enumerate(preprocessed_sentences)
        ]
        # sorted() is stable, so equally scored sentences stay in document order.
        ranked_sentences = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
        selected = [sentences[i] for _, i in ranked_sentences[:num_sentences]]
        return ' '.join(selected)


class BERTSummarizer:
    """Extractive summarizer backed by a pretrained BERT model."""

    def __init__(self):
        """Create the underlying ``Summarizer`` with its default settings."""
        self.model = Summarizer()

    def summarize(self, text: str, num_sentences: int = 1) -> str:
        """Summarize ``text`` into ``num_sentences`` sentences.

        Args:
            text: Full document text (not preprocessed).
            num_sentences: Forwarded to the model as ``num_sentences``.

        Returns:
            The model's output concatenated with no separator.
        """
        model_output = self.model(text, num_sentences=num_sentences)
        return ''.join(model_output)


## 5. Evaluación de ROUGE

In [17]:
def calculate_rouge_scores(reference_summary: str, generated_summary: str) -> dict:
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference_summary: The reference text, passed to the scorer first.
        generated_summary: The summary being evaluated, passed second.

    Returns:
        A dict with keys ``'ROUGE-1'``, ``'ROUGE-2'`` and ``'ROUGE-L'`` whose
        values are the ``fmeasure`` of the corresponding score.
    """
    # Scorer metric name -> label used in the returned dict.
    label_by_metric = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}
    scorer = rouge_scorer.RougeScorer(list(label_by_metric), use_stemmer=True)
    raw_scores = scorer.score(reference_summary, generated_summary)
    return {
        label: raw_scores[metric].fmeasure
        for metric, label in label_by_metric.items()
    }


## 6. Ejecución

In [25]:
# Initial parameters
file_path = 'resources/f5df9a1942d2e626c5448416a05ffc9ca2d2369c.txt'  # Path of the document to summarize
reference_summary = """
    Deputy police commissioner Nick Kaldas is giving evidence at an inquiry . Kaldas, 57, is a counter terrorism expert who has trained Iraqi police . He arrived in Australia aged 12 and fluent in English, French and Arabic . The inquiry is into a illegal police bugging operation of 114 people in 2000 . Kaldas is the highest ranking officer secretly bugged by his rival Kath Burn . He has 'explosive' evidence about bugging which has 'denigrated' his career . He has suffered reprisals for speaking out about the bugging scandal . The bugging operation threatens to blow apart NSW police hierarchy . He said independent inquiry into bugging scandal has left him fearful . Claimed Operation Prospect had sided with the officers being complained about and targeted him and other victims .
    """
num_sentences = 1

# Read the document and split it into sentences
reader = DocumentReader(file_path)
text = reader.read_document()

preprocessor = Preprocessor()
raw_sentences = preprocessor.tokenize_sentences(text)

# preprocess_sentences() drops sentences (digit-heavy ones, and ones left with
# no words after cleaning), so running it on the whole list can return a
# shorter list whose positions no longer match the originals. The summarizers
# pick sentences[i] using indices computed on preprocessed_sentences, so the
# two lists are built in lockstep here, keeping only sentences that survive.
sentences = []
preprocessed_sentences = []
for raw_sentence in raw_sentences:
    cleaned = preprocessor.preprocess_sentences([raw_sentence])
    if cleaned:
        sentences.append(raw_sentence)
        preprocessed_sentences.append(cleaned[0])

# Generate the summaries
tfidf_summarizer = TFIDFSummarizer()
tfidf_summary = tfidf_summarizer.summarize(sentences, preprocessed_sentences, num_sentences)

textrank_summarizer = TextRankSummarizer()
textrank_summary = textrank_summarizer.summarize(sentences, preprocessed_sentences, num_sentences)

combined_summarizer = CombinedSummarizer()
combined_summary = combined_summarizer.summarize(sentences, preprocessed_sentences, num_sentences)

# The BERT summarizer works on the raw text, not on the preprocessed sentences.
bert_summarizer = BERTSummarizer()
bert_summary = bert_summarizer.summarize(text, num_sentences)

# Print the summaries
print(f"Resumen TF-IDF:\n{tfidf_summary}\n")
print(f"Resumen TextRank:\n{textrank_summary}\n")
print(f"Resumen Combinado:\n{combined_summary}\n")
print(f"Resumen BERT:\n{bert_summary}\n")

Resumen TF-IDF:
On the ground: Nick Kaldas, pictured with
then police minister John Watkins in 2006, warned on
his return from Iraq that year that 'Australia is
very much part of the international community and if
something happens in Palestine or Iraq, we have to
accept that it has an impact over here' In the lead
up to the inquiry, Ms Burn has denied falsifying
evidence in the warrant to Judge Bell to secretly bug
police, or to 'use illegal warrants to secretly
record conversations of my rivals in the police
force'.

Resumen TextRank:
Man of integrity:
One of the country's most distinguished police
officers, NSW deputy commissioner Nick Kaldas
(pictured) has worked in Iraq, locked up murderers
and trained with the FBI, but his stellar career has
been dogged by an illegal bugging operation
commandeered in 2000 by his rival for the top job .

Resumen Combinado:
NSW Deputy Police
Commissioner Nick Kaldas told a parliamentary inquiry
he has been punished for speaking out, including
being

## 7. Resultados

In [26]:
# Score every generated summary against the reference
tfidf_rouge = calculate_rouge_scores(reference_summary, tfidf_summary)
textrank_rouge = calculate_rouge_scores(reference_summary, textrank_summary)
combined_rouge = calculate_rouge_scores(reference_summary, combined_summary)
bert_rouge = calculate_rouge_scores(reference_summary, bert_summary)

# Assemble the results table: one row per model, one column per metric.
# The score dicts are listed in the same order as the model names.
scores_per_model = [tfidf_rouge, textrank_rouge, combined_rouge, bert_rouge]
data = {'Modelo': ['TF-IDF', 'TextRank', 'Combinado', 'BERT']}
for metric in ('ROUGE-1', 'ROUGE-2', 'ROUGE-L'):
    data[metric] = [model_scores[metric] for model_scores in scores_per_model]

# Print the table with four decimal places and no index column
result = pd.DataFrame(data)
print(result.to_string(index=False, float_format='{:.4f}'.format))

   Modelo  ROUGE-1  ROUGE-2  ROUGE-L
   TF-IDF   0.2817   0.0284   0.1502
 TextRank   0.3314   0.0809   0.2057
Combinado   0.3218   0.1279   0.2184
     BERT   0.2420   0.0258   0.1529
