In [None]:
import os
import numpy as np
from bs4 import BeautifulSoup

def preprocess_text(text):
    text = text.strip().replace('\n', ' ')
    text = ' '.join(text.split())
    return text

text_dir = '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/text'
summaries_dir = '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries'

def load_text_and_summaries(text_dir, summaries_dir):
    documents = []
    gold_summaries = []
    file_names = []
    
    # Load text files
    for root, dirs, files in os.walk(text_dir):
        for file in files:
            summary_file = os.path.join(summaries_dir, file + '.txt')
            
            text_path = os.path.join(root, file)
            try:
                text = extract_text_from_sgml(text_path)
                text = preprocess_text(text) 
            except Exception as e:
                print(f"Error reading text file '{file}': {e}")
                continue
            
            # Read summary file
            try:
                with open(summary_file, 'r', encoding='utf-8') as f:
                    summary = f.read().replace('Abstract:', '', 1).strip()
                    summary = preprocess_text(summary)  
            except Exception as e:
                print(f"Error reading summary file '{file}': {e}")
                summary = ""
                continue
            
            documents.append(text)
            gold_summaries.append(summary)
            file_names.append(file)
    
    print(f"\nTotal documents loaded: {len(documents)}")
    return documents, gold_summaries, file_names

def extract_text_from_sgml(file_path):
    with open(file_path, 'r', encoding='latin1') as f:
        raw = f.read()
    soup = BeautifulSoup(raw, 'html.parser')
    text_tag = soup.find(lambda tag: tag.name and tag.name.lower() == "text")
    if text_tag is None:
        raise ValueError("No <text> tag found in the file")
    return text_tag.get_text()

documents, gold_summaries, file_names = load_text_and_summaries(text_dir, summaries_dir)

Error reading summary file 'LA070190-0073': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/LA070190-0073.txt'
Error reading summary file 'AP830325-0143': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/AP830325-0143.txt'
Error reading summary file 'LA071589-0076': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/LA071589-0076.txt'
Error reading summary file 'FBIS4-4674': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/FBIS4-4674.txt'
Error reading summary file 'AP880928-0054': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/AP880928-0054.txt'
Error reading summary file 'AP891116-0191': [Errno 2] No such file or directory: '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw5A/DUC2001/Summaries/AP891116-0191.txt'
Error read

In [2]:
documents[10]

"The health department said it is providing tuberculosis testing and treatment for the Human Resources Administration's program for the homeless, and will train staff members on tuberculosis prevention and control. The department also has an established residence for homeless tuberculosis patients, and is working with substance-abuse treatment services to extend tuberculosis prevention in its programs. The Board of Health approved a resolution last year requiring all children entering city schools to be tested. The Health Department estimates that one million New Yorkers may be infected by the TB germ. But only a fraction of a percent of those who have active tuberculosis disease can spread the infection to susceptible individuals. The germ is inactive in more than 99% of those infected. Those at high risk for contracting TB are people whose capacity for resisting infection is weakened, either through diseases such as HIV infection, by drug or alcohol abuse, serious illness such as can

In [3]:
gold_summaries[10]

'The New York City Health Commissioner said that the number of active cases of tuberculosis in the city in 1990 totaled 3,520, an increase of 38 percent over 1989. The Department said that it was instituting new cure programs, and taking measures to improve tuberculosis prevention programs. Children must be tested before entering school, and homeless centers and substance-abuse centers are getting special attention. The greatest concentration of new cases was in the 25-44 age group. The number of men in the caseload was twice that the number of women. Introduction: NEW YORK -- The incidence of active tuberculosis cases in the city rose 38% in 1990, to 3,520 cases, according to the health commissioner. The commissioner, Dr. Woodrow A. Myers Jr., said that although the increase causes concern, the city Department of Health is "acting aggressively to halt the epidemic." The health department has a high cure rate in its tuberculosis clinics, with more than 70% successfully completing thera

In [4]:
file_names[10]

'WSJ910304-0002'

In [8]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string
import math
from itertools import combinations

nltk.download('punkt')
nltk.download('stopwords')

#word distribution
def get_word_distribution(text):
    words = word_tokenize(text.lower())
    words = [w for w in words if w not in stopwords.words('english')]
    words = [w for w in words if w not in string.punctuation]
    total = len(words)
    counter = Counter(words)
    return {w: c / total for w, c in counter.items()} if total > 0 else {}

def kl_divergence(p, q, epsilon=1e-6):
    vocab = set(p.keys()).union(set(q.keys()))
    return sum(p[w] * math.log(p[w] / q.get(w, epsilon)) for w in vocab if p[w] > 0)

def klsum_word_based_greedy(text, max_sentences=3):
    sentences = sent_tokenize(text)
    if len(sentences) <= max_sentences:
        return ' '.join(sentences)
    
    PD = get_word_distribution(text)
    selected = []
    remaining = [s.strip() for s in sentences]

    for _ in range(max_sentences):
        best_sent = None
        best_score = float('inf')

        for sent in remaining:
            candidate_summary = ' '.join(selected + [sent])
            PS = get_word_distribution(candidate_summary)
            score = kl_divergence(PD, PS)

            if score < best_score:
                best_score = score
                best_sent = sent

        if best_sent:
            selected.append(best_sent)
            remaining.remove(best_sent)
    
    return ' '.join(selected)

[nltk_data] Downloading package punkt to /Users/hunjunsin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hunjunsin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
documents[10]

"The health department said it is providing tuberculosis testing and treatment for the Human Resources Administration's program for the homeless, and will train staff members on tuberculosis prevention and control. The department also has an established residence for homeless tuberculosis patients, and is working with substance-abuse treatment services to extend tuberculosis prevention in its programs. The Board of Health approved a resolution last year requiring all children entering city schools to be tested. The Health Department estimates that one million New Yorkers may be infected by the TB germ. But only a fraction of a percent of those who have active tuberculosis disease can spread the infection to susceptible individuals. The germ is inactive in more than 99% of those infected. Those at high risk for contracting TB are people whose capacity for resisting infection is weakened, either through diseases such as HIV infection, by drug or alcohol abuse, serious illness such as can

In [10]:
from math import ceil
from nltk.tokenize import sent_tokenize

document = documents[10]

sentences = sent_tokenize(document)
num_sentences = len(sentences)

max_sentences = ceil(num_sentences * 0.25)

summary = klsum_word_based_greedy(document, max_sentences=max_sentences)

print("Total sentences :", num_sentences)
print("Summary sentences (25%):", max_sentences)
print("Generated summary:\n", summary)

Total sentences : 9
Summary sentences (25%): 3
Generated summary:
 The health department said it is providing tuberculosis testing and treatment for the Human Resources Administration's program for the homeless, and will train staff members on tuberculosis prevention and control. Those at high risk for contracting TB are people whose capacity for resisting infection is weakened, either through diseases such as HIV infection, by drug or alcohol abuse, serious illness such as cancer, or by poor nutrition. The Health Department estimates that one million New Yorkers may be infected by the TB germ.


In [16]:
import os
from tqdm import tqdm
from math import ceil
from nltk.tokenize import sent_tokenize
from rouge import Rouge

rouge = Rouge()

rouge_scores = []

min_tokens = 5

for i, (doc_text, gold_summary, fileName) in enumerate(tqdm(zip(documents, gold_summaries, file_names), total=len(documents))):

    sentences = sent_tokenize(doc_text)
    num_sentences = len(sentences)
    max_sentences = ceil(num_sentences * 0.25)

    try:
        pred_summary = klsum_word_based_greedy(doc_text, max_sentences=max_sentences)
    except Exception as e:
        continue

    pred_summary_clean = str(pred_summary).strip().replace('\n', ' ')
    gold_summary_clean = str(gold_summary).strip().replace('\n', ' ')


    if len(gold_summary_clean.split()) < min_tokens:
        print(f"Skipping file '{fileName}' due to short gold summary.")
        continue
    if len(pred_summary_clean.split()) < min_tokens:
        print(f"Skipping file '{fileName}' due to short predicted summary.")
        continue
    
    scores = rouge.get_scores(pred_summary_clean, gold_summary_clean)[0]  
    rouge_scores.append((fileName, scores))

if rouge_scores:
    all_scores = [s for _, s in rouge_scores]
    avg_scores = {
        "rouge-1": {"f": 0, "p": 0, "r": 0},
        "rouge-2": {"f": 0, "p": 0, "r": 0},
        "rouge-l": {"f": 0, "p": 0, "r": 0},
    }

    for s in all_scores:
        for key in avg_scores:
            for metric in avg_scores[key]:
                avg_scores[key][metric] += s[key][metric]

    for key in avg_scores:
        for metric in avg_scores[key]:
            avg_scores[key][metric] /= len(all_scores)

    print("\n[Average ROUGE score]")
    for key in avg_scores:
        f1 = avg_scores[key]["f"]
        print(f"{key.upper()} F1: {f1:.4f}")



100%|██████████| 301/301 [16:00<00:00,  3.19s/it]  


[Average ROUGE score]
ROUGE-1 F1: 0.6064
ROUGE-2 F1: 0.5027
ROUGE-L F1: 0.6060





B: Topic distribution

Train LDA

In [18]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string

processed_docs = [preprocess_string(doc) for doc in documents] # tokenize, lemmitize, stopword

dictionary = Dictionary(processed_docs) # word : id
corpus = [dictionary.doc2bow(doc) for doc in processed_docs] # bag of words

lda_model = LdaModel(corpus, num_topics=30, id2word=dictionary, passes=10)

In [24]:
documents[10]

"\n   The health department said it is providing tuberculosis\ntesting and treatment for the Human Resources\nAdministration's program for the homeless, and will train\nstaff members on tuberculosis prevention and control. The\ndepartment also has an established residence for homeless\ntuberculosis patients, and is working with substance-abuse\ntreatment services to extend tuberculosis prevention in its\nprograms. The Board of Health approved a resolution last year\nrequiring all children entering city schools to be tested.\n   The Health Department estimates that one million New\nYorkers may be infected by the TB germ. But only a fraction\nof a percent of those who have active tuberculosis disease\ncan spread the infection to susceptible individuals. The germ\nis inactive in more than 99% of those infected.\n   Those at high risk for contracting TB are people whose\ncapacity for resisting infection is weakened, either through\ndiseases such as HIV infection, by drug or alcohol abuse,\

In [23]:
processed_docs[10]

['health',
 'depart',
 'said',
 'provid',
 'tuberculosi',
 'test',
 'treatment',
 'human',
 'resourc',
 'administr',
 'program',
 'homeless',
 'train',
 'staff',
 'member',
 'tuberculosi',
 'prevent',
 'control',
 'depart',
 'establish',
 'resid',
 'homeless',
 'tuberculosi',
 'patient',
 'work',
 'substanc',
 'abus',
 'treatment',
 'servic',
 'extend',
 'tuberculosi',
 'prevent',
 'program',
 'board',
 'health',
 'approv',
 'resolut',
 'year',
 'requir',
 'children',
 'enter',
 'citi',
 'school',
 'test',
 'health',
 'depart',
 'estim',
 'million',
 'new',
 'yorker',
 'infect',
 'germ',
 'fraction',
 'percent',
 'activ',
 'tuberculosi',
 'diseas',
 'spread',
 'infect',
 'suscept',
 'individu',
 'germ',
 'inact',
 'infect',
 'high',
 'risk',
 'contract',
 'peopl',
 'capac',
 'resist',
 'infect',
 'weaken',
 'diseas',
 'hiv',
 'infect',
 'drug',
 'alcohol',
 'abus',
 'ill',
 'cancer',
 'poor',
 'nutrit',
 'greatest',
 'concentr',
 'tuberculosi',
 'ag',
 'group',
 'account',
 'total',
 '

In [3]:
lda_model.print_topics(num_words=5)

[(1,
  '0.015*"diamond" + 0.011*"mine" + 0.009*"finsch" + 0.009*"jwaneng" + 0.009*"underground"'),
 (0,
  '0.023*"bank" + 0.015*"world" + 0.011*"sai" + 0.011*"nafta" + 0.010*"countri"'),
 (28,
  '0.016*"welfar" + 0.015*"build" + 0.012*"air" + 0.012*"ventil" + 0.008*"year"'),
 (26,
  '0.018*"polic" + 0.012*"suspect" + 0.011*"said" + 0.011*"brutal" + 0.009*"offic"'),
 (12,
  '0.044*"thoma" + 0.016*"limit" + 0.016*"court" + 0.015*"term" + 0.011*"black"'),
 (17,
  '0.018*"race" + 0.014*"marathon" + 0.012*"second" + 0.009*"hale" + 0.009*"davi"'),
 (27,
  '0.022*"said" + 0.022*"hurrican" + 0.008*"wind" + 0.007*"center" + 0.006*"sheet"'),
 (6,
  '0.026*"eclips" + 0.019*"said" + 0.012*"offic" + 0.009*"smith" + 0.009*"polic"'),
 (14,
  '0.018*"marathon" + 0.015*"said" + 0.012*"race" + 0.011*"year" + 0.010*"mile"'),
 (23,
  '0.030*"taylor" + 0.022*"said" + 0.014*"hospit" + 0.011*"doctor" + 0.010*"pneumonia"'),
 (29,
  '0.018*"path" + 0.017*"shine" + 0.016*"count" + 0.016*"censu" + 0.014*"said"')

Get topic distribution of the document = PD

In [19]:
import numpy as np
from scipy.stats import entropy

def get_topic_distribution(text, lda_model, dictionary, num_topics=30):
    tokens = preprocess_string(text)
    bow = dictionary.doc2bow(tokens)
    topic_dist = np.zeros(num_topics)
    for topic_id, prob in lda_model.get_document_topics(bow):
        topic_dist[topic_id] = prob
    return topic_dist

In [20]:
from itertools import combinations
from nltk.tokenize import sent_tokenize

def klsum_greedy(text, lda_model, dictionary, num_topics=30, max_sentences=10):
    sentences = sent_tokenize(text)
    pd_topic = get_topic_distribution(text, lda_model, dictionary, num_topics)
    
    selected = []
    remaining = sentences.copy()
    remaining = [s.strip() for s in sentences]
    
    for _ in range(max_sentences):
        best_sent = None
        best_score = float('inf')
        
        for sent in remaining:
            candidate = ' '.join(selected + [sent])
            ps_topic = get_topic_distribution(candidate, lda_model, dictionary, num_topics)
            score = entropy(ps_topic, pd_topic)
            if score < best_score:
                best_score = score
                best_sent = sent
        
        if best_sent:
            selected.append(best_sent)
            remaining.remove(best_sent)

    return ' '.join(selected)

In [None]:
from tqdm import tqdm
from rouge import Rouge

rouge = Rouge()

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for doc_text, gold_summary in tqdm(zip(documents, gold_summaries), total=len(documents)):
    try:
        sentences = sent_tokenize(doc_text)
        num_sentences = len(sentences)
        max_sentences = ceil(num_sentences * 0.25)
        pred_summary = klsum_greedy(doc_text, lda_model, dictionary, num_topics=30, max_sentences=max_sentences)

        pred_summary_clean = pred_summary.strip().replace('\n', ' ')
        gold_summary_clean = gold_summary.strip().replace('\n', ' ')

        results = rouge.get_scores(pred_summary_clean, gold_summary_clean)[0]

        rouge1_scores.append(results['rouge-1']['f'])
        rouge2_scores.append(results['rouge-2']['f'])
        rougeL_scores.append(results['rouge-l']['f'])

    except Exception as e:
        print(f"[SKIP] error: {e}")
        continue

avg_r1 = sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0
avg_r2 = sum(rouge2_scores) / len(rouge2_scores) if rouge2_scores else 0
avg_rl = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0

print("\nKL-Sum (Topic distribution) - average ROUGE score:")
print(f"ROUGE-1     average: {avg_r1:.4f}")
print(f"ROUGE-2     average: {avg_r2:.4f}")
print(f"ROUGE-L     average: {avg_rl:.4f}")

100%|██████████| 301/301 [01:50<00:00,  2.72it/s]


KL-Sum (Topic distribution) - average ROUGE score:
ROUGE-1     average: 0.4595
ROUGE-2     average: 0.3584
ROUGE-L     average: 0.4590





Word distribution Rouge score:

ROUGE-1 F1: 0.6064
ROUGE-2 F1: 0.5027
ROUGE-L F1: 0.6060

Topic distribution Rouge score:

ROUGE-1     average: 0.4595
ROUGE-2     average: 0.3584
ROUGE-L     average: 0.4590

Word distribution-based summarization tends to have higher ROUGE scores due to more direct word overlap with the reference summaries.

Topic distribution-based summarization focuses on capturing key ideas or themes, which may result in lower surface-level word overlap and thus lower ROUGE scores.

Since ROUGE evaluates lexical overlap, so the score of topic-based summaries may have less than word-based summaries