<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/5_RAG_System%2C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keybert
!pip install feedparser
!pip install pymupdf
!pip install requests

In [None]:
from keybert import KeyBERT
import feedparser
import urllib.parse
import requests
import fitz  # PyMuPDF
import os

In [None]:
# Initialize the keyword extractor once as a notebook-level global.
# extract_topics() reads this `kw_model` name at call time, so this cell must
# have been run before any topic extraction is attempted.
kw_model = KeyBERT()

In [None]:
def extract_topics(text, num_topics=3):
    """Return the highest-ranked keyphrases of ``text`` as plain strings.

    The work is delegated to the notebook-level ``kw_model``, which is asked
    for the ``num_topics`` best phrases of one or two words with English stop
    words removed.

    Args:
        text: The document to extract keyphrases from.
        num_topics: How many keyphrases to request (passed as ``top_n``).

    Returns:
        A list holding the first element of every item returned by
        ``kw_model.extract_keywords`` (the phrase itself), in the order the
        model returned them.
    """
    ranked = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=num_topics,
    )

    phrases = []
    for item in ranked:
        # Each item is indexable; position 0 carries the phrase text.
        phrases.append(item[0])
    return phrases

def _arxiv_pdf_url(abs_url):
    """Derive the PDF URL from an arXiv abstract URL (``.../abs/<id>``)."""
    # Swap only the first "/abs/" path segment. A blanket replace('abs', 'pdf')
    # would also rewrite any other "abs" substring that happens to be in the URL.
    return abs_url.replace('/abs/', '/pdf/', 1) + '.pdf'


def get_top_arxiv_papers(topic, max_results=2):
    """Query the arXiv export API for papers matching ``topic``.

    The topic is percent-encoded and searched across all fields
    (``search_query=all:<topic>``), sorted by relevance in descending order.

    Args:
        topic: Free-text search term, e.g. a keyphrase from ``extract_topics``.
        max_results: Maximum number of entries to request from the API.

    Returns:
        A list of dicts, one per feed entry, with the keys ``title``,
        ``authors`` (list of author names), ``summary``, ``published``,
        ``link`` and ``pdf_url``. The list is empty when the parsed feed has
        no entries.
    """
    base_url = "http://export.arxiv.org/api/query?"
    encoded_topic = urllib.parse.quote(topic)
    query = f"search_query=all:{encoded_topic}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
    feed = feedparser.parse(base_url + query)

    papers = []
    for entry in feed.entries:
        paper = {
            "title": entry.title,
            # An entry without author data yields an empty list instead of
            # aborting the whole result set with AttributeError.
            "authors": [author.name for author in entry.get("authors", [])],
            "summary": entry.summary,
            "published": entry.published,
            "link": entry.link,
            "pdf_url": _arxiv_pdf_url(entry.id),
        }
        papers.append(paper)

    return papers

def download_pdf(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; the file is opened in binary write mode
            and overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        True if the server answered with HTTP 200 and the body was written,
        False if the status code was anything else or the request itself
        failed (connection error, timeout, invalid URL, ...).

    Raises:
        OSError: If ``save_path`` cannot be opened or written.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report transport failures through the boolean result so callers can
        # handle them in the same branch as a non-200 response.
        return False

    if response.status_code != 200:
        return False

    with open(save_path, 'wb') as f:
        f.write(response.content)
    return True

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined in page order, with leading and trailing
        whitespace stripped. If opening or reading the document raises, the
        exception is not propagated; instead the string
        ``"[Error extracting text: <exception>]"`` is returned.
    """
    try:
        # The context manager closes the document on every exit path,
        # including when a page raises part-way through extraction.
        with fitz.open(pdf_path) as doc:
            # join builds the result in one pass instead of re-copying the
            # accumulated string for every page.
            full_text = "".join(page.get_text() for page in doc)
        return full_text.strip()
    except Exception as e:
        return f"[Error extracting text: {e}]"

Download relevant arXiv papers

In [None]:
# Sample input for the pipeline: a multi-line text from which search topics are
# derived. The continuation lines keep their leading indentation because they
# are part of the string literal itself.
input_text = """The central question in this election is really what kind of country we want to be and what kind of future we 'll build together
              Today is my granddaughter 's second birthday
              I think about this a lot
              we have to build an economy that works for everyone , not just those at the top
              we need new jobs , good jobs , with rising incomes
              I want us to invest in you
              I want us to invest in your future
              jobs in infrastructure , in advanced manufacturing , innovation and technology , clean , renewable energy , and small business
              most of the new jobs will come from small business
              We also have to make the economy fairer
              That starts with raising the national minimum wage and also guarantee , finally , equal pay for women 's work
              I also want to see more companies do profit-sharing"""

# Step 1: extract keyphrases (default: 3) to use as arXiv search terms.
topics = extract_topics(input_text)
print(f"Detected topics: {topics}")

# Step 2: for every topic, query arXiv, print each paper's metadata, then
# download its PDF and preview the extracted text.
for topic in topics:
    print(f"\nTop papers for topic: {topic}")
    papers = get_top_arxiv_papers(topic)

    # Papers are numbered from 1 within each topic.
    for i, paper in enumerate(papers, 1):
        print(f"\n--- Paper {i} ---")
        print(f"Title: {paper['title']}")
        print(f"Authors: {', '.join(paper['authors'])}")
        print(f"Published: {paper['published']}")
        print(f"Link: {paper['link']}")
        # Only the first 300 characters of the abstract are shown.
        print(f"Summary: {paper['summary'][:300]}...")

        # Download PDF
        # The relative file name combines the per-topic paper index with the
        # topic (spaces replaced by underscores), so files from different
        # topics do not overwrite each other.
        filename = f"paper_{i}_{topic.replace(' ', '_')}.pdf"
        if download_pdf(paper['pdf_url'], filename):
            print(f"PDF downloaded: {filename}")
            # extract_text_from_pdf returns an "[Error extracting text: ...]"
            # string instead of raising, so the preview below may show that
            # message rather than paper content.
            text = extract_text_from_pdf(filename)
            print(f"\nExtracted Text Preview:\n{text[:1000]}...\n")
            # Cleanup is currently disabled, so downloaded PDFs stay on disk.
            # os.remove(filename)  # Clean up
        else:
            print("⚠️ Failed to download PDF.")
