In [1]:
#install Dependencies
!pip install requests beautifulsoup4 transformers gtts scholarly PyPDF2

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting arrow (from scholarly)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting bibtexparser (from scholarly)
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fake-useragent (from scholarly)
  Downloading fake_useragent-2.1.0-py3-none-any.whl.metadata (17 kB)
Collecting free-proxy (from scholarly)
  Downloading free_proxy-1.1.3.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv (from scholarly)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting selenium (from scholar

In [2]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from gtts import gTTS
import re
from scholarly import scholarly
import PyPDF2
import time
import os

# 1. Paper Search and Discovery Agent (using arXiv)
def search_papers(query, num_results=5, filter_relevance=False, filter_recency=False, filter_year=None, filter_author=None, filter_journal=None):
    results = []

    base_url = f"https://arxiv.org/search/?query={query}&searchtype=all&abstracts=show&order=-announced_date_first"

    if filter_recency:
        base_url += "&order=-announced_date_first"
    if filter_year:
        base_url += f"&date-year={filter_year}"
    if filter_author:
        base_url += f"&author={filter_author}"
    if filter_journal:
        base_url += f"&journal={filter_journal}"

    try:
        response = requests.get(base_url, timeout=10)
        if response.status_code != 200:
            print(f"Error fetching data from arXiv. Status code: {response.status_code}")
            return []
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from arXiv: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    for entry in soup.find_all('li', class_='arxiv-result'):
        title = entry.find('p', class_='title').text.strip()
        link = entry.find('a')['href']
        summary = entry.find('p', class_='abstract').text.strip()
        results.append({"title": title, "link": f"https://arxiv.org{link}", "summary": summary})

    return results[:num_results]

# 2. Topic Classification Agent (using Hugging Face zero-shot classification)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify_topic(text, candidate_labels):
    result = classifier(text, candidate_labels)
    return result

# 3. Summary Generation Agent (using Hugging Face transformers for summarization)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text, max_length=200, min_length=50):
    max_chunk_length = 1024
    chunk_size = max_chunk_length - 50
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    full_summary = " ".join(summaries)
    return full_summary

# 4. Cross-paper Synthesis Agent (combining multiple paper summaries)
def cross_paper_synthesis(papers_summaries):
    combined_text = " ".join(papers_summaries)
    synthesis = summarize_text(combined_text)
    return synthesis

# 5. Audio Generation Agent (using gTTS to generate an audio podcast)
def text_to_audio(text, output_filename):
    tts = gTTS(text)
    tts.save(output_filename)

# 6. Citation Extraction Agent (using regex to find DOI)
def extract_doi(paper):
    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?:[A-Z0-9])\b'
    match = re.search(doi_pattern, paper['link'])
    if match:
        return match.group(0)
    match = re.search(doi_pattern, paper['summary'])
    return match.group(0) if match else None

# 7. Paper Processing from File Uploads (handling PDF)
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

# 8. Multi-Agent System: Coordinating all agents in a single workflow
def multi_agent_system(query, file_path=None, topics=["Deep Learning", "Artificial Intelligence", "IoT"], filter_relevance=False, filter_recency=False, filter_year=None, filter_author=None, filter_journal=None):
    # Paper search and discovery agent
    papers = search_papers(query, filter_relevance=filter_relevance, filter_recency=filter_recency, filter_year=filter_year, filter_author=filter_author, filter_journal=filter_journal)

    # If no papers were found, return early
    if not papers:
        print("No papers found. Exiting the system.")
        return {"error": "No papers found."}

    # If file path is provided, process it (PDF, academic papers)
    text_from_pdf = ""
    if file_path:
        text_from_pdf = extract_text_from_pdf(file_path)
    else:
        text_from_pdf = " ".join([paper['summary'] for paper in papers])

    # Topic classification agent
    topic_result = classify_topic(text_from_pdf, topics)

    # Summary generation agent
    summarized_papers = [summarize_text(paper['summary']) for paper in papers]

    # Cross-paper synthesis agent
    cross_synthesis = cross_paper_synthesis([paper['summary'] for paper in papers])

    # Audio generation agent
    audio_file_path = 'final_podcast.mp3'
    text_to_audio(cross_synthesis, audio_file_path)

    # DOI extraction from the first paper's summary (if available)
    doi = extract_doi(papers[0]) if papers else None

    return {
        "papers": papers,
        "topic_result": topic_result,
        "summarized_papers": summarized_papers,  # Add the summarized papers here
        "cross_synthesis": cross_synthesis,
        "audio_file": audio_file_path,
        "doi": doi
    }

# 9. User Interaction and Flow
def user_interaction():
    # Ask user for a topic
    query = input("Enter the topic you want to search for: ")

    # Ask if the user has a paper to upload
    upload = input("Do you have a PDF to upload? (yes/no): ").strip().lower()
    file_path = None
    if upload == "yes":
        file_path = input("Enter the full file path of the PDF: ").strip()

    # Ask for filtering options
    filter_relevance = input("Filter by relevance? (yes/no): ").strip().lower() == 'yes'
    filter_recency = input("Filter by recency? (yes/no): ").strip().lower() == 'yes'
    filter_year = input("Enter year for filter (or press enter to skip): ").strip()
    filter_author = input("Enter author for filter (or press enter to skip): ").strip()
    filter_journal = input("Enter journal for filter (or press enter to skip): ").strip()

    # Ask for topic list from the user
    topics_input = input("Enter a comma-separated list of topics (e.g., Deep Learning, Artificial Intelligence, IoT): ").strip()
    topics = [topic.strip() for topic in topics_input.split(',')]

    # Run the multi-agent system
    result = multi_agent_system(query, file_path=file_path, topics=topics, filter_relevance=filter_relevance, filter_recency=filter_recency, filter_year=filter_year, filter_author=filter_author, filter_journal=filter_journal)

    # Check if there is an error
    if "error" in result:
        print(result["error"])
    else:
        # Output results
        print("Papers:")
        for paper in result['papers']:
            print(f"Title: {paper['title']}")
            print(f"Link: {paper['link']}")
            print(f"Summary: {paper['summary'][:200]}...")

        print("\nTopic Classification Result:")
        print(result['topic_result'])

        print("\nSummarized Papers:")
        for i, summary in enumerate(result['summarized_papers']):
            print(f"Summary {i+1}: {summary[:200]}...")

        print("\nCross-paper synthesis:")
        print(result['cross_synthesis'][:200])

        print(f"\nAudio File Path: {result['audio_file']}")

        if result['doi']:
            print(f"\nDOI from the first paper: {result['doi']}")
        else:
            print("\nNo DOI found for the first paper.")

# Run the user interaction flow
if __name__ == "__main__":
    user_interaction()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


Enter the topic you want to search for: Agriculture
Do you have a PDF to upload? (yes/no): yes
Enter the full file path of the PDF: /content/Farm_Intrusion_Detection_System_using_IoT.pdf
Filter by relevance? (yes/no): yes
Filter by recency? (yes/no): yes
Enter year for filter (or press enter to skip): 2024
Enter author for filter (or press enter to skip): 
Enter journal for filter (or press enter to skip): 
Enter a comma-separated list of topics (e.g., Deep Learning, Artificial Intelligence, IoT): Artificial Intelligence, Deep Learning, IoT and Explainable AI


Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 200, but your input_length is only 183. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)
Your max_length is set to 200, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Yo

Papers:
Title: Financial resilience of agricultural and food production companies in Spain: A compositional cluster analysis of the impact of the Ukraine-Russia war (2021-2023)
Link: https://arxiv.orghttps://arxiv.org/abs/2504.05912
Summary: Abstract:
      
        This study analyzes the financial resilience of agricultural and food production companies in Spain amid the Ukraine-Russia war using cluster analysis based on financial ratio...
Title: Graph Neural Networks for Enhancing Ensemble Forecasts of Extreme Rainfall
Link: https://arxiv.orghttps://arxiv.org/abs/2504.05471
Summary: Abstract:
      
        Climate change is increasing the occurrence of extreme precipitation events, threatening infrastructure, agriculture, and public safety. Ensemble prediction systems provide pr...
Title: Climate adaptation of millet and sorghum varieties in North-Eastern Senegal: cross-referencing rainfall, thermal and phenological parameters
Link: https://arxiv.orghttps://arxiv.org/abs/2504.04965