In [54]:
!pip install arxiv
!pip install markdownify
!pip install pymupdf
!pip install pdfplumber
!pip install reportlab

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m929.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pypdfium2>=4.18.0
  Downloading pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pdfminer.six==20250506
  Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.1
Collecting reportlab
  Downloading reportlab-4.4.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [3

In [3]:
import subprocess
import networkx as nx

# -----------------------------
# Local LLM query function (Ollama)
# -----------------------------
def query_local_llm(prompt, model="llama3"):
    command = f'echo "{prompt}" | ollama run {model}'
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return result.stdout.decode()

In [5]:
prompt = """Give me a simple python code"""
query_local_llm(prompt)

'Here is a very simple Python program that prints "Hello, World!" to the screen:\n```\nprint("Hello, World!")\n```\nThis is often referred to as the "Hello World" program because it\'s a classic example of a first program in many programming languages.\n\nIf you want to run this code, just save it to a file with a `.py` extension (e.g. `hello.py`) and then run it using Python (e.g. `python hello.py`). The output will be:\n```\nHello, World!\n```\nLet me know if you have any questions or if you\'d like to see more examples!\n\n'

In [10]:
import arxiv

def get_arxiv_papers(query, max_results=5):
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    client = arxiv.Client()
    papers = []
    for result in client.results(search):
        papers.append({
            "title": result.title,
            "abstract": result.summary,
            "url": result.pdf_url
        })
    return papers


In [12]:
query_topic = """Topological Machine Learning"""
print(f"🔎 Retrieving papers on: {query_topic}\n")
papers = get_arxiv_papers(query_topic, max_results=3)
papers

🔎 Retrieving papers on: Topological Machine Learning



[{'title': 'How to Organize your Deep Reinforcement Learning Agents: The Importance of Communication Topology',
  'abstract': 'In this empirical paper, we investigate how learning agents can be arranged\nin more efficient communication topologies for improved learning. This is an\nimportant problem because a common technique to improve speed and robustness of\nlearning in deep reinforcement learning and many other machine learning\nalgorithms is to run multiple learning agents in parallel. The standard\ncommunication architecture typically involves all agents intermittently\ncommunicating with each other (fully connected topology) or with a centralized\nserver (star topology). Unfortunately, optimizing the topology of communication\nover the space of all possible graphs is a hard problem, so we borrow results\nfrom the networked optimization and collective intelligence literatures which\nsuggest that certain families of network topologies can lead to strong\nimprovements over fully-con

In [65]:
import fitz
pdf_path = "./tmp/paper_2103.08134v1.pdf"
doc = fitz.open(pdf_path)
full_text = ""
for page in doc:
    full_text += page.get_text()
doc.close()
full_text
full_text_arr = full_text.split()
len(full_text_arr)



9257

In [83]:
import pdfplumber
import re
from collections import defaultdict

ROMANS = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
          'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX']

# === Step 1: Extract all lines with font sizes ===
def extract_lines_with_fonts(pdf_path):
    lines = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            words = page.extract_words(extra_attrs=['size'])
            grouped = defaultdict(list)
            for w in words:
                grouped[round(w['top'], 1)].append(w)

            for line_top in sorted(grouped):
                line_words = grouped[line_top]
                text = ' '.join(w['text'] for w in line_words)
                sizes = [round(w['size'], 2) for w in line_words]
                common_size = max(set(sizes), key=sizes.count)  # most frequent size in the line
                lines.append({
                    "page": page_num,
                    "text": text.strip(),
                    "font_size": common_size
                })
    return lines

# === Step 2: Detect 'Introduction' and extract prefix + font size ===
def detect_intro_prefix_and_font(lines):
    for line in lines:
        match = re.match(r"^(\S+)?\.?\s*Introduction", line["text"], re.IGNORECASE)
        if match:
            print(line)
            prefix = match.group(1).upper()
            print(f"✅ Found 'Introduction': {line['text']} (Page {line['page']}) Font: {line['font_size']}")
            return prefix, line['font_size'], line['text'], line['page']
    return None, None, None, None

# === Step 3: Classify prefix type ===
def classify_prefix_type(prefix):
    if not prefix:
        return None
    prefix = prefix.strip(".").upper()
    if prefix.isdigit():
        return "numeric"
    elif prefix in ROMANS:
        return "roman"
    elif re.match(r"^[A-Z]$", prefix):
        return "alpha"
    return None

# === Step 4: Strict matching with same prefix type and font ===
def extract_strict_headings(lines, prefix_type, intro_prefix, intro_font_size):
    headings = []
    intro_prefix = intro_prefix.strip(".").upper()
    font_tol = 0.4  # allow for small float difference in font size

    if prefix_type == "numeric":
        current = int(intro_prefix)
        next_expected = current + 1
        while True:
            pattern = re.compile(rf"^{next_expected}\.?\s+[A-Za-z].+")
            found = False
            for line in lines:
                if pattern.match(line["text"]) and abs(line["font_size"] - intro_font_size) <= font_tol:
                    headings.append({"page": line["page"], "heading": line["text"]})
                    next_expected += 1
                    found = True
                    break
            if not found:
                break

    elif prefix_type == "roman":
        current_idx = ROMANS.index(intro_prefix)
        for next_roman in ROMANS[current_idx + 1:]:
            pattern = re.compile(rf"^{next_roman}\.?\s+[A-Za-z].+")
            for line in lines:
                if pattern.match(line["text"]) and abs(line["font_size"] - intro_font_size) <= font_tol:
                    headings.append({"page": line["page"], "heading": line["text"]})
                    break

    elif prefix_type == "alpha":
        current = ord(intro_prefix)
        for i in range(current + 1, ord('Z') + 1):
            next_char = chr(i)
            pattern = re.compile(rf"^{next_char}\.?\s+[A-Za-z].+")
            for line in lines:
                if pattern.match(line["text"]) and abs(line["font_size"] - intro_font_size) <= font_tol:
                    headings.append({"page": line["page"], "heading": line["text"]})
                    break

    return headings

# === Step 5: Main runner ===
def extract_headings_with_fontmatch(pdf_path):
    lines = extract_lines_with_fonts(pdf_path)
    prefix, font_size, intro_text, intro_page = detect_intro_prefix_and_font(lines)

    if not prefix:
        print("❌ Could not find 'Introduction' heading.")
        return []

    prefix_type = classify_prefix_type(prefix)
    if not prefix_type:
        print(f"❌ Could not classify prefix '{prefix}'")
        return []

    print(f"📌 Detected prefix type: {prefix_type.upper()} | Font size: {font_size}")

    headings = [{"page": intro_page, "heading": intro_text}]
    headings += extract_strict_headings(lines, prefix_type, prefix, font_size)

    return headings

# === Example usage ===
pdf_path = "./tmp/paper_1808.08210v3.pdf"
headings = extract_headings_with_fontmatch(pdf_path)

# === Print results ===
if not headings:
    print("⚠️ No section headings found.")
else:
    print("\n📄 Strict section headings (font & pattern matched):\n")
    for h in headings:
        print(f"Page {h['page']:>3} | {h['heading']}")


❌ Could not find 'Introduction' heading.
⚠️ No section headings found.


In [62]:
import os
import fitz 
import pdfplumber
from collections import Counter
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Preformatted



def extract_header_fontsize_from_pdf(pdf_path):
    font_size_counter = Counter()

    with pdfplumber.open(pdf_path) as pdf:
        for i in range(len(pdf.pages)):
            # lines1 = pdf.pages[i].extract_text().split('\n')
            words = pdf.pages[i].extract_words(extra_attrs=['fontname', 'size'])
            lines = {}

            for word in words:
                line_num = word['top']
                if line_num not in lines:
                    lines[line_num] = []
                lines[line_num].append(word)

            for line_words in lines.values():
                font_size_counter[line_words[0]['size']] += 1

    # Find the font sizes that were used more than once
    repeated_sizes = [size for size, count in font_size_counter.items() if count > 3]
    print(repeated_sizes)
    # Return the highest font size among the repeated sizes
    if repeated_sizes:
        return max(repeated_sizes)
    else:
        return None


def extract_lines_with_font_size(pdf_path, target_font_size):
    lines_with_target_font_size = []

    with pdfplumber.open(pdf_path) as pdf:
        for i in range(len(pdf.pages)):
            words = pdf.pages[i].extract_words(extra_attrs=['fontname', 'size'])
            lines = {}

            for word in words:
                line_num = word['top']
                if line_num not in lines:
                    lines[line_num] = []
                lines[line_num].append(word)

            for line_num, line_words in lines.items():
                line_font_sizes = [word['size'] for word in line_words]
                if target_font_size in line_font_sizes:
                    line_text = ' '.join([word['text'] for word in line_words])
                    lines_with_target_font_size.append(line_text)

    return lines_with_target_font_size

pdf1 = "./tmp/paper_1501.03755v2.pdf"
extracted_font_size = extract_header_fontsize_from_pdf(pdf1)
extracted_headers = extract_lines_with_font_size(pdf1,extracted_font_size)
print(extracted_headers)

[11.95519999999999, 9.962599999999952, 9.962600000000066, 9.962600000000009, 9.962599999999995, 10.0, 6.973799999999983, 4.981300000000033, 9.96259999999998, 6.973800000000011, 6.973799999999997]
['SCREENCONTENTIMAGESEGMENTATIONUSINGLEASTABSOLUTEDEVIATION', 'FITTING', 'DepartmentofElectricalandComputerEngineering,PolytechnicSchoolofEngineering,', 'NewYorkUniversity,NY,USA.']


In [84]:
# Install arxiv module if not installed
# pip install arxiv

import arxiv

def search_papers(topic, max_results=5):
    """
    Search arXiv for papers matching the topic.
    """
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending,
    )
    results = []
    for result in search.results():
        result.download_pdf(dirpath="./tmp", filename=f"paper_{result.get_short_id()}.pdf")
        results.append({
            "title": result.title,
            "summary": result.summary,
            "authors": [a.name for a in result.authors],
            "published": result.published.strftime('%Y-%m-%d'),
            "link": result.entry_id,
        })
    return results

def summarize_paper_with_cot(paper, model="llama3"):
    """
    Use local LLM (Ollama) to generate summary with chain-of-thought.
    """
    prompt = f"""
You are a helpful AI research assistant. Given the following paper abstract, please do:

1. Summarize the main contribution.
2. Describe what methods are used and why they might be effective.
3. Explain your reasoning step by step (chain of thought) so it is clear how you interpreted the abstract.
4. Suggest possible future directions or improvements.

### Abstract:
{paper["summary"]}

Respond in the following format:

Title: <Title>
Main Contribution: <...>
Methods Used: <...>
Chain of Thought: <...>
Future Directions: <...>
"""
    command = f'echo "{prompt}" | ollama run {model}'
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return result.stdout.decode()

def generate_report(topic, max_results=3):
    """
    Main function to generate the literature review report.
    """
    papers = search_papers(topic, max_results=max_results)
    report = f"# 📄 Literature Review on: {topic}\n\n"
    print(papers)
    for idx, paper in enumerate(papers, 1):
        report += f"## 📝 Paper {idx}: {paper['title']}\n"
        report += f"- **Authors:** {', '.join(paper['authors'])}\n"
        report += f"- **Published:** {paper['published']}\n"
        report += f"- **Link:** {paper['link']}\n\n"
        print(f"Summarizing: {paper['title']} ...")
        # summary = summarize_paper_with_cot(paper)
        # report += summary + "\n\n---\n\n"

    return report

if __name__ == "__main__":
    topic = "What are the loss functions used in CNN modelling ?"
    final_report = generate_report(topic, max_results=5)

    # Save to file
    with open("literature_review.md", "w") as f:
        f.write(final_report)

    print("\n✅ Done! Your literature review is saved as 'literature_review.md'.")


  for result in search.results():


[]

✅ Done! Your literature review is saved as 'literature_review.md'.


In [89]:
import requests
import fitz  # PyMuPDF
from typing import List, Dict

class PaperLoader:
    def __init__(self, api_key: str = None):
        self.base_url = "https://api.semanticscholar.org/graph/v1"
        self.api_key = api_key  # Optional, but increases limits

    def fetch_for_query(self, query: str, max_results: int = 3) -> List[Dict]:
        """
        Fetch relevant papers from Semantic Scholar for a given query.
        """
        headers = {}
        if self.api_key:
            headers["x-api-key"] = self.api_key
        
        url = f"{self.base_url}/paper/search"
        params = {
            "query": query,
            "limit": max_results,
            "fields": "title,url,abstract,authors,year,openAccessPdf"
        }

        response = requests.get(url, headers=headers, params=params)
        print(response)
        response.raise_for_status()
        data = response.json()
        print(data)
        results = []
        for paper in data.get("data", []):
            if paper.get("openAccessPdf") and paper["openAccessPdf"].get("url"):
                results.append(self._process_result(paper))

        return results

    def _process_result(self, paper: Dict) -> Dict:
        """
        Process a single paper entry from Semantic Scholar.
        """
        pdf_url = paper["openAccessPdf"]["url"] if paper.get("openAccessPdf") else None
        return {
            "title": paper["title"],
            "text": self._extract_text(pdf_url) if pdf_url else "",
            "metadata": {
                "authors": [a["name"] for a in paper.get("authors", [])],
                "published": paper.get("year"),
                "abstract": paper.get("abstract"),
                "url": paper.get("url")
            }
        }

    def _extract_text(self, pdf_url: str) -> str:
        """
        Download and extract text from an open-access PDF.
        """
        try:
            response = requests.get(pdf_url, timeout=20)
            response.raise_for_status()
            with fitz.open(stream=response.content, filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        except Exception as e:
            print(f"Failed to fetch PDF from {pdf_url}: {e}")
            return ""


In [96]:
loader = PaperLoader()
topic ="loss function cnn"


papers = loader.fetch_for_query(topic)[:3]
len(papers)
        

<Response [200]>
{'total': 610199, 'offset': 0, 'next': 3, 'data': [{'paperId': '376aad5815f808c82e518956a70091bf828dbd25', 'url': 'https://www.semanticscholar.org/paper/376aad5815f808c82e518956a70091bf828dbd25', 'title': 'Person Re-identification by Multi-Channel Parts-Based CNN with Improved Triplet Loss Function', 'year': 2016, 'openAccessPdf': {'url': '', 'status': 'CLOSED', 'license': None, 'disclaimer': "Notice: The following paper fields have been elided by the publisher: {'abstract'}. Paper or abstract available at https://api.unpaywall.org/v2/10.1109/CVPR.2016.149?email=<INSERT_YOUR_EMAIL> or https://doi.org/10.1109/CVPR.2016.149, which is subject to the license by the author or copyright owner provided with this content. Please go to the source to verify the license and copyright information for your use."}, 'authors': [{'authorId': '145067864', 'name': 'De Cheng'}, {'authorId': '144768792', 'name': 'Yihong Gong'}, {'authorId': '3373601', 'name': 'Sanping Zhou'}, {'authorId':

2

In [98]:
from arxiv import Search, SortCriterion
import fitz  # PyMuPDF
from typing import List, Dict

class PaperLoader:
    def fetch_for_query(self, query: str) -> List[Dict]:
        search = Search(
            query=query,
            max_results=3,
            sort_by=SortCriterion.Relevance
        )
        return [self._process_result(r) for r in search.results()]
    
    def _process_result(self, result) -> Dict:
        return {
            "title": result.title,
            "text": self._extract_text(result.pdf_url),
            "metadata": {
                "authors": [a.name for a in result.authors],
                "published": result.published
            }
        }
    
    def _extract_text(self, pdf_url: str) -> str:
        import requests
        response = requests.get(pdf_url)
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
        
loader = PaperLoader()
topic ="loss function cnn"


papers = loader.fetch_for_query(topic)[:3]
print(len(papers))
papers

  return [self._process_result(r) for r in search.results()]


3


[{'title': 'Improving Interpretability and Accuracy in Neuro-Symbolic Rule Extraction Using Class-Specific Sparse Filters',
  'metadata': {'authors': ['Parth Padalkar',
    'Jaeseong Lee',
    'Shiyi Wei',
    'Gopal Gupta'],
   'published': datetime.datetime(2025, 1, 28, 3, 22, 23, tzinfo=datetime.timezone.utc)}},
 {'title': 'Incremental Boosting Convolutional Neural Network for Facial Action Unit Recognition',
  'text': 'Incremental Boosting Convolutional Neural Network\nfor Facial Action Unit Recognition\nShizhong Han, Zibo Meng, Ahmed Shehab Khan, Yan Tong\nDepartment of Computer Science & Engineering, University of South Carolina, Columbia, SC\n{han38, mengz, akhan}@email.sc.edu, tongy@cse.sc.edu\nAbstract\nRecognizing facial action units (AUs) from spontaneous facial expressions is still\na challenging problem. Most recently, CNNs have shown promise on facial AU\nrecognition. However, the learned CNNs are often overﬁtted and do not gener-\nalize well to unseen subjects due to lim

In [18]:
# --------------------
# Install if needed
# pip install arxiv markdownify ollama
# --------------------
import arxiv
from markdownify import markdownify as md

# Optional: if using Ollama, pip install ollama

# -------------------------
# Context Manager
# -------------------------
class ContextManager:
    def __init__(self):
        self.contexts = {}

    def add(self, key, text):
        self.contexts[key] = text

    def get(self, key):
        return self.contexts.get(key, "")

    def get_full_context(self):
        return "\n\n".join([f"### {k}\n{text}" for k, text in self.contexts.items()])

# -------------------------
# Search and get abstract
# -------------------------
def search_papers(topic, max_results=1):
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    results = []
    for result in search.results():
        results.append({
            "title": result.title,
            "summary": result.summary,
            "pdf_url": result.pdf_url,
        })
    return results

# -------------------------
# Ollama helper
# -------------------------
def ask_model(prompt, model="llama3"):
    command = f'echo "{prompt}" | ollama run {model}'
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return result.stdout.decode()

# -------------------------
# Chain-of-thought MCP steps
# -------------------------
def analyze_paper_with_mcp(paper):
    context = ContextManager()

    abstract = paper["summary"]

    # Step 1: Extract methods
    prompt1 = f"""
Given the following abstract:

{abstract}

Extract only the METHODS used (e.g., model architectures, algorithms, datasets, evaluation protocols).
"""
    methods = ask_model(prompt1)
    context.add("Methods", methods)

    # Step 2: Extract results & conclusions
    prompt2 = f"""
Given the abstract and methods:

Abstract: {abstract}

Methods: {methods}

Summarize the main RESULTS and CONCLUSIONS.
"""
    results = ask_model(prompt2)
    context.add("Results", results)

    # Step 3: Analyze limitations and gaps
    full_context = context.get_full_context()
    prompt3 = f"""
Based on the following context:

{full_context}

Analyze and list possible LIMITATIONS and potential GAPS or missing areas in this research.
"""
    gaps = ask_model(prompt3)
    context.add("Gaps", gaps)

    # Step 4: Suggest new experiments
    full_context = context.get_full_context()
    prompt4 = f"""
Based on all the above context:

{full_context}

Propose NEW EXPERIMENTS or RESEARCH DIRECTIONS that could build upon this work. Explain why they could be valuable.
"""
    suggestions = ask_model(prompt4)
    context.add("Suggestions", suggestions)

    return context

# -------------------------
# Save to markdown
# -------------------------
def save_markdown(context, title):
    md_text = md(context.get_full_context())
    with open("literature_review_mcp.md", "w") as f:
        f.write(f"# {title}\n\n")
        f.write(md_text)

# -------------------------
# Main run
# -------------------------
if __name__ == "__main__":
    topic = "Diffusion models for medical image segmentation"
    papers = search_papers(topic, max_results=1)

    if papers:
        paper = papers[0]
        context = analyze_paper_with_mcp(paper)
        save_markdown(context, paper["title"])
        print("✅ Literature review with MCP completed! Check literature_review_mcp.md")
    else:
        print("❌ No papers found.")


  for result in search.results():


✅ Literature review with MCP completed! Check literature_review_mcp.md


In [103]:
import subprocess

def rewrite_query_with_llm(user_query):
    prompt = f"""
    You are an AI assistant for academic search.
    ==============================================
    Rewrite the following query into 3–5 concise academic search strings
    optimized for finding relevant research papers in scientific databases.
    Expand acronyms, include common synonyms and dont include any bollean expressions like (AND, OR).
    ==============================================
    Query: {user_query}
    ==============================================
    Output should be an array of strings and nothing else.
    """
    result = subprocess.run(
        ["ollama", "run", "llama3.1:8b-instruct-q5_K_M"],
        input=prompt.encode(),
        capture_output=True
    )
    output = result.stdout.decode().strip()
    queries = [q.strip() for q in output.split("\n") if q.strip()]
    return queries

# Example usage
queries = rewrite_query_with_llm("What are the loss functions used in CNN modelling?")
print("Generated Search Queries:")

loader = PaperLoader()

for q in queries:
    print("-", q)


    papers = loader.fetch_for_query(q)
    print(papers)

Generated Search Queries:
- ["loss functions in convolutional neural networks",


  return [self._process_result(r) for r in search.results()]


[]
- "optimization metrics for cnn models",
[]
- "performance evaluation criteria for deep learning architectures",
[]
- "convolutional neural network loss functions",
[]
- "cnn model evaluation metrics"]
[]


In [121]:
import requests
import fitz  # PyMuPDF
from scholarly import scholarly
from arxiv import Search, SortCriterion
from typing import List, Dict

class MultiSourcePaperLoader:
    def __init__(self, semantic_api_key: str = None):
        self.semantic_base_url = "https://api.semanticscholar.org/graph/v1"
        self.semantic_api_key = semantic_api_key

    from urllib.parse import urlparse

    def get_pdf_url(self, paper_url: str) -> str:
        """
        Convert a paper HTML page URL to a direct PDF URL for known publishers.
        Supports:
        - PMLR Proceedings (proceedings.mlr.press)
        - arXiv (arxiv.org)
        - CVF Open Access (openaccess.thecvf.com)
        """
        if "proceedings.mlr.press" in paper_url:
            # Extract the volume and paper_id
            parts = paper_url.rstrip("/").split("/")
            volume = parts[-2]   # e.g., v119
            paper_id_html = parts[-1]  # e.g., wang20t.html
            paper_id = paper_id_html.replace(".html", "")
            return f"https://proceedings.mlr.press/{volume}/{paper_id}/{paper_id}.pdf"
        
        elif "arxiv.org/abs/" in paper_url:
            # Example: https://arxiv.org/abs/1706.01061 -> /pdf/1706.01061.pdf
            return paper_url.replace("abs", "pdf") + ".pdf"
        
        elif "openaccess.thecvf.com" in paper_url:
            # Example:
            # https://openaccess.thecvf.com/content_cvpr_2016/html/Cheng_...html
            # -> https://openaccess.thecvf.com/content_cvpr_2016/papers/Cheng_...pdf
            return paper_url.replace(".html", ".pdf").replace("/html/", "/papers/")
        
        else:
            return None

    # ---------------- SEMANTIC SCHOLAR ----------------
    def fetch_from_semantic_scholar(self, query: str, max_results=3) -> List[Dict]:
        headers = {}
        if self.semantic_api_key:
            headers["x-api-key"] = self.semantic_api_key
        
        url = f"{self.semantic_base_url}/paper/search"
        params = {
            "query": query,
            "limit": max_results,
            "fields": "title,url,abstract,authors,year,openAccessPdf"
        }

        try:
            res = requests.get(url, headers=headers, params=params, timeout=15)
            res.raise_for_status()
            data = res.json()
            results = []
            for paper in data.get("data", []):
                pdf_url = paper.get("openAccessPdf", {}).get("url")
                results.append({
                    "title": paper["title"],
                    "abstract": paper.get("abstract"),
                    "authors": [a["name"] for a in paper.get("authors", [])],
                    "year": paper.get("year"),
                    "url": paper.get("url"),
                    "pdf_url": pdf_url
                })
            return results
        except Exception as e:
            print(f"[Semantic Scholar Error] {e}")
            return []

    # ---------------- ARXIV ----------------
    def fetch_from_arxiv(self, query: str, max_results=3) -> List[Dict]:
        try:
            search = Search(query=query, max_results=max_results, sort_by=SortCriterion.Relevance)
            results = []
            for r in search.results():
                results.append({
                    "title": r.title,
                    "abstract": r.summary,
                    "authors": [a.name for a in r.authors],
                    "year": r.published.year,
                    "url": r.entry_id,
                    "pdf_url": r.pdf_url
                })
            return results
        except Exception as e:
            print(f"[arXiv Error] {e}")
            return []

    # ---------------- GOOGLE SCHOLAR ----------------
    def fetch_from_google_scholar(self, query: str, max_results=3) -> List[Dict]:
        try:
            search_query = scholarly.search_pubs(query)
            results = []
            for i, paper in enumerate(search_query):
                if i >= max_results:
                    break

                pdf_url = self.get_pdf_url(paper.get("pub_url", ""))
                if not pdf_url:  # Skip if PDF URL can't be determined
                    continue
                results.append({
                    "title": paper.get("bib", {}).get("title"),
                    "abstract": paper.get("bib", {}).get("abstract"),
                    "authors": paper.get("bib", {}).get("author"),
                    "year": paper.get("bib", {}).get("pub_year"),
                    "url": paper.get("pub_url"),
                    "pdf_url": pdf_url  # Needs manual check
                })

            
            return results
        except Exception as e:
            print(f"[Google Scholar Error] {e}")
            return []

    # ---------------- PDF TEXT EXTRACTION ----------------
    def _extract_text_from_pdf(self, pdf_url: str) -> str:
        try:
            res = requests.get(pdf_url, timeout=20)
            res.raise_for_status()
            with fitz.open(stream=res.content, filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        except Exception as e:
            print(f"[PDF Extraction Error] {e}")
            return ""

    # ---------------- MASTER FETCH METHOD ----------------
    def fetch_papers(self, query: str, max_results=3) -> List[Dict]:
        results = []

        # Try Semantic Scholar
        results.extend(self.fetch_from_semantic_scholar(query, max_results))
        # If insufficient, try arXiv
        # print(len(results))
        if len(results) < max_results:
            results.extend(self.fetch_from_arxiv(query, max_results))
        # print(len(results))
        
        # If still insufficient, try Google Scholar
        if len(results) < max_results:
            results.extend(self.fetch_from_google_scholar(query, max_results))


        # Remove duplicates based on title
        seen_titles = set()
        unique_results = []
        for paper in results:
            if paper["title"] and paper["title"].lower() not in seen_titles:
                seen_titles.add(paper["title"].lower())
                unique_results.append(paper)

        # Fetch PDF text if available
        for paper in unique_results:
            if paper.get("pdf_url"):
                paper["full_text"] = self._extract_text_from_pdf(paper["pdf_url"])
            else:
                paper["full_text"] = ""

        return unique_results

# ---------------- Example usage ----------------
if __name__ == "__main__":
    loader = MultiSourcePaperLoader(semantic_api_key=None)  # Add key if available
    papers = loader.fetch_papers("What are the loss functions used in CNN modelling for face detection?", max_results=10)

    for p in papers:
        print(f"Title: {p['title']}")
        print(f"Authors: {p['authors']}")
        print(f"Year: {p['year']}")
        print(f"URL: {p['url']}")
        print(f"Abstract: {p['abstract'][:200] if p['abstract'] else 'N/A'}\n")
        print(f"Full Text: {p['full_text'][:100]}")


4
Title: Face r-cnn
Authors: ['H Wang', 'Z Li', 'X Ji', 'Y Wang']
Year: 2017
URL: https://arxiv.org/abs/1706.01061
Abstract: property of the face detection task, we improve the Faster R-CNN framework in  loss function  called center loss [33], we design a new multi-task loss function in the Fast R-CNN model to

Full Text: Face R-CNN
Hao Wang
Zhifeng Li∗Xing Ji
Yitong Wang
Tencent AI Lab, China
{hawelwang,michaelzfli,deni
Title: Loss function search for face recognition
Authors: ['X Wang', 'S Wang', 'C Chi', 'S Zhang']
Year: 2020
URL: https://proceedings.mlr.press/v119/wang20t.html
Abstract: target of our loss function search is to maximize the model Mw’ the model Mw is obtained by  minimizing the following search  the performance of face identification and the ROC curves to

Full Text: Loss Function Search for Face Recognition
Xiaobo Wang * 1 Shuo Wang * 1 Cheng Chi 2 Shifeng Zhang 2 
Title: Person re-identification by multi-channel parts-based cnn with improved triplet loss function
A

PDF download failed: 404 Client Error: Not Found for url: https://arxiv.org/pdf/Innovative%20deep%20learning%20architectures%20for%20medical%20image%20diagnosis:%20a%20comprehensive%20review%20of%20convolutional,%20recurrent,%20and%20transformer%20models
Title: Innovative deep learning architectures for medical image diagnosis: a comprehensive review of convolutional, recurrent, and transformer models
Text length: 180 chars
