In [32]:
from pypdf import PdfReader
import requests
from io import BytesIO
import serpapi
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_core.documents import Document as LangchainDocument
from metapub import FindIt
import requests
import xml.etree.ElementTree as ET

from ftplib import FTP
from urllib.parse import urlparse
from io import BytesIO 

from langchain_community.retrievers import ArxivRetriever

import arxiv
import requests
from io import BytesIO
from pypdf import PdfReader
import re

from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings 
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

import re
from typing import List, Dict, Tuple



## Extract text from refs

### get a pdf from the url

In [33]:
def parse_pdf_file(path:str) -> str:

    if path.startswith("http://") or path.startswith("https://") or path.startswith("ftp://"):
        response = requests.get(path)
        response.raise_for_status()  # Ensure download succeeded
        reader = PdfReader(BytesIO(response.content))
    else:
        reader = PdfReader(path)

    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    
    return text


### arxiv

In [34]:
def get_paper_from_arxiv_id(doi: str):
    """
    Retrieve paper from arXiv using its arXiv ID.
    """
    client = arxiv.Client()
    search = arxiv.Search(query=doi, max_results=1)
    results = client.results(search)
    pdf_url = next(results).pdf_url
    text = parse_pdf_file(pdf_url)
    return text

arxiv_id = "2511.16868"
get_paper_from_arxiv_id(arxiv_id)

2025-11-29 23:37:06 MacBook-Air-3.local arxiv[49765] INFO Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=2511.16868&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-11-29 23:37:07 MacBook-Air-3.local arxiv[49765] INFO Got first page: 1 of 1 total results


'The Joint Gromov Wasserstein Objective for Multiple Object\nMatching\nAryan Tajmir Riahi1, and Khanh Dao Duc 1,2,∗\n1 Department of Computer Science, University of British Columbia, Vancouver, BC V6T 1Z4, Canada\n2 Department of Mathematics, University of British Columbia, Vancouver, BC V6T 1Z4, Canada\nAbstract\nThe Gromov-Wasserstein (GW) distance serves as a powerful tool for matching objects in metric\nspaces. However, its traditional formulation is constrained to pairwise matching between single objects,\nlimiting its utility in scenarios and applications requiring multiple-to-one or multiple-to-multiple object\nmatching. In this paper, we introduce the Joint Gromov-Wasserstein (JGW) objective and extend the\noriginal framework of GW to enable simultaneous matching between collections of objects. Our formu-\nlation provides a non-negative dissimilarity measure that identifies partially isomorphic distributions of\nmm-spaces, with point sampling convergence. We also show that the 

### arxiv langchain

In [35]:
def get_paper_from_arxiv_id_langchain(arxiv_id: str):
    """
    Retrieve paper from arXiv using its arXiv ID. ==> returns a Langchain Document
    """
    search = "2304.07814"
    retriever = ArxivRetriever(
        load_max_docs=2,
        get_full_documents=True,
    )
    docs = retriever.invoke(search)
    return docs

# get_paper_from_arxiv_id_langchain(arxiv_id)

### pmids

In [36]:
def get_paper_from_pmid(pmid:str):
    src = FindIt(pmid)
    if src.url:
        pdf_text = parse_pdf_file(src.url)
        return pdf_text
    else:
       print(src.reason)

pmid = "29641911"
get_paper_from_pmid(pmid)

'Sickle Cell Anemia and Its Phenotypes\nThomas N. Williams1,2, Swee Lay Thein3\nThomas N. Williams: tom.williams@imperial.ac.uk; Swee Lay Thein: sweelay.thein@nih.gov\n1Department of Epidemiology and Demography, KEMRI/Wellcome Trust Research Programme, \nKilifi, Kenya\n2Department of Medicine, Imperial College London, London W2 1NY , United Kingdom\n3Sickle Cell Branch, National Heart, Lung, and Blood Institute, National Institutes of Health, \nBethesda, Maryland 20892-1589, USA\nAbstract\nIn the 100 years since sickle cell anemia (SCA) was first described in the medical literature, \nstudies of its molecular and pathophysiological basis have been at the vanguard of scientific \ndiscovery. By contrast, the translation of such knowledge into treatments that improve the lives \nof those affected has been much too slow. Recent years, however, have seen major advances on \nseveral fronts. A more detailed understanding of the switch from fetal to adult hemoglobin and \nthe identification of

### pmcid

In [None]:

def download_pdf_via_ftp(url: str) -> bytes:
    """
    Download a PDF file from an FTP URL and return its content as bytes.
    """
    parsed_url = urlparse(url)
    ftp_host = parsed_url.netloc
    ftp_path = parsed_url.path

    file_buffer = BytesIO()

    with FTP(ftp_host) as ftp:
        ftp.login() 
        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
            
    file_buffer.getvalue()
    file_buffer.seek(0)
    return file_buffer

import xml.etree.ElementTree as ET

def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
    """
    Download and parse a PDF from PubMed using its PMID.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
    response = requests.get(url)
    cleaned_string = response.content.decode('utf-8').strip()
    try:
        root = ET.fromstring(cleaned_string)
        pdf_link_element = root.find(".//link[@format='pdf']")
        ftp_url = pdf_link_element.get('href')
        file_byte = download_pdf_via_ftp(ftp_url)

        reader = PdfReader(file_byte)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
        print(f"got {pmid} via ftp download")
        return text
    except Exception as e:
        print(e)

pmid = "PMC5334499"
parse_pdf_from_pubmed_pmid(pmid)

got PMC5334499 via ftp download


'Matteo Bauckneht, Roberta Piva, Gianmario Sambuceti, Francesco Grossi, Silvia Morbelli \nEDITORIAL\n27 February 28, 2017|Volume 9|Issue 2|WJR|www.wjgnet.com\nEvaluation of response to immune checkpoint inhibitors: Is \nthere a role for positron emission tomography?\nMatteo Bauckneht, Roberta Piva, Gianmario Sambuceti, \nSilvia Morbelli, Nuclear Medicine Unit, IRCCS San Martino-\nIST, University of Genoa, 16132 Genoa, Italy\nFrancesco Grossi, Lung Cancer Unit, IRCCS San Martino-IST, \nUniversity of Genoa, 16132 Genoa, Italy\nAuthor contributions:  Morbelli S conceived and designed \nthe study; Bauckneht M and Morbelli S drafted the manuscript; \nBauckneht M and Piva R prepared the tables and figures; \nSambuceti G and Grossi F critically revised the manuscript; all \nthe authors approved the final version of the paper.\nConflict-of-interest statement: The authors have no conflicts of \ninterest related to this publication to disclose.\nOpen-Access: This article is an open-access articl

### doi

In [38]:
def download_pdf_from_url(url):
    """
    Download and extract text from a PDF URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
        raise Exception(f"URL did not return a PDF (got {content_type})")
    
    reader = PdfReader(BytesIO(response.content))
    text = ""
    for page in reader.pages:
        text += page.extract_text() #or ""
    return text

In [39]:

def download_paper_from_doi(doi):
    """
    Attempt to download paper from DOI with multiple fallback methods
    """
    # Clean DOI if it has prefix
    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
    
    # Method 1: Try Unpaywall API (free, legal access)
    try:
        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
        response = requests.get(unpaywall_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'):
                pdf_url = data['best_oa_location']['url_for_pdf']
                text = download_pdf_from_url(pdf_url)
                print(f"Found PDF via Unpaywall: {pdf_url}")
                return text
    except Exception as e:
        print(f"Unpaywall failed: {e}")


doi = '10.1097'
download_paper_from_doi(doi)

### serpapi

In [40]:
def get_pdf_content_serpapi(doi: str) -> str:
    """
    Get the link to the paper from its DOI using SerpAPI Google Scholar search.
    """
    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
    results = client.search({
        'engine': 'google_scholar',
        'q': doi,
    })

    pdf_path = results["organic_results"][0]["link"]
    pdf_text = parse_pdf_file(pdf_path)
    return pdf_text


get_pdf_content_serpapi(doi)



' \n \nResources  Guidelines   Chen VL, Morgan TR, Rotman Y, et al. Resmetirom therapy for metabolic dysfunction-associated steatotic liver disease: October 2024 updates to AASLD Practice Guidance. Hepatology. 2025;81:312-320. doi:10.1097/HEP.0000000000001112  Cusi K, Isaacs S, Barb D, et al. American Association of Clinical Endocrinology clinical practice guideline for the diagnosis and management of nonalcoholic fatty liver disease in primary care and endocrinology clinical settings: co-sponsored by the American Association for the Study of Liver Diseases (AASLD). Endocr Pract. 2022;28:528-562. doi:10.1016/j.eprac.2022.03.010  de Franchis R, Bosch J, Garcia-Tsao G, et al. Baveno VII - renewing consensus in portal hypertension. J Hepatol. 2022;76:959-974. doi:10.1016/j.jhep.2021.12.022  European Association for the Study of the Liver. EASL Clinical Practice Guidelines on non-invasive tests for evaluation of liver disease severity and prognosis - 2021 update. J Hepatol. 2021;75:659-689

## parsing the verbose input to extract papers ids

In [41]:
exemple_llm_output = """arxiv_id = "2511.16868"

pmid = "29641911"
pmid = "PMC5334499"
doi = '10.1097'"""

### initialize extractor

In [42]:



class ReferenceExtractor:
    """Extract and classify references from LLM outputs."""
    
    # Regex patterns for identification
    DOI_PATTERN = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
    DOI_LOOSE = r"10\.\d{4,9}(?:/[-._;()/:A-Za-z0-9]+)?"
    PMID_PATTERN = r"\b\d{7,8}\b"
    ARXIV_NEW = r"\b\d{4}\.\d{4,5}(?:v\d+)?\b"
    ARXIV_OLD = r"\b[a-z\-]+/\d{7}\b"
    PMCID_PATTERN = r"\bPMC\d+\b"
    
    def __init__(self):
        """Initialize the extractor with compiled regex patterns."""
        self.patterns = {
            'doi': re.compile(f"({self.DOI_PATTERN})|({self.DOI_LOOSE})", re.IGNORECASE),
            'pmid': re.compile(self.PMID_PATTERN),
            'arxiv': re.compile(f"({self.ARXIV_NEW})|({self.ARXIV_OLD})", re.IGNORECASE),
            'pmcid': re.compile(self.PMCID_PATTERN, re.IGNORECASE)
        }
    
    def extract_references(self, text: str) -> List[Tuple[str, str]]:
        """
        Extract all references from text and classify them.
        
        Args:
            text: Input string that may contain references in various formats
            
        Returns:
            List of tuples: (reference_value, reference_type)
        """
        references = []
        seen = set()
        
        # First, try to parse as a list-like string
        list_refs = self._extract_from_list_format(text)
        if list_refs:
            for ref in list_refs:
                ref_type = self._classify_single_ref(ref)
                if ref not in seen:
                    references.append((ref, ref_type))
                    seen.add(ref)
            return references
        
        # If not a list format, extract using regex patterns
        for ref_type, pattern in self.patterns.items():
            matches = pattern.finditer(text)
            for match in matches:
                ref_value = match.group(0).strip()
                if ref_value not in seen:
                    references.append((ref_value, ref_type))
                    seen.add(ref_value)
        
        return references
    
    def _extract_from_list_format(self, text: str) -> List[str]:
        """
        Extract references from list-like formats.
        Handles: "id1,id2,id3" and '["id1","id2"]' and "['id1', 'id2']"
        """
        text = text.strip()
        
        # Try parsing as Python list string
        if text.startswith('[') and text.endswith(']'):
            try:
                # Remove brackets and quotes, split by comma
                cleaned = text[1:-1]
                # Handle both single and double quotes
                items = re.findall(r'["\']([^"\']+)["\']', cleaned)
                if items:
                    return [item.strip() for item in items]
            except:
                pass
        
        # Try comma-separated format (no brackets)
        if ',' in text and not any(char in text for char in ['\n', '(', ')']):
            # Check if it looks like a simple list
            if text.count(',') >= 1 and len(text) < 200:
                items = [item.strip().strip('"\'') for item in text.split(',')]
                # Filter out empty strings
                return [item for item in items if item]
        
        return []
    
    def _classify_single_ref(self, ref: str) -> str:
        """Classify a single extracted reference string."""
        ref = ref.strip().strip('"\'')
        
        # Check each pattern in priority order
        if re.match(r"10\.\d{4,9}(?:/[-._;()/:A-Za-z0-9]+)?", ref, re.IGNORECASE):
            return "doi"
                 
        if re.match(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", ref, re.IGNORECASE):
            return "doi"
        
        if re.match(r"^PMC\d+$", ref, re.IGNORECASE):
            return "pmcid"
        
        if re.match(r"^\d{4}\.\d{4,5}(?:v\d+)?$", ref):
            return "arxiv"
        
        if re.match(r"^[a-z\-]+/\d{7}$", ref, re.IGNORECASE):
            return "arxiv"
        
        if re.match(r"^\d{7,8}$", ref):
            return "pmid"
        
        return "unknown"

In [43]:
extractor = ReferenceExtractor()

extractor.extract_references(exemple_llm_output)

[('10.1097', 'doi'),
 ('29641911', 'pmid'),
 ('2511.16868', 'arxiv'),
 ('PMC5334499', 'pmcid')]

### initialse router

In [44]:

def process_ref(extr_ref:tuple[str,str]) -> str:
    if extr_ref[1] == "arxiv":
        return get_paper_from_arxiv_id(extr_ref[0])
    elif extr_ref[1] == "pmid":
        for tool in [get_paper_from_pmid, parse_pdf_from_pubmed_pmid]:
            try:
                return tool(extr_ref[0])
            except:
                continue
    elif extr_ref[1] == "doi":
        for tool in [get_pdf_content_serpapi, download_paper_from_doi]:
            try:
                return tool(extr_ref[0])
            except:
                continue
    elif extr_ref[1] == "pmcid":
        return parse_pdf_from_pubmed_pmid(extr_ref[0])
           

In [45]:
# fetch docs
extractor = ReferenceExtractor()
REFS = extractor.extract_references(exemple_llm_output) # Change here the type of IDs to DEBUG
raw_docs=[]

for ref in tqdm(REFS):
    if ref[0] not in set(["existing_reference"]):
        print(ref[0])
        text = process_ref(ref)
        if text:
            raw_docs.append(LangchainDocument(page_content=text,metadata={'source':ref[0]}))
    
recover_yield = f" *** -> {round(100*len(raw_docs)/len(REFS))}% papers downloaded"
print(recover_yield)

  0%|          | 0/4 [00:00<?, ?it/s]

10.1097


 25%|██▌       | 1/4 [00:01<00:03,  1.10s/it]

29641911


 50%|█████     | 2/4 [00:02<00:02,  1.27s/it]2025-11-29 23:37:16 MacBook-Air-3.local arxiv[49765] INFO Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=2511.16868&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-11-29 23:37:16 MacBook-Air-3.local arxiv[49765] INFO Got first page: 1 of 1 total results


2511.16868


 75%|███████▌  | 3/4 [00:03<00:01,  1.13s/it]

PMC5334499


100%|██████████| 4/4 [00:06<00:00,  1.58s/it]

got PMC5334499 via ftp download
 *** -> 100% papers downloaded





## embeddings

In [46]:
VECTOR_DB_PATH='quantum_vector1'

In [47]:
cgkvjh

NameError: name 'cgkvjh' is not defined

### load if exists

In [48]:

from langchain_community.vectorstores import FAISS

# define embedding
embedding_name="BAAI/bge-large-en-v1.5"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_name,
                                    model_kwargs={"device": "mps"},
                                    encode_kwargs={"normalize_embeddings": True,},)
try:
    # Load the vector database from the folder
    print(f"try to load vector store from {VECTOR_DB_PATH}")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
        VECTOR_DB_PATH, 
        embedding_model, 
        allow_dangerous_deserialization=True  # Required for security in newer LangChain versions
    )
    existing_reference = [doc.metadata.get("source") for doc in KNOWLEDGE_VECTOR_DATABASE.docstore._dict.values()]
    print("vectro store loaded")
except Exception as e :
    print("FAISS load error:", e)
    KNOWLEDGE_VECTOR_DATABASE = None
    existing_reference = []
    print("no vector store found, creating a new one...")
    

2025-11-29 23:38:15 MacBook-Air-3.local sentence_transformers.SentenceTransformer[49765] INFO Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5


In [50]:
raw_docs

[Document(metadata={'source': '10.1097'}, page_content=' \n \nResources  Guidelines   Chen VL, Morgan TR, Rotman Y, et al. Resmetirom therapy for metabolic dysfunction-associated steatotic liver disease: October 2024 updates to AASLD Practice Guidance. Hepatology. 2025;81:312-320. doi:10.1097/HEP.0000000000001112  Cusi K, Isaacs S, Barb D, et al. American Association of Clinical Endocrinology clinical practice guideline for the diagnosis and management of nonalcoholic fatty liver disease in primary care and endocrinology clinical settings: co-sponsored by the American Association for the Study of Liver Diseases (AASLD). Endocr Pract. 2022;28:528-562. doi:10.1016/j.eprac.2022.03.010  de Franchis R, Bosch J, Garcia-Tsao G, et al. Baveno VII - renewing consensus in portal hypertension. J Hepatol. 2022;76:959-974. doi:10.1016/j.jhep.2021.12.022  European Association for the Study of the Liver. EASL Clinical Practice Guidelines on non-invasive tests for evaluation of liver disease severity 

In [None]:

from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings 
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm



# split texts into chunks
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                            AutoTokenizer.from_pretrained(embedding_name),
                            chunk_size=300,
                            chunk_overlap=int(300 / 10),
                            add_start_index=True,
                            strip_whitespace=True,
                            separators=["\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""])
  
docs_processed = text_splitter.split_documents(raw_docs)                          
NEW_KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE)

if raw_docs:
    docs_processed = text_splitter.split_documents(raw_docs)
    print("creating the vector store...")

    # create the vector store
    NEW_KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE)

    if KNOWLEDGE_VECTOR_DATABASE :
        print("merge vector store")
        KNOWLEDGE_VECTOR_DATABASE.merge_from(NEW_KNOWLEDGE_VECTOR_DATABASE)
        KNOWLEDGE_VECTOR_DATABASE.save_local(VECTOR_DB_PATH)
    else:
        NEW_KNOWLEDGE_VECTOR_DATABASE.save_local(VECTOR_DB_PATH)



Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


In [2]:
from tool_create_FAISS_vector import create_vector_store_from_list_of_doi

exemple_llm_output = """arxiv_id = "2501.16868"

pmid = "29641991"
pmid = "PMC5034499"
doi = '10.1007'"""

create_vector_store_from_list_of_doi(exemple_llm_output, 'oink')

2025-11-29 23:46:44 MacBook-Air-3.local sentence_transformers.SentenceTransformer[50946] INFO Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5


try to load vector store from oink
vectro store loaded


 50%|█████     | 2/4 [00:07<00:06,  3.47s/it]2025-11-29 23:46:58 MacBook-Air-3.local arxiv[50946] INFO Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=2501.16868&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2025-11-29 23:46:58 MacBook-Air-3.local arxiv[50946] INFO Got first page: 1 of 1 total results
100%|██████████| 4/4 [00:13<00:00,  3.32s/it]

got PMC5034499 via ftp download
 *** -> 75% papers downloaded





creating the vector store...
merge vector store


'oink'