In [8]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [17]:
import requests
import csv
import re
import argparse
from typing import List, Dict, Optional
import sys
from transformers import pipeline

# Initialize LLM for entity recognition using a fully public NER model
llm = pipeline("ner", model="nlpaueb/legal-bert-base-uncased", aggregation_strategy="simple")  # Publicly available NER model

# Constants
PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
DETAILS_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

def fetch_pubmed_ids(query: str) -> List[str]:
    """Fetch PubMed IDs based on a search query."""
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": 10  # Adjust as needed
    }
    response = requests.get(PUBMED_API_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        return data.get("esearchresult", {}).get("idlist", [])
    return []

def fetch_paper_details(pubmed_ids: List[str]) -> List[Dict[str, str]]:
    """Fetch details for given PubMed IDs."""
    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "json"
    }
    response = requests.get(DETAILS_API_URL, params=params)
    papers = []
    if response.status_code == 200:
        data = response.json()
        for paper_id in pubmed_ids:
            paper_info = data.get("result", {}).get(paper_id, {})
            papers.append({
                "PubmedID": paper_id,
                "Title": paper_info.get("title", "Unknown"),
                "Publication Date": paper_info.get("pubdate", "Unknown"),
                "Non-academic Author(s)": extract_non_academic_authors(paper_info.get("authors", [])),
                "Company Affiliation(s)": extract_company_affiliations_llm(paper_info.get("source", "")),
                "Corresponding Author Email": extract_email(paper_info.get("source", ""))
            })
    return papers

def extract_non_academic_authors(authors: List[Dict[str, str]]) -> str:
    """Identify non-academic authors based on heuristics."""
    non_academic_authors = [author["name"] for author in authors if not re.search(r"university|lab|institute", author.get("affiliation", ""), re.IGNORECASE)]
    return ", ".join(non_academic_authors)

def extract_company_affiliations_llm(source: str) -> str:
    """Use an LLM to extract company affiliations from text."""
    entities = [entity['word'] for entity in llm(source) if entity['entity_group'] == 'ORG' and entity['score'] > 0.8]  # Filtering based on confidence score
    return ", ".join(set(entities))

def extract_email(source: str) -> str:
    """Extract email address from source data."""
    email_match = re.search(r"[\w.-]+@[\w.-]+\.[a-z]{2,}", source)
    return email_match.group(0) if email_match else ""

def save_to_csv(papers: List[Dict[str, str]], filename: str):
    """Save research papers to a CSV file."""
    keys = ["PubmedID", "Title", "Publication Date", "Non-academic Author(s)", "Company Affiliation(s)", "Corresponding Author Email"]
    with open(filename, "w", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(papers)

def main():
    parser = argparse.ArgumentParser(description="Fetch research papers from PubMed.")
    parser.add_argument("query", type=str, help="Search query for PubMed.")
    parser.add_argument("-f", "--file", type=str, help="Filename to save results as CSV.")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug mode.")
    args = parser.parse_args()

    pubmed_ids = fetch_pubmed_ids(args.query)
    if args.debug:
        print("Fetched PubMed IDs:", pubmed_ids)

    papers = fetch_paper_details(pubmed_ids)
    if args.file:
        save_to_csv(papers, args.file)
        print(f"Results saved to {args.file}")
    else:
        for paper in papers:
            print(paper)

if __name__ == "__main__":
    if "get_ipython" in globals():  # Detect if running in Jupyter/Colab
        sys.argv = ["fetch_pubmed_papers.py", "cancer research", "-f", "results.csv"]
    main()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Results saved to results.csv
