<a href="https://colab.research.google.com/github/Reshma721/pubmed-papers-fetcher/blob/main/papers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install biopython pandas rich




In [7]:
from Bio import Entrez
import pandas as pd
import time
from typing import List, Dict
import logging


In [10]:
# Install dependencies
!pip install biopython pandas rich

# Import necessary libraries
from Bio import Entrez
import pandas as pd
import time
from typing import List, Dict
import logging


# Set your email for PubMed API access
Entrez.email = "reshmauppar329@gmail.com"

# ✅ Logging configuration
def setup_logger(debug: bool = False):
    """Set up logging configuration."""
    logger = logging.getLogger("PubMed Logger")
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG if debug else logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    return logger


# ✅ Fetch papers safely
def fetch_papers(query: str, logger: logging.Logger, max_results: int = 50) -> List[Dict]:
    """Fetch research papers from PubMed using the query."""
    logger.info(f"Fetching papers for query: {query}")

    try:
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        record = Entrez.read(handle)
        ids = record.get("IdList", [])

        if not ids:
            logger.warning("No papers found.")
            return []

        papers = []

        for pubmed_id in ids:
            handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
            record = Entrez.read(handle)

            for article in record.get("PubmedArticle", []):
                article_data = article.get("MedlineCitation", {}).get("Article", {})

                # Safely access nested fields with .get() to avoid KeyError or IndexError
                authors = article_data.get("AuthorList", [])

                companies = [
                    a.get("AffiliationInfo", [{}])[0].get("Affiliation", "None")
                    for a in authors
                    if "pharma" in a.get("AffiliationInfo", [{}])[0].get("Affiliation", "").lower()
                ]

                papers.append({
                    "PubmedID": pubmed_id,
                    "Title": article_data.get("ArticleTitle", "N/A"),
                    "PublicationDate": article_data.get("ArticleDate", [{}])[0].get("Year", "N/A"),
                    "Authors": [f"{a.get('LastName', '')} {a.get('ForeName', '')}" for a in authors],
                    "CompanyAffiliations": ", ".join(companies) if companies else "None"
                })

            time.sleep(1)  # To avoid API rate limits

        logger.info(f"Fetched {len(papers)} papers.")
        return papers

    except Exception as e:
        logger.error(f"API Error: {e}")
        return []


# ✅ Filter non-academic papers
def filter_non_academic_papers(papers: List[Dict], logger: logging.Logger) -> List[Dict]:
    """Filter papers with pharmaceutical/biotech affiliations."""
    filtered = [paper for paper in papers if paper["CompanyAffiliations"] != "None"]
    logger.info(f"Filtered {len(filtered)} papers with company affiliations.")
    return filtered


# ✅ Save results as CSV (creates an empty CSV if no papers are found)
def save_as_csv(papers: List[Dict], filename: str, logger: logging.Logger):
    """Save papers to a CSV file."""
    if papers:
        df = pd.DataFrame(papers)
    else:
        # Create an empty CSV with headers if no papers are found
        df = pd.DataFrame(columns=["PubmedID", "Title", "PublicationDate", "Authors", "CompanyAffiliations"])

    df.to_csv(filename, index=False)
    logger.info(f"Results saved to {filename}")


# ✅ Main Execution
def main():
    query = input("Enter the PubMed query: ")
    output_file = "research_papers.csv"

    logger = setup_logger(debug=True)

    logger.info(f"Searching PubMed for: {query}")

    # Fetch and filter papers
    papers = fetch_papers(query, logger)
    filtered_papers = filter_non_academic_papers(papers, logger)

    # Save to CSV (creates an empty file if no results are found)
    save_as_csv(filtered_papers, output_file, logger)

    print(f"Results saved to: {output_file}")


# ✅ Run the program
main()


Enter the PubMed query: cancer AND therapy


2025-03-31 17:02:28,320 - INFO - Searching PubMed for: cancer AND therapy
2025-03-31 17:02:28,320 - INFO - Searching PubMed for: cancer AND therapy
2025-03-31 17:02:28,320 - INFO - Searching PubMed for: cancer AND therapy
2025-03-31 17:02:28,320 - INFO - Searching PubMed for: cancer AND therapy
INFO:PubMed Logger:Searching PubMed for: cancer AND therapy
2025-03-31 17:02:28,324 - INFO - Fetching papers for query: cancer AND therapy
2025-03-31 17:02:28,324 - INFO - Fetching papers for query: cancer AND therapy
2025-03-31 17:02:28,324 - INFO - Fetching papers for query: cancer AND therapy
2025-03-31 17:02:28,324 - INFO - Fetching papers for query: cancer AND therapy
INFO:PubMed Logger:Fetching papers for query: cancer AND therapy
2025-03-31 17:03:12,006 - ERROR - API Error: list index out of range
2025-03-31 17:03:12,006 - ERROR - API Error: list index out of range
2025-03-31 17:03:12,006 - ERROR - API Error: list index out of range
2025-03-31 17:03:12,006 - ERROR - API Error: list index 

Results saved to: research_papers.csv


In [11]:
from google.colab import files
files.download("research_papers.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# Clone your repository
!git clone https://github.com/Reshma721/pubmed-papers-fetcher.git

# Move into the cloned directory
%cd pubmed-papers-fetcher


Cloning into 'pubmed-papers-fetcher'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects:  33% (1/3)[Kremote: Counting objects:  66% (2/3)[Kremote: Counting objects: 100% (3/3)[Kremote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects:  50% (1/2)[Kremote: Compressing objects: 100% (2/2)[Kremote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects:  33% (1/3)Receiving objects:  66% (2/3)Receiving objects: 100% (3/3)Receiving objects: 100% (3/3), done.
/content/pubmed-papers-fetcher/pubmed-papers-fetcher


In [15]:
%pwd


'/content/pubmed-papers-fetcher/pubmed-papers-fetcher'

In [16]:
!git status


On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [19]:
!mv /path/to/your/code.py /content/pubmed-papers-fetcher/pubmed-papers-fetcher/




mv: cannot stat '/path/to/your/code.py': No such file or directory


In [23]:
!mv /content/papers.ipynb /content/pubmed-papers-fetcher/pubmed-papers-fetcher/


mv: cannot stat '/content/papers.ipynb': No such file or directory


In [22]:
!mv /content/papers.py /content/pubmed-papers-fetcher/pubmed-papers-fetcher/


mv: cannot stat '/content/papers.py': No such file or directory


In [None]:
from google.colab import files
uploaded = files.upload()
