<a href="https://colab.research.google.com/github/SANJIVANI2411/Automation_Project/blob/main/Fetech_Research_Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import logging
import re
import sys
import xml.etree.ElementTree as ET
from typing import List, Dict, Any

import requests


logging.basicConfig(level=logging.INFO)


PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
FETCH_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

def fetch_paper_ids(query: str, max_results: int = 10, timeout: float = 30.0) -> List[str]:
    """Search PubMed and return the IDs of the matching papers.

    Args:
        query: Search term sent to the ESearch endpoint as ``term``.
        max_results: Upper bound on the number of IDs requested (``retmax``).
            Defaults to 10, the value that was previously hard-coded.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The list stored under ``esearchresult.idlist`` in the JSON response,
        or an empty list when that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": max_results,
    }
    response = requests.get(PUBMED_API_URL, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])

def fetch_paper_details(paper_ids: List[str], timeout: float = 30.0) -> str:
    """Download the PubMed records for the given IDs as one XML document.

    Args:
        paper_ids: PubMed IDs to look up; they are sent comma-separated in a
            single EFetch request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The raw XML response body, or an empty string when ``paper_ids`` is
        empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    if not paper_ids:
        return ""
    params = {
        "db": "pubmed",
        "id": ",".join(paper_ids),
        "retmode": "xml",
    }
    response = requests.get(FETCH_API_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

def _element_text(parent: ET.Element, path: str) -> str:
    """Return the trimmed text of the first element matching *path*, or ""."""
    node = parent.find(path)
    if node is None:
        return ""
    # itertext() also collects text that sits inside or after inline markup
    # such as <i>/<sub>; plain ``.text`` stops at the first child element and
    # is None for an empty element.
    return "".join(node.itertext()).strip()


def extract_relevant_data(xml_data: str) -> List[Dict[str, Any]]:
    """Extract the fields of interest from a PubMed EFetch XML response.

    Args:
        xml_data: XML document containing ``PubmedArticle`` elements. An
            empty or whitespace-only string is accepted and yields no papers.

    Returns:
        One dict per ``PubmedArticle`` with the keys ``pubmed_id``, ``title``,
        ``publication_date`` and ``authors``. ``authors`` is a list of dicts
        with ``name`` ("LastName ForeName") and ``affiliation``. Missing
        values are returned as empty strings, never ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is non-empty but
            not well-formed XML.
    """
    # An empty document is what the fetch step produces when the search
    # matched nothing; parsing it would raise ParseError.
    if not xml_data or not xml_data.strip():
        return []

    papers = []
    root = ET.fromstring(xml_data)
    for article in root.findall(".//PubmedArticle"):
        # Some records carry only a free-form MedlineDate instead of a Year.
        pub_date = (
            _element_text(article, ".//PubDate/Year")
            or _element_text(article, ".//PubDate/MedlineDate")
        )
        authors = []
        for author in article.findall(".//Author"):
            name_parts = (
                _element_text(author, ".//LastName"),
                _element_text(author, ".//ForeName"),
            )
            authors.append({
                "name": " ".join(filter(None, name_parts)),
                "affiliation": _element_text(author, ".//AffiliationInfo/Affiliation"),
            })
        papers.append({
            "pubmed_id": _element_text(article, ".//PMID"),
            "title": _element_text(article, ".//ArticleTitle"),
            "publication_date": pub_date,
            "authors": authors,
        })
    return papers

# "inc" and "ltd" are matched only as whole words so that academic
# affiliations such as "Princeton" or "Province" are not mistaken for
# companies; the longer keywords remain substring matches so that variants
# like "pharmaceuticals" or "biotechnology" still hit.
_COMPANY_AFFILIATION_RE = re.compile(
    r"pharma|biotech|corporation|company|\b(?:inc|ltd)\b",
    re.IGNORECASE,
)


def filter_non_academic_authors(papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep only papers where at least one author is affiliated with a company.

    An author counts as company-affiliated when their affiliation text
    contains one of the company keywords (case-insensitive).

    Args:
        papers: Paper dicts whose ``authors`` entry is a list of dicts with
            ``name`` and ``affiliation``. The input dicts are not modified.

    Returns:
        Shallow copies of the matching papers, in input order, each extended
        with ``non_academic_authors`` (author names) and
        ``company_affiliations`` (the affiliation strings in their original
        casing), index-aligned with each other.
    """
    filtered_papers = []
    for paper in papers:
        non_academic_authors = []
        company_affiliations = []
        for author in paper.get("authors") or []:
            # The affiliation may be missing or None when the record had none.
            affiliation = author.get("affiliation") or ""
            if _COMPANY_AFFILIATION_RE.search(affiliation):
                non_academic_authors.append(author.get("name", ""))
                company_affiliations.append(affiliation)
        if non_academic_authors:
            # Build a new dict so the caller's paper records stay untouched.
            filtered_papers.append({
                **paper,
                "non_academic_authors": non_academic_authors,
                "company_affiliations": company_affiliations,
            })
    return filtered_papers

def save_to_csv(papers: List[Dict[str, Any]], filename: str):
    """Write the given papers to *filename* as UTF-8 CSV.

    The file starts with a header row followed by one row per paper. Missing
    scalar fields become empty cells; the author and affiliation lists are
    flattened into a single cell each, joined with "; ". An INFO log line is
    emitted once the file has been written.
    """
    header = ["PubmedID", "Title", "Publication Date", "Non-academic Authors", "Company Affiliations"]
    # Lazily build one output row per paper; consumed by writerows() below.
    rows = (
        [
            entry.get("pubmed_id", ""),
            entry.get("title", ""),
            entry.get("publication_date", ""),
            "; ".join(entry.get("non_academic_authors", [])),
            "; ".join(entry.get("company_affiliations", [])),
        ]
        for entry in papers
    )
    with open(filename, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rows)
    logging.info(f"Results saved to {filename}")


# --- Run configuration: edit these values before executing the cell ---
query = "cancer research"   # search term sent to PubMed
filename = "results.csv"    # output CSV path; set to "" to print results instead
debug_mode = False          # True switches the root logger to DEBUG

if debug_mode:
    logging.getLogger().setLevel(logging.DEBUG)

# --- Pipeline: search -> fetch -> parse -> filter -> output ---
logging.info("Fetching papers...")
paper_ids = fetch_paper_ids(query)
xml_data = fetch_paper_details(paper_ids)
# fetch_paper_details returns "" when the search matched nothing; an empty
# string is not parseable XML, so skip the parse step in that case.
papers = extract_relevant_data(xml_data) if xml_data else []
filtered_papers = filter_non_academic_authors(papers)

if filename:
    save_to_csv(filtered_papers, filename)
    print(f"Results saved to {filename}")
else:
    for paper in filtered_papers:
        print(paper)


Results saved to results.csv


In [2]:
# Colab-only step: offer the CSV written by the previous cell ("results.csv")
# as a download. The google.colab package is only importable inside a Colab
# runtime, so this cell fails elsewhere.
from google.colab import files
files.download("results.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>