In [60]:
import requests
import csv
from xml.etree import ElementTree

# Function to fetch papers from PubMed based on the search query
def fetch_papers(query: str, retmax: int = 20, timeout: float = 30.0):
    """Search PubMed for *query* and return the raw ESearch XML.

    Args:
        query: Search term, sent as the ``term`` parameter.
        retmax: Maximum number of PubMed IDs to request (default 20).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string on HTTP 200; ``None`` on any other
        status code or when the request itself fails (timeout, no network).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,  # Search term
        'retmax': retmax,  # Number of papers to fetch
        'retmode': 'xml',  # We want the data in XML format
    }

    try:
        # Without an explicit timeout a stalled connection blocks forever.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors: print + None.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        print("Data fetched successfully!")
        return response.text  # Return the XML response text

    print(f"Error: {response.status_code}")
    return None

# Function to extract PubMed IDs from the fetched data
def extract_pubmed_ids(xml_data: str):
    """Return the text of every ``<Id>`` element in *xml_data*, in document order.

    The XML is parsed from a string; an empty list is returned when no
    ``<Id>`` descendants exist.
    """
    root = ElementTree.fromstring(xml_data)
    pubmed_ids = []
    for node in root.iterfind(".//Id"):
        pubmed_ids.append(node.text)
    return pubmed_ids

# Function to fetch detailed information about a specific paper using its PubMed ID
def fetch_paper_details(pubmed_id: str, timeout: float = 30.0):
    """Fetch the EFetch XML record for a single PubMed ID.

    Args:
        pubmed_id: PubMed ID of the paper, sent as the ``id`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string on HTTP 200; ``None`` on any other
        status code or when the request itself fails (timeout, no network).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,  # PubMed ID of the paper
        'retmode': 'xml',  # We want the data in XML format
    }

    try:
        # Without an explicit timeout a stalled connection blocks forever.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # One failing paper should not abort the caller's whole loop.
        print(f"Error fetching details for PubMed ID {pubmed_id}: {exc}")
        return None

    if response.status_code == 200:
        print(f"Details fetched for PubMed ID: {pubmed_id}")
        return response.text

    print(f"Error fetching details for PubMed ID {pubmed_id}: {response.status_code}")
    return None

# Function to parse the detailed paper data and extract necessary information
def parse_paper_data(xml_data: str):
    """Parse one PubMed record and extract the fields used for the report.

    The EFetch PubMed layout is tried first (``ArticleTitle``,
    ``PubDate/Year|Month|Day``, ``Author/LastName`` + ``ForeName``,
    ``AffiliationInfo/Affiliation``, ``ArticleId[@IdType='pubmed']``).  The
    element names the previous implementation looked for
    (``Item[@Name='Title']``, ``Author/Name``, ``Author/Email``) are kept as
    fallbacks so older inputs still parse.

    Args:
        xml_data: XML document as a string.

    Returns:
        A dict with the keys ``'Title'``, ``'PubmedID'``,
        ``'Publication Date'``, ``'Authors'`` (list of str),
        ``'Affiliations'`` (list of str, parallel to ``'Authors'``) and
        ``'Corresponding Author Email'``.  Missing values are the string
        ``"N/A"``; no value is ever ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_data* is not well-formed.
    """
    import re  # local import keeps this block self-contained

    def _text(node, default="N/A"):
        """Return the stripped text of *node*, inline markup included."""
        if node is None:
            return default
        value = "".join(node.itertext()).strip()
        return value or default

    def _first(*paths):
        """Return the first element matched by any of *paths*, else None."""
        for path in paths:
            node = tree.find(path)
            if node is not None:
                return node
        return None

    tree = ElementTree.fromstring(xml_data)

    title = _text(_first(".//ArticleTitle", ".//Item[@Name='Title']"))

    # PubDate usually holds Year/Month/Day children rather than direct text,
    # so assemble the date from whichever parts are present.
    pub_date = "N/A"
    pub_date_node = tree.find(".//PubDate")
    if pub_date_node is not None:
        parts = [
            (pub_date_node.findtext(tag) or "").strip()
            for tag in ("Year", "Month", "Day")
        ]
        parts = [part for part in parts if part]
        # Fall back to whatever text the element contains (e.g. MedlineDate).
        pub_date = " ".join(parts) if parts else _text(pub_date_node)

    # Prefer the id explicitly typed as "pubmed"; the first ArticleId in the
    # list may be a DOI or publisher id instead.
    pubmed_id = _text(_first(
        ".//PubmedData//ArticleId[@IdType='pubmed']",
        ".//MedlineCitation/PMID",
        ".//PubmedData//ArticleId",
    ))

    email_pattern = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

    authors = []
    affiliations = []
    corresponding_email = "N/A"

    for author in tree.findall(".//Author"):
        fore = (author.findtext("ForeName") or "").strip()
        last = (author.findtext("LastName") or "").strip()
        name = " ".join(part for part in (fore, last) if part)
        if not name:
            name = _text(author.find("CollectiveName"), default="")
        if not name:
            name = _text(author.find("Name"))

        # iter() finds both <Author><Affiliation> and the nested
        # <Author><AffiliationInfo><Affiliation> form.
        texts = [_text(node, default="") for node in author.iter("Affiliation")]
        affiliation = "; ".join(text for text in texts if text) or "N/A"

        authors.append(name)
        affiliations.append(affiliation)

        # Heuristic: the first e-mail address encountered is reported as the
        # corresponding author's.  Addresses are usually embedded in the
        # affiliation text; an explicit <Email> element wins if present.
        if corresponding_email == "N/A":
            candidate = _text(author.find("Email"), default="")
            if not candidate:
                match = email_pattern.search(affiliation)
                candidate = match.group(0) if match else ""
            if candidate:
                corresponding_email = candidate

    return {
        'Title': title,
        'PubmedID': pubmed_id,
        'Publication Date': pub_date,
        'Authors': authors,
        'Affiliations': affiliations,
        'Corresponding Author Email': corresponding_email
    }

# Function to filter authors affiliated with pharmaceutical/biotech companies
def filter_by_affiliation(data):
    """Keep only papers with at least one company-affiliated author.

    An author counts as company-affiliated when their affiliation string
    contains, case-insensitively, any of the keywords below.  This is a plain
    substring heuristic, so e.g. "pharma" also matches "Pharmacology".

    Args:
        data: Iterable of dicts with the keys ``'PubmedID'``, ``'Title'``,
            ``'Publication Date'``, ``'Authors'``, ``'Affiliations'`` and
            ``'Corresponding Author Email'``; ``'Authors'`` and
            ``'Affiliations'`` are parallel lists.

    Returns:
        A list with one dict per matching paper, keyed by the CSV column
        names (``'PubMedID'``, ``'Title'``, ``'Publication Date'``,
        ``'Non-academic Author(s)'``, ``'Company Affiliation(s)'``,
        ``'Corresponding Author Email'``).  Papers without a match are dropped.
    """
    company_keywords = ['pharma', 'biotech', 'Pfizer', 'Moderna', 'Johnson & Johnson', 'AstraZeneca', 'Bayer', 'Novartis', 'Sanofi']
    # Lower-case the keywords once instead of once per affiliation.
    lowered_keywords = [keyword.lower() for keyword in company_keywords]

    filtered_data = []
    for entry in data:
        non_academic_authors = []
        companies = []

        # zip() keeps names and affiliations paired without index arithmetic.
        for author, affiliation in zip(entry['Authors'], entry['Affiliations']):
            if not affiliation:
                # A missing/empty affiliation (None or "") cannot match.
                continue
            haystack = affiliation.lower()
            if any(keyword in haystack for keyword in lowered_keywords):
                non_academic_authors.append(author if author else "N/A")
                companies.append(affiliation)

        if non_academic_authors:
            filtered_data.append({
                'PubMedID': entry['PubmedID'],
                'Title': entry['Title'],
                'Publication Date': entry['Publication Date'],
                'Non-academic Author(s)': ", ".join(non_academic_authors),
                'Company Affiliation(s)': ", ".join(companies),
                'Corresponding Author Email': entry['Corresponding Author Email']
            })

    return filtered_data

# Function to save filtered data to a CSV file
def save_to_csv(data, filename='papers.csv'):
    """Write the filtered rows to a CSV file.

    Args:
        data: Iterable of dicts whose keys are exactly the column names
            listed in ``fieldnames`` below.
        filename: Output path; defaults to ``papers.csv`` in the current
            working directory.

    Raises:
        OSError: if the file cannot be opened for writing.  The notebook run
            hit ``PermissionError``; possibly the file was open in another
            program, which is worth checking first.
        ValueError: if a row contains a key that is not in ``fieldnames``.
    """
    fieldnames = ['PubMedID', 'Title', 'Publication Date', 'Non-academic Author(s)', 'Company Affiliation(s)', 'Corresponding Author Email']

    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) cannot
    # encode many characters that appear in titles and author names.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(data)
        print(f"Filtered data saved to {filename}.")

# Main function to handle user input and execution
def main():
    """Run the search -> fetch -> parse -> filter -> save pipeline once.

    Uses a fixed query string and stops silently when the search step
    yields nothing.
    """
    # Manually set your search query here for testing
    query = "pharmaceutical research"

    search_xml = fetch_papers(query)
    if not search_xml:
        return

    # Collect the parsed record of every paper whose details could be fetched.
    papers_data = []
    for pubmed_id in extract_pubmed_ids(search_xml):
        detail_xml = fetch_paper_details(pubmed_id)
        if not detail_xml:
            continue
        papers_data.append(parse_paper_data(detail_xml))

    # Keep only company-affiliated papers, then write them out.
    company_rows = filter_by_affiliation(papers_data)
    save_to_csv(company_rows)

# Run the pipeline as soon as this cell executes. main() takes no arguments;
# the query is set inside it.
main()

Data fetched successfully!
Details fetched for PubMed ID: 40080018
Details fetched for PubMed ID: 40079843
Details fetched for PubMed ID: 40079842
Details fetched for PubMed ID: 40079787
Details fetched for PubMed ID: 40079709
Details fetched for PubMed ID: 40079707
Details fetched for PubMed ID: 40079695
Details fetched for PubMed ID: 40079557
Details fetched for PubMed ID: 40079429
Details fetched for PubMed ID: 40079412
Details fetched for PubMed ID: 40079395
Details fetched for PubMed ID: 40079266
Details fetched for PubMed ID: 40079210
Details fetched for PubMed ID: 40079180
Details fetched for PubMed ID: 40079164
Details fetched for PubMed ID: 40079157
Details fetched for PubMed ID: 40079137
Details fetched for PubMed ID: 40079074
Details fetched for PubMed ID: 40079018
Details fetched for PubMed ID: 40078968


PermissionError: [Errno 13] Permission denied: 'papers.csv'

In [None]:
import os
# Show the directory that relative paths such as 'papers.csv' resolve against.
print("Current Working Directory:", os.getcwd())

In [None]:
%%writefile README.md
# Get Papers List

This Python program fetches research papers from PubMed based on a user query. It filters the papers based on whether at least one author is affiliated with a pharmaceutical or biotech company, and returns the results in a CSV file.

## Features

- Fetch research papers from PubMed API.
- Filter results to identify authors affiliated with pharmaceutical or biotech companies.
- Output the filtered results to a CSV file with the following columns:
  - PubMedID: Unique identifier for the paper.
  - Title: Title of the paper.
  - Publication Date: Date the paper was published.
  - Non-academic Author(s): Names of authors affiliated with non-academic institutions.
  - Company Affiliation(s): Names of pharmaceutical/biotech companies.
  - Corresponding Author Email: Email address of the corresponding author.

## Requirements

- Python 3.13 or higher
- Poetry (for dependency management)
- Dependencies will be automatically installed via `poetry install`.

## Setup Instructions

1. Clone this repository to your local machine.
   
   ```bash
   git clone https://github.com/your-username/get-papers-list.git
   cd get-papers-list
   ```

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def search_papers(query):
    """Search PubMed for *query* and return the matching PubMed IDs.

    Args:
        query: Search term, sent as the ``term`` parameter.

    Returns:
        A list of ID strings taken from ``<IdList>/<Id>`` in the response.
        An empty list is returned when the request fails, the status code is
        not 200, the body is not well-formed XML, or no IDs are present.
    """
    print(f"Searching for: {query}")
    # E-utilities are served from eutils.ncbi.nlm.nih.gov/entrez/eutils/.
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Passing params lets requests URL-encode the query (spaces, '&', ...).
    params = {'db': 'pubmed', 'term': query, 'retmax': 10, 'retmode': 'xml'}

    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return []

    print("API call successful.")
    print("Response:", response.text)  # Print the raw response to check XML structure

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as exc:
        print(f"Failed to parse response: {exc}")
        return []

    # findall() yields [] when <IdList> is absent, whereas
    # root.find('IdList') would return None and break the chained call.
    return [id_elem.text for id_elem in root.findall('./IdList/Id')]

def save_papers_to_csv(paper_ids, filename):
    """Write one placeholder row per paper ID to *filename* as CSV.

    Only the number of IDs is used: each row holds mock values with a
    running "Paper N" title. Nothing is written when *paper_ids* is empty.
    Any error raised while building or saving the table is printed, not
    raised.
    """
    if not paper_ids:
        print("No papers to save.")
        return

    # Mock rows (real details could be fetched from the IDs instead).
    placeholder_rows = []
    for position in range(1, len(paper_ids) + 1):
        placeholder_rows.append({
            'Title': f'Paper {position}',
            'Authors': 'Author A, Author B',
            'Source': 'Journal XYZ',
            'Year': '2021',
        })

    try:
        frame = pd.DataFrame(placeholder_rows)
        print(f"Data to be saved:\n{frame}")  # Debug output of the table
        frame.to_csv(filename, index=False)
        print(f"Results saved to {filename}")
    except Exception as err:
        print(f"Error saving the CSV: {err!s}")

def main(query, filename):
    """Search for *query* and hand the resulting IDs to the CSV writer.

    Calls ``search_papers`` and ``save_papers_to_csv``, which are defined
    elsewhere in this notebook.
    """
    print(f"Fetching papers for query: {query}")
    found_ids = search_papers(query)
    save_papers_to_csv(paper_ids=found_ids, filename=filename)

# Running the function with a test query and output filename
query = "cancer research"
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_filename = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop/papers.csv"  # Save to Desktop
main(query=query, filename=output_filename)

In [None]:
import requests

def search_papers(query):
    """Search PubMed for *query* and return the raw ESearch XML.

    Args:
        query: Search term, sent as the ``term`` parameter.

    Returns:
        The response body as a string on HTTP 200; ``None`` on any other
        status code or when the request itself fails (timeout, no network).
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': 10,  # Number of results to fetch
        'retmode': 'xml'  # Change to 'json' if the response is expected in JSON format
    }

    # The request is the only step here that can raise, so it is what the
    # try block guards.
    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    # Report success only after the status code has been checked.
    print("API call successful.")
    print("Response:", response.text)  # Print raw response to inspect it
    return response.text  # Raw XML; parsing is left to the caller

In [None]:
import xml.etree.ElementTree as ET

def parse_papers_from_xml(xml_data):
    """Return the text of every ``<Id>`` descendant in *xml_data*.

    Prints how many IDs were found. If anything goes wrong while parsing,
    the error is printed and ``None`` is returned instead of a list.
    """
    try:
        id_nodes = ET.fromstring(xml_data).findall('.//Id')
        found = []
        for node in id_nodes:
            found.append(node.text)
        print(f"Found {len(found)} papers.")
    except Exception as err:
        print(f"Error parsing XML: {err}")
        return None
    return found

In [None]:
def main(query, filename):
    """Search, extract IDs, and save -- stopping early when a step yields nothing.

    Calls ``search_papers``, ``parse_papers_from_xml`` and
    ``save_papers_to_csv``, which are defined elsewhere in this notebook.
    """
    print(f"Fetching papers for query: {query}")

    search_xml = search_papers(query)  # raw response
    if not search_xml:
        print("Failed to fetch data.")
        return

    ids = parse_papers_from_xml(search_xml)
    if not ids:
        print("No papers found.")
        return

    save_papers_to_csv(ids, filename)  # write the CSV

# Running the function with the query
query = "cancer research"
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_filename = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop/papers.csv"  # Adjust path if needed
main(query=query, filename=output_filename)

In [None]:
def main(query, filename):
    """Search, extract IDs, and save -- stopping early when a step yields nothing.

    Calls ``search_papers``, ``parse_papers_from_xml`` and
    ``save_papers_to_csv``, which are defined elsewhere in this notebook.
    """
    print(f"Fetching papers for query: {query}")

    search_xml = search_papers(query)  # raw response
    if not search_xml:
        print("Failed to fetch data.")
        return

    ids = parse_papers_from_xml(search_xml)
    if not ids:
        print("No papers found.")
        return

    save_papers_to_csv(ids, filename)  # write the CSV

# Running the function with the query
query = "cancer research"
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_filename = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop/papers.csv"  # Adjust path if needed
main(query=query, filename=output_filename)

In [None]:
import requests
import xml.etree.ElementTree as ET

def fetch_paper_details(paper_id):
    """Fetch one PubMed record via EFetch and extract a few summary fields.

    Args:
        paper_id: PubMed ID of the paper, sent as the ``id`` parameter.

    Returns:
        A dict with the keys ``"Title"``, ``"Authors"`` (comma-separated last
        names), ``"Source"`` and ``"Year"``; each value is ``'N/A'`` when the
        corresponding element is missing or empty.  ``None`` is returned when
        the request fails, the status code is not 200, or the body is not
        well-formed XML.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': paper_id,
        'retmode': 'xml',  # Get the data in XML format
    }

    def _first_text(root, *paths):
        """Return the stripped text of the first non-empty match, else 'N/A'."""
        for path in paths:
            node = root.find(path)
            # Compare against None explicitly: an Element without children
            # is falsy, so a bare truth test rejects every leaf element.
            if node is not None:
                text = "".join(node.itertext()).strip()
                if text:
                    return text
        return 'N/A'

    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to fetch details for paper ID: {paper_id} ({e})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch details for paper ID: {paper_id}")
        return None

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as e:
        print(f"Error extracting paper details: {e}")
        return None

    last_names = [(node.text or "").strip() for node in root.findall('.//Author/LastName')]
    last_names = [name for name in last_names if name]

    return {
        "Title": _first_text(root, './/ArticleTitle'),
        "Authors": ", ".join(last_names) if last_names else 'N/A',
        # EFetch PubMed XML carries the journal name under Journal/Title;
        # './/Source' is kept first for inputs that do provide it.
        "Source": _first_text(root, './/Source', './/Journal/Title', './/MedlineTA'),
        "Year": _first_text(root, './/PubDate/Year'),
    }

In [None]:
import pandas as pd

def save_papers_to_csv(paper_ids, filename):
    """Fetch details for each ID and write the successful ones to *filename*.

    IDs whose lookup returns nothing are skipped. When no details were
    collected at all, a message is printed and no file is written.
    """
    collected = []
    for pid in paper_ids:
        print(f"Fetching details for paper ID: {pid}")
        record = fetch_paper_details(pid)
        if not record:
            continue
        collected.append(record)

    if not collected:
        print("No paper details found to save.")
        return

    # One row per collected record, without the index column.
    pd.DataFrame(collected).to_csv(filename, index=False)
    print(f"Results saved to {filename}")

# Running the final main function
def main(query, filename):
    """Search, extract IDs, and save -- stopping early when a step yields nothing.

    Calls ``search_papers``, ``parse_papers_from_xml`` and
    ``save_papers_to_csv``, which are defined elsewhere in this notebook.
    """
    print(f"Fetching papers for query: {query}")

    search_xml = search_papers(query)  # raw response
    if not search_xml:
        print("Failed to fetch data.")
        return

    ids = parse_papers_from_xml(search_xml)
    if not ids:
        print("No papers found.")
        return

    save_papers_to_csv(ids, filename)  # write the CSV

# Running the function with the query
query = "cancer research"
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_filename = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop/papers.csv"  # Adjust path if needed
main(query=query, filename=output_filename)

In [None]:
import pandas as pd

def save_papers_to_csv(papers, filename):
    """Write *papers* to *filename* as CSV without an index column.

    An empty or missing *papers* only prints a notice. A failure while
    writing is printed rather than raised.
    """
    if not papers:
        print("No papers found to save.")
        return

    table = pd.DataFrame(papers)
    try:
        table.to_csv(filename, index=False)  # Save data to CSV
        print(f"Results saved to {filename}")
    except Exception as err:
        print(f"Error saving the CSV: {err}")

In [None]:
# Hand-written sample records using the column names Title / Authors /
# Source / Year.
# NOTE(review): nothing visible in this notebook passes `papers` to a
# function afterwards -- confirm whether it is still needed.
papers = [
    {"Title": "Paper 1", "Authors": "Author A, Author B", "Source": "Journal XYZ", "Year": 2021},
    {"Title": "Paper 2", "Authors": "Author A, Author B", "Source": "Journal XYZ", "Year": 2021}
    # Add more papers...
]

In [None]:
# Re-run the pipeline with whichever `main` definition was executed last.
query = "cancer research"
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_filename = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop/papers.csv"
main(query=query, filename=output_filename)  # Run your main function to fetch and save the papers

In [None]:
import os
import pandas as pd

# Example data (replace this with your actual data)
data = {
    'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
    'Authors': ['Author A, Author B', 'Author A, Author B', 'Author A, Author B'],
    'Source': ['Journal XYZ', 'Journal XYZ', 'Journal XYZ'],
    'Year': [2021, 2021, 2021]
}

# Create DataFrame from the data
df = pd.DataFrame(data)

# Define the folder and filename.
# Forward slashes are used throughout: the previous literal ended in
# ")\Desktop", where "\D" is an invalid escape sequence that newer Python
# versions warn about.
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
output_directory = "C:/Users/Admin/Dropbox/My PC (LAPTOP-D6DN0OTQ)/Desktop"
output_filename = "papers.csv"
output_path = os.path.join(output_directory, output_filename)

try:
    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race; a failure here is reported like a write error.
    os.makedirs(output_directory, exist_ok=True)

    # Now save the file
    df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")
except OSError as e:
    print(f"Error saving the file: {e}")