<a href="https://colab.research.google.com/github/TechWhizGenius/Teja_INFO5731_Fall2024/blob/main/TERM_PROJECT/Scraping_PubMed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests xmltodict



In [None]:
import requests
import xmltodict
import pandas as pd
import time

# NCBI E-Utilities endpoints: ESearch is used to look up record IDs,
# EFetch to download the records themselves.
_eutils_root = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
base_url = f"{_eutils_root}/esearch.fcgi"
fetch_url = f"{_eutils_root}/efetch.fcgi"

# Search PubMed and collect the IDs of all matching records
def pubmed_search(query, batch_size=100, timeout=30):
    """Return every PubMed ID (PMID) that matches *query*.

    Pages through the ESearch endpoint ``batch_size`` IDs at a time. Paging
    stops as soon as the result count reported by the server has been
    reached, so no request is issued past the end of the result set.

    Args:
        query: Search term passed to ESearch as ``term``.
        batch_size: Number of IDs requested per call (``retmax``).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of PMIDs in the order the server returned them. If a request
        fails, the IDs collected up to that point are returned.
    """
    # NOTE(review): NCBI documents a cap on how deep ESearch can page into
    # PubMed results; very large result sets will end with a failed request
    # and a partial list -- confirm the current limit in the E-utilities docs.
    all_pmids = []
    start = 0

    while True:
        search_params = {
            "db": "pubmed",              # PubMed database
            "term": query,               # Search query
            "retmax": batch_size,        # Number of results per request
            "retstart": start,           # Starting point for each batch
            "retmode": "xml"             # Return results in XML format
        }

        # A timeout keeps a stalled connection from hanging the notebook;
        # network errors end the search but keep what was already collected.
        try:
            response = requests.get(base_url, params=search_params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data: {response.status_code}")
            break

        result = xmltodict.parse(response.content).get('eSearchResult') or {}

        # An empty <IdList/> parses to None: nothing (more) to collect.
        id_list = result.get('IdList')
        if not id_list:
            break

        # xmltodict yields a bare value for one <Id> and a list for several.
        pmids = id_list.get('Id') or []
        if not isinstance(pmids, list):
            pmids = [pmids]
        if not pmids:
            break
        all_pmids.extend(pmids)

        # Advance by what was actually returned, not by what was asked for.
        start += len(pmids)

        # Total number of hits for the query; when it is missing or not a
        # number, fall back to paging until an empty batch comes back.
        try:
            total = int(result.get('Count'))
        except (TypeError, ValueError):
            total = None
        if total is not None and start >= total:
            break

        # Add a delay to avoid overloading the server
        time.sleep(0.4)

    return all_pmids

# Helper: flatten an xmltodict value into plain text
def _node_text(node, default='N/A'):
    """Return the text of an xmltodict node, or *default* when it has none.

    xmltodict produces a plain string for simple elements, a dict (text under
    ``'#text'``) for elements with attributes or child tags, and a list for
    repeated elements. List items are flattened individually and joined with
    single spaces. For dict nodes only the ``'#text'`` part is kept.
    """
    if node is None:
        return default
    if isinstance(node, list):
        parts = [_node_text(item, '') for item in node]
        return ' '.join(part for part in parts if part) or default
    if isinstance(node, dict):
        return node.get('#text') or default
    return str(node)


# Helper: turn the <Author> node(s) into "Last Fore, Last Fore, ..."
def _format_authors(author_node):
    """Return a comma-separated author string ('' when there are no names)."""
    if not author_node:
        return ''
    # A single <Author> parses to one dict rather than a one-element list.
    entries = author_node if isinstance(author_node, list) else [author_node]

    names = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        last_name = _node_text(entry.get('LastName'), '')
        if last_name:
            fore_name = _node_text(entry.get('ForeName'), '')
            names.append(f"{last_name} {fore_name}".strip())
        else:
            # Group authors carry a CollectiveName instead of a LastName.
            collective = _node_text(entry.get('CollectiveName'), '')
            if collective:
                names.append(collective)
    return ', '.join(names)


# Helper: pull the publication year out of a <PubDate> node
def _publication_year(pub_date):
    """Return the year from a PubDate dict, or 'N/A' when none is present."""
    year = _node_text(pub_date.get('Year'), '')
    if year:
        return year
    # Some records give a free-text MedlineDate (e.g. "2019 Nov-Dec") instead
    # of a Year element; use the first token that starts with four digits.
    for token in _node_text(pub_date.get('MedlineDate'), '').split():
        if len(token) >= 4 and token[:4].isdigit():
            return token[:4]
    return 'N/A'


# Helper: download and parse the record for one PubMed ID
def _fetch_article(pmid, timeout):
    """Return the details dict for *pmid*, or None when it cannot be fetched."""
    fetch_params = {
        "db": "pubmed",
        "id": pmid,            # PubMed ID
        "retmode": "xml"
    }

    # A timeout keeps one stalled request from blocking the whole run, and a
    # network error only costs this one record.
    try:
        response = requests.get(fetch_url, params=fetch_params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch details for PMID {pmid}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch details for PMID {pmid}: {response.status_code}")
        return None

    data = xmltodict.parse(response.content)
    # An empty <PubmedArticleSet/> parses to None, hence the `or {}`.
    article_set = data.get('PubmedArticleSet') or {}
    record = article_set.get('PubmedArticle')
    if isinstance(record, list):
        record = record[0] if record else None
    article = ((record or {}).get('MedlineCitation') or {}).get('Article')
    if not article:
        print(f"No 'PubmedArticle' found for PMID {pmid}")
        return None

    journal = article.get('Journal') or {}
    pub_date = (journal.get('JournalIssue') or {}).get('PubDate') or {}

    return {
        "Title": _node_text(article.get('ArticleTitle')),
        "Authors": _format_authors((article.get('AuthorList') or {}).get('Author')),
        "Abstract": _node_text((article.get('Abstract') or {}).get('AbstractText')),
        "Year": _publication_year(pub_date),
        "Journal": _node_text(journal.get('Title')),
        "PMID": pmid,
        "URL": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    }


# Function to fetch paper details using PubMed IDs
def fetch_paper_details(pmids, timeout=30):
    """Fetch title, authors, abstract, year and journal for each PMID.

    One EFetch request is made per ID. IDs that cannot be fetched or parsed
    are reported with ``print`` and skipped.

    Args:
        pmids: Iterable of PubMed IDs.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Abstract``,
        ``Year``, ``Journal``, ``PMID`` and ``URL``. Missing text fields are
        'N/A'; ``Authors`` is '' when no author names are present.
    """
    articles = []

    for pmid in pmids:
        article = _fetch_article(pmid, timeout)
        if article is not None:
            articles.append(article)

        # Add a delay to avoid rate limiting
        time.sleep(0.4)

    return articles


# Topic to look up on PubMed
query = "Autonomous Car Ethics"

# Collect the matching PMIDs, then download the metadata for each of them
pmids = pubmed_search(query)
papers = fetch_paper_details(pmids)

# Tabulate the records and write them out as CSV without the index column
output_csv = "pubmed_Autonomous_Car_Ethics_papers.csv"
df = pd.DataFrame(papers)
df.to_csv(output_csv, index=False)

print(f"Data scraped and saved to '{output_csv}'.")


Failed to retrieve data: 500
Data scraped and saved to 'pubmed_Autonomous_Car_Ethics_papers.csv'.
