In [11]:
pip install biopython




In [None]:
from Bio import Entrez

# Set your email here (NCBI requires it for API access)
Entrez.email = "your_email@example.com"

# Function to fetch metadata including title, abstract, and keywords
def fetch_pubmed_metadata(query, retmax=5):
    """
    Search PubMed and print the raw MEDLINE text of every matching article.

    Parameters:
    - query: The search term for PubMed (e.g., "cancer").
    - retmax: Maximum number of PubMed IDs requested from esearch.

    Returns nothing. One efetch request is issued per PubMed ID and its text
    is printed, followed by an 80-character separator line.
    """
    # Search PubMed for articles related to the query
    search_handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        # Release the connection even when parsing the response fails
        search_handle.close()
    pubmed_ids = search_result['IdList']

    # Fetch metadata for each PubMed ID
    for pubmed_id in pubmed_ids:
        fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
        try:
            data = fetch_handle.read()
        finally:
            fetch_handle.close()

        print(f"PubMed ID: {pubmed_id}")
        print(data)
        print('-' * 80)

# Example: Fetch metadata for articles related to 'cancer'
fetch_pubmed_metadata("cancer")


In [None]:
from Bio import Entrez, Medline

Entrez.email = "your_email@example.com"

def fetch_and_parse_pubmed(query, retmax=1): # can change no of articles
    """
    Search PubMed and print title, abstract, keywords and authors per article.

    Parameters:
    - query: The search term for PubMed (e.g., "cancer").
    - retmax: Maximum number of articles to retrieve.

    Returns nothing; the parsed fields are printed to stdout. Missing fields
    are replaced by a "No ... available" placeholder.
    """
    search_handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()
    pubmed_ids = search_result['IdList']

    # Nothing matched: skip efetch, which would be called with an empty id list
    if not pubmed_ids:
        print("No articles found for the given query.")
        return

    fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
    try:
        for article in Medline.parse(fetch_handle):
            title = article.get("TI", "No title available")
            abstract = article.get("AB", "No abstract available")
            # "OT" and "AU" are lists when present. Default to an empty list so the
            # placeholder text is never fed to str.join (which would split it into
            # single characters).
            keyword_list = article.get("OT", [])
            author_list = article.get("AU", [])
            keywords = "; ".join(keyword_list) if keyword_list else "No keywords available"
            authors = ", ".join(author_list) if author_list else "No authors available"

            print(f"Title: {title}")
            print(f"Abstract: {abstract}")
            print(f"Keywords: {keywords}")
            print(f"Authors: {authors}")
            print('-' * 80)
    finally:
        # Medline.parse is lazy, so the handle must stay open until the loop ends
        fetch_handle.close()

# Fetch metadata for cancer-related articles
fetch_and_parse_pubmed("cancer")


In [None]:
from Bio import Entrez, Medline
import pandas as pd
import time

# Set your email here for NCBI API access
Entrez.email = "your_email@example.com"

def fetch_pubmed_data(query, batch_size=100, max_results=1000, csv_filename="pubmed_all_data.csv"):
    """
    Fetches PubMed metadata in batches and saves it to a CSV file.

    Parameters:
    - query: The search term for PubMed (e.g., "cancer").
    - batch_size: Number of articles requested per esearch/efetch round trip.
    - max_results: Upper bound on the total number of articles to fetch.
    - csv_filename: The filename to save the data.

    Returns nothing. After every batch the CSV is rewritten with all records
    collected so far, so an interrupted run still leaves a usable file.
    """

    # Initialize variables for search
    all_records = []
    current_start = 0

    # Loop through batches
    while current_start < max_results:
        # Shrink the last batch so the run never exceeds max_results
        this_batch = min(batch_size, max_results - current_start)
        print(f"Fetching records {current_start + 1} to {current_start + this_batch}")

        # Look up the PubMed IDs for this page of results
        search_handle = Entrez.esearch(db="pubmed", term=query, retstart=current_start, retmax=this_batch)
        try:
            search_result = Entrez.read(search_handle)
        finally:
            search_handle.close()
        pubmed_ids = search_result['IdList']

        if not pubmed_ids:
            print("No more records found.")
            break

        # Fetch metadata for the current batch of PubMed IDs
        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
        try:
            # Parse each record and extract metadata
            for article in Medline.parse(fetch_handle):
                keywords = article.get("OT", [])  # Keywords are stored in "OT" (Other Terms)
                all_records.append({
                    "Title": article.get("TI", "No title available"),
                    "Abstract": article.get("AB", "No abstract available"),
                    "Keywords": "; ".join(keywords) if keywords else "No keywords available",
                    "Authors": ", ".join(article.get("AU", [])),  # Authors are stored in "AU"
                    "Journal": article.get("JT", "No journal available"),
                    "Year": article.get("DP", "No year available")
                })
        finally:
            # Medline.parse is lazy; close only after the records were consumed
            fetch_handle.close()

        # Advance by the number of IDs actually returned for this page
        current_start += len(pubmed_ids)

        # Save everything collected so far to CSV (checkpoint)
        df = pd.DataFrame(all_records)
        df.to_csv(csv_filename, index=False)
        print(f"Batch saved to {csv_filename}")

        # Optional: Introduce a delay to avoid hitting rate limits
        time.sleep(1)

    print(f"Finished fetching {len(all_records)} records. Data saved to {csv_filename}.")

# Fetch metadata for a large number of publications related to 'cancer' and save to CSV
fetch_pubmed_data(query="cancer", batch_size=100, max_results=1000, csv_filename="pubmed_cancer_all_data.csv")


In [25]:
from Bio import Entrez, Medline
import pandas as pd
import time

# Set your email for NCBI API access
Entrez.email = "your_email@example.com"

def fetch_large_pubmed_data(query, batch_size=1000, total_records=1000000, csv_filename="pubmed_large_data.csv", max_retries=3):
    """
    Fetches PubMed metadata in batches and writes each batch to a CSV file.

    Parameters:
    - query: The search term for PubMed (e.g., "cancer").
    - batch_size: Number of articles requested per esearch/efetch round trip.
    - total_records: Upper bound on the total number of records to fetch.
    - csv_filename: The filename to save the data. It is overwritten by the
      first batch of a run and appended to by later batches.
    - max_retries: How many consecutive failures of one batch are retried
      before the run stops.

    Returns nothing. Only one batch is held in memory at a time.

    NOTE(review): NCBI's E-utilities documentation describes a 10,000-record
    ceiling for paging through PubMed esearch results; confirm against the
    current documentation before requesting more than that.
    """

    columns = ["Title", "Abstract", "Keywords", "Authors", "Journal", "Year"]
    current_start = 0
    records_written = 0
    header_written = False
    failed_attempts = 0

    # Loop through batches to fetch data
    while current_start < total_records:
        # Shrink the last batch so the run never exceeds total_records
        this_batch = min(batch_size, total_records - current_start)
        print(f"Fetching records {current_start + 1} to {current_start + this_batch}")

        try:
            # Fetch data using esearch to get PubMed IDs
            search_handle = Entrez.esearch(db="pubmed", term=query, retstart=current_start, retmax=this_batch)
            try:
                search_record = Entrez.read(search_handle)
            finally:
                search_handle.close()
            pubmed_ids = search_record['IdList']

            if not pubmed_ids:
                print("No more records found.")
                break

            # Rows are collected per attempt, so a batch that fails half-way is
            # discarded instead of being written twice on retry.
            batch_rows = []
            fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
            try:
                for record in Medline.parse(fetch_handle):
                    keywords = record.get("OT", [])  # OT for Other Terms (keywords)
                    batch_rows.append({
                        "Title": record.get("TI", "No title available"),
                        "Abstract": record.get("AB", "No abstract available"),
                        "Keywords": "; ".join(keywords) if keywords else "No keywords available",
                        "Authors": ", ".join(record.get("AU", [])),  # AU for Authors
                        "Journal": record.get("JT", "No journal available"),
                        "Year": record.get("DP", "No year available")
                    })
            finally:
                # Medline.parse is lazy; close only after the records were consumed
                fetch_handle.close()

            # The first batch truncates the file and writes the header row; later
            # batches append without a header.
            df = pd.DataFrame(batch_rows, columns=columns)
            df.to_csv(csv_filename, mode='a' if header_written else 'w', header=not header_written, index=False)
            header_written = True
            records_written += len(batch_rows)

            # Advance by the number of IDs actually returned for this page
            current_start += len(pubmed_ids)
            failed_attempts = 0

            # Introduce a delay to respect rate limits
            time.sleep(1)

        except Exception as e:
            failed_attempts += 1
            print(f"Error fetching data: {e}")
            if failed_attempts > max_retries:
                # Stop instead of retrying the same batch forever
                print(f"Giving up on records starting at {current_start + 1} after {max_retries} retries.")
                break
            time.sleep(5)  # Wait before retrying the same batch

    print(f"Finished fetching {records_written} records. Data saved to {csv_filename}.")

# Fetch the first 1,000 records related to 'cancer' and save incrementally (raise total_records for a larger pull)
fetch_large_pubmed_data(query="cancer", batch_size=1000, total_records=1000, csv_filename="pubmed_large_cancer_data.csv")


Fetching records 1 to 1000
Finished fetching 1000 records. Data saved to pubmed_large_cancer_data.csv.


In [18]:
from Bio import Entrez, Medline
import pandas as pd

# Set your email for PubMed access
Entrez.email = "your_email@example.com"

# Function to fetch and parse PubMed data, and save to CSV
def fetch_pubmed_to_csv(query, retmax=5, csv_filename="pubmed_data.csv"):
    """
    Search PubMed, parse the matching MEDLINE records and save them to a CSV.

    Parameters:
    - query: The search term for PubMed (e.g., "cancer").
    - retmax: Maximum number of articles to retrieve.
    - csv_filename: The filename to save the data.

    Returns nothing. The CSV always has the columns Title, Abstract, Keywords
    and Authors; when nothing matches it contains only the header row.
    """
    columns = ["Title", "Abstract", "Keywords", "Authors"]

    # Search PubMed for articles related to the query
    search_handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()
    pubmed_ids = search_result['IdList']

    # Create a list to store metadata
    article_list = []

    if pubmed_ids:
        # Fetch metadata for the PubMed IDs
        fetch_handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
        try:
            # Extract relevant fields from each record and add to the list
            for article in Medline.parse(fetch_handle):
                keywords = article.get("OT", [])  # OT for Other Terms (keywords)
                article_list.append({
                    "Title": article.get("TI", "No title available"),
                    "Abstract": article.get("AB", "No abstract available"),
                    "Keywords": "; ".join(keywords) if keywords else "No keywords available",
                    "Authors": ", ".join(article.get("AU", []))  # AU for Authors
                })
        finally:
            # Medline.parse is lazy; close only after the records were consumed
            fetch_handle.close()
    else:
        # Skip efetch entirely rather than calling it with an empty id list
        print("No articles found for the given query.")

    # Create a Pandas DataFrame with a fixed column order, even when empty
    df = pd.DataFrame(article_list, columns=columns)

    # Save the DataFrame to a CSV file
    df.to_csv(csv_filename, index=False)

    print(f"Data saved to {csv_filename}")

# Fetch metadata for cancer-related articles and save to CSV
fetch_pubmed_to_csv("cancer", retmax=10, csv_filename="pubmed_cancer_data.csv")


Data saved to pubmed_cancer_data.csv


In [6]:
!pip install scholarly

In [9]:
import requests

# Function to fetch metadata (including keywords) from CrossRef using DOI
def get_crossref_metadata(doi):
    """
    Fetch metadata for one DOI from the CrossRef REST API and print it.

    Parameters:
    - doi: The DOI to look up (e.g., "10.1001/jama.2020.1585").

    Returns nothing. Title, authors, abstract, year, venue and keywords are
    printed; any field missing from the response is replaced by a placeholder.
    A non-200 response prints a failure message instead.
    """
    url = f"https://api.crossref.org/works/{doi}"
    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        message = response.json().get('message', {})

        # List-valued fields may be present but empty, so fall back on falsy
        # values too instead of indexing [0] blindly.
        title = (message.get('title') or ['No title'])[0]
        venue = (message.get('container-title') or ['No venue'])[0]

        # Authors may lack 'given' or 'family'; some entries carry only 'name'
        author_names = []
        for author in message.get('author', []):
            full_name = ' '.join(part for part in (author.get('given'), author.get('family')) if part)
            author_names.append(full_name or author.get('name', 'Unknown author'))
        authors = ', '.join(author_names) if author_names else 'No authors available'

        abstract = message.get('abstract', 'No abstract available')

        # 'date-parts' can be missing, empty, or hold a null year
        date_parts = message.get('issued', {}).get('date-parts') or [[None]]
        first_date = date_parts[0] or [None]
        year = first_date[0] if first_date[0] is not None else 'No year'

        subjects = message.get('subject') or []
        keywords = ', '.join(subjects) if subjects else 'No keywords available'

        # Print extracted metadata
        print(f"Title: {title}")
        print(f"Authors: {authors}")
        print(f"Abstract: {abstract}")
        print(f"Year: {year}")
        print(f"Venue: {venue}")
        print(f"Keywords: {keywords}")
        print('-' * 80)
    else:
        print('Failed to retrieve metadata')

# Example DOI
doi = "10.1001/jama.2020.1585"  # Replace with the actual DOI from Google Scholar metadata
get_crossref_metadata(doi)


Title: Clinical Characteristics of 138 Hospitalized Patients With 2019 Novel Coronavirus–Infected Pneumonia in Wuhan, China
Authors: Dawei Wang, Bo Hu, Chang Hu, Fangfang Zhu, Xing Liu, Jing Zhang, Binbin Wang, Hui Xiang, Zhenshun Cheng, Yong Xiong, Yan Zhao, Yirong Li, Xinghuan Wang, Zhiyong Peng
Abstract: No abstract available
Year: 2020
Venue: JAMA
Keywords: []
--------------------------------------------------------------------------------


In [None]:
# !pip install Bio

In [22]:
from Bio import Entrez

# Set your email to avoid being blocked by NCBI
Entrez.email = "your_email@example.com"

# Function to fetch abstract and keywords from PubMed
def get_pubmed_metadata(title):
    """
    Look up the first PubMed hit for a search term and print its metadata.

    Parameters:
    - title: Search term sent to PubMed (typically an article title).

    Returns nothing. The title, the full abstract (all sections joined) and
    the keywords are printed; missing parts are replaced by a placeholder.
    """
    search_handle = Entrez.esearch(db="pubmed", term=title, retmax=5)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        search_handle.close()

    if not search_record['IdList']:
        print("No article found for the given title.")
        return

    pubmed_id = search_record['IdList'][0]
    # retmode="xml" is the documented way to request PubMed XML from efetch
    fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
    try:
        pubmed_record = Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

    # The result can lack journal articles (e.g. the hit is a book entry)
    articles = pubmed_record.get('PubmedArticle', [])
    if not articles:
        print("No article found for the given title.")
        return

    citation = articles[0]['MedlineCitation']
    article = citation['Article']
    article_title = article.get('ArticleTitle', 'No title available')

    # 'Abstract' is optional, and a structured abstract has several sections;
    # join them all instead of keeping only the first one.
    abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
    abstract = ' '.join(str(part) for part in abstract_parts) or 'No abstract available'

    # 'KeywordList' may be absent or empty; flatten every list that is present
    keywords = [str(keyword) for keyword_list in citation.get('KeywordList', []) for keyword in keyword_list]
    keyword_text = ', '.join(keywords) if keywords else 'No keywords available'

    # Print extracted metadata
    print(f"Title: {article_title}")
    print(f"Abstract: {abstract}")
    print(f"Keywords: {keyword_text}")

# Example title
title = "Cancer "
get_pubmed_metadata(title)


Title: Anabolic deficits and divergent unfolded protein response underlie skeletal and cardiac muscle growth impairments in the Yoshida hepatoma tumor model of cancer cachexia.
Abstract: Cancer cachexia manifests as whole body wasting, however, the precise mechanisms governing the alterations in skeletal muscle and cardiac anabolism have yet to be fully elucidated. In this study, we explored changes in anabolic processes in both skeletal and cardiac muscles in the Yoshida AH-130 ascites hepatoma model of cancer cachexia. AH-130 tumor-bearing rats experienced significant losses in body weight, skeletal muscle, and heart mass. Skeletal and cardiac muscle loss was associated with decreased ribosomal (r)RNA, and hypophosphorylation of the eukaryotic factor 4E binding protein 1. Endoplasmic reticulum stress was evident by higher activating transcription factor mRNA in skeletal muscle and growth arrest and DNA damage-inducible protein (GADD)34 mRNA in both skeletal and cardiac muscles. Tumor

In [None]:
!pip install serpapi

In [4]:
# import serpapi
from serpapi import GoogleSearch

def fetch_google_scholar_data(query="cancer", api_key=None):
    """
    Query Google Scholar through SerpAPI and print metadata for each result.

    Parameters:
    - query: Search term sent to the Google Scholar engine.
    - api_key: SerpAPI key. When omitted, the SERPAPI_API_KEY environment
      variable is used so the key never has to be stored in the notebook.

    Raises:
    - ValueError: if no API key is supplied and SERPAPI_API_KEY is not set.

    Returns nothing; title, link, snippet, inline snippet and keywords of each
    organic result are printed, with placeholders for missing fields.
    """
    import os  # local import: this cell does not import os at the top

    api_key = api_key or os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise ValueError(
            "SerpAPI key missing: pass api_key=... or set the SERPAPI_API_KEY environment variable."
        )

    params = {
        "engine": "google_scholar",  # Specify the Google Scholar engine
        "q": query,                  # Query term
        "api_key": api_key
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # Report an error payload instead of silently printing nothing
    if "error" in results:
        print(f"SerpAPI error: {results['error']}")
        return

    # Loop through the results to extract metadata
    for result in results.get("organic_results", []):
        print(f"Title: {result.get('title', 'No title available')}")
        print(f"Link: {result.get('link', 'No link available')}")
        print(f"Snippet: {result.get('snippet', 'No snippet available')}")
        print(f"Full Abstract: {result.get('inline_snippet', 'No abstract available')}")
        keywords = result.get('keywords', 'No keywords available')
        print(f"Keywords: {keywords}")
        print("-" * 80)

fetch_google_scholar_data()


In [None]:
# pip install google-search-results


In [None]:

import time
from scholarly import scholarly

search_query = scholarly.search_pubs('cancer')

# zip() stops at whichever iterable runs out first, so a query with fewer than
# 10 hits ends the loop cleanly instead of raising StopIteration from next().
for i, paper in zip(range(10), search_query):  # Limiting to 10 results
    print(paper['bib'].get('title', 'No title available'))
    time.sleep(15)  # Delay to avoid rate-limiting

# from scholarly import scholarly

# # Search for the term "cancer"
# search_query = scholarly.search_pubs('cancer')

# # Iterate through the search results and extract metadata
# for i in range(5):  # Limiting to the first 5 results for simplicity
#     paper = next(search_query)
#     print(f"Title: {paper['bib']['title']}")
#     print(f"Authors: {', '.join(paper['bib']['author'])}")
#     print(f"Abstract: {paper['bib'].get('abstract', 'No abstract available')}")
#     print(f"Year: {paper['bib'].get('pub_year', 'No year available')}")
#     print(f"Venue: {paper['bib'].get('venue', 'No venue available')}")
#     print(f"Number of citations: {paper.get('num_citations', 'No citation count available')}")

#     # Checking for keywords
#     keywords = paper['bib'].get('keywords', 'No keywords available')
#     print(f"Keywords: {keywords}")

#     print(f"Google Scholar URL: {paper['pub_url']}")
#     print('-' * 80)


In [None]:
# Optional: Save the data to a CSV file
import csv

# Create or open a CSV file to store the results
with open('cancer_research_metadata.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Authors', 'Abstract', 'Year', 'Venue', 'Citations', 'URL'])

    search_query = scholarly.search_pubs('cancer')
    # zip() ends the loop cleanly when fewer than 5 results are available
    # instead of raising StopIteration from next().
    for i, paper in zip(range(5), search_query):  # Limiting to the first 5 results
        bib = paper.get('bib', {})
        title = bib.get('title', 'No title available')
        # NOTE(review): 'author' is treated as either a list of names or an
        # already-joined string - confirm against the installed scholarly version.
        raw_authors = bib.get('author', [])
        authors = raw_authors if isinstance(raw_authors, str) else ', '.join(raw_authors)
        abstract = bib.get('abstract', 'No abstract available')
        year = bib.get('pub_year', 'No year available')
        venue = bib.get('venue', 'No venue available')
        citations = paper.get('num_citations', 'No citation count available')
        # Use a placeholder when a result carries no 'pub_url'
        url = paper.get('pub_url', 'No URL available')
        writer.writerow([title, authors, abstract, year, venue, citations, url])

print("Metadata extraction complete, and saved to cancer_research_metadata.csv")
