In [None]:
import fnmatch
import json
import logging
import os
from time import sleep

import arxiv
import pandas as pd
import requests
from Bio import Entrez
from tqdm import tqdm

# Root logger setup: show INFO and above, each record prefixed with a timestamp
# and its severity, so the progress of the long-running fetches is visible.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Fetching logic

### Fetch from ncbi

In [None]:
# Configure Entrez for NCBI API.
# The contact e-mail is sent along with every Entrez request. Set the
# ENTREZ_EMAIL environment variable to use your own address without editing the
# notebook; the previously hard-coded value is kept as the fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "anton@gmail.com")

# Function to fetch NCBI (PubMed) papers
def fetch_ncbi_papers(query):
    """Search PubMed for ``query`` and return the matching articles.

    The search is limited to the 2023-2024 date window. All ids returned by
    the search are then fetched as XML in a single ``efetch`` call.

    Args:
        query: Search expression passed to ``Entrez.esearch`` as ``term``.

    Returns:
        list[dict]: one dict per article with the keys ``title``,
        ``abstract`` (``""`` when the record has none), ``url``
        (``https://doi.org/<doi>`` or ``"No DOI"``), ``published``
        (``"<Year>-<Month>-<Day>"`` or ``""`` when no article date is present)
        and ``source`` (always ``"PubMed"``). An empty list is returned when
        the search matches nothing.
    """
    logging.info('ncbi searching...')
    # NOTE(review): NCBI documents an upper bound on how many ids one ESearch
    # call returns, so retmax=1000000 may be capped server-side — confirm and
    # page with `retstart` if complete coverage is required.
    handle = Entrez.esearch(db="pubmed",
                            term=query,
                            retmax=1000000,
                            mindate="2023",
                            maxdate="2024")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails.
        handle.close()
    id_list = record["IdList"]

    # Nothing matched: skip efetch, which would otherwise be sent an empty id.
    if not id_list:
        logging.info('ncbi search returned no ids')
        return []

    handle = Entrez.efetch(db="pubmed", id=",".join(id_list), retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    results = []
    for article in tqdm(records['PubmedArticle']):
        citation = article['MedlineCitation']['Article']
        title = citation['ArticleTitle']

        # The abstract is optional; its parts are joined with spaces.
        abstract = ""
        try:
            abstract = " ".join(citation['Abstract']['AbstractText'])
        except KeyError:
            pass

        # Extract publication date (first ArticleDate entry, when there is one)
        published = ""
        try:
            pub_date = citation['ArticleDate'][0]
            published = f"{pub_date['Year']}-{pub_date['Month']}-{pub_date['Day']}" if pub_date else "Unknown"
        except (IndexError, KeyError):
            pass

        # Extract DOI (to construct the URL)
        doi = None
        for id_tag in article['PubmedData']['ArticleIdList']:
            if id_tag.attributes.get('IdType') == 'doi':
                doi = str(id_tag)
                break
        url = f"https://doi.org/{doi}" if doi else "No DOI"

        results.append({
            'title': title,
            'abstract': abstract,
            'url': url,
            'published': published,
            'source': 'PubMed'
        })
    return results

### Fetch from medrxiv and biorxiv

In [None]:
# Function to fetch papers from bioRxiv or medRxiv
def fetch_papers_rxiv(server,
                      start_date,
                      end_date,
                      query_terms_list,
                      timeout=60):
    """Page through the bioRxiv/medRxiv ``details`` API and keep matching papers.

    Every paper posted between ``start_date`` and ``end_date`` is downloaded
    page by page and filtered locally: a paper is kept only when *each* group
    in ``query_terms_list`` has at least one term that occurs, as a
    case-insensitive substring, in the paper's title or abstract.

    Args:
        server: Server segment of the API URL (``'biorxiv'`` or ``'medrxiv'``).
        start_date: Start of the date interval, inserted verbatim in the URL.
        end_date: End of the date interval, inserted verbatim in the URL.
        query_terms_list: Iterable of term groups (each an iterable of str).
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        list[dict]: matching papers with the keys ``title``, ``abstract``,
        ``url``, ``published``, ``authors`` and ``source``. When a request
        returns a non-200 status, a warning is logged and the papers collected
        so far are returned.
    """
    # The cursor advances by this step; a shorter page means it was the last one.
    page_size = 100

    # Lower-case the search terms once instead of once per paper.
    term_groups = [[term.lower() for term in group] for group in query_terms_list]

    # Set headers to mimic a legitimate browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    results = []
    cursor = 0
    total_papers = None
    new_papers_count = None
    while True:
        # Construct the API URL with the correct format, iterating over pages using the cursor
        api_url = f"https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}/json"

        # Make the API request
        response = requests.get(api_url, headers=headers, timeout=timeout)
        # total/new still hold the values reported by the previous page here.
        logging.info(f"server: {server}; cursor = {cursor}, total = {total_papers}, new = {new_papers_count}, added = {len(results)}")
        if response.status_code != 200:
            logging.warning(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()
        # `messages` can be missing or an empty list; fall back to an empty dict.
        messages = (data.get('messages') or [{}])[0]

        # Extract pagination metadata
        total_papers = messages.get('total', None)  # Total papers in the database
        new_papers_count = messages.get('count_new_papers', None)  # New papers in the range

        papers = data.get('collection', [])

        # Filter the papers based on the provided query terms
        for paper in papers:
            title = paper['title']
            abstract = paper['abstract']
            title_lower = title.lower()
            abstract_lower = abstract.lower()
            matches_every_group = all(
                any(term in title_lower or term in abstract_lower for term in group)
                for group in term_groups
            )
            if matches_every_group:
                results.append({
                    'title': title,
                    'abstract': abstract,
                    'url': f"https://doi.org/{paper['doi']}",
                    'published': paper['date'],
                    'authors': paper['authors'],
                    'source': server
                })

        # Pagination: a short page means there is nothing left to fetch
        if len(papers) < page_size:
            break
        cursor += page_size

    return results

# Function to loop through bioRxiv and medRxiv
def fetch_rxiv_both_servers(start_date, end_date, query_terms_list):
    """Run the same filtered fetch against bioRxiv, then medRxiv.

    Returns a single list holding the bioRxiv matches followed by the
    medRxiv matches, as produced by ``fetch_papers_rxiv``.
    """
    combined = []
    for server_name in ('biorxiv', 'medrxiv'):
        logging.info(f"Fetching papers from {server_name}...")
        combined += fetch_papers_rxiv(server_name, start_date, end_date, query_terms_list)
    return combined

### Fetch from arxiv

In [None]:
# Function to fetch arXiv papers
def fetch_arxiv_papers(query, years=(2023, 2024), max_results=10000):
    """Search arXiv for ``query`` and keep the results published in ``years``.

    Results are requested newest-first (sorted by submission date, descending).
    After every 100 results iterated, the loop sleeps for 20 seconds before
    continuing.

    Args:
        query: arXiv search expression.
        years: Publication years to keep; defaults to 2023 and 2024.
        max_results: Upper bound passed to ``arxiv.Search``.
            NOTE(review): the notebook text mentions a 1000-results-per-query
            limit on the arXiv side — confirm how it interacts with this value.

    Returns:
        list[dict]: one dict per kept result with the keys ``title``,
        ``abstract``, ``url``, ``published`` and ``source`` (``'arXiv'``).
    """
    logging.info('Arxiv searching...')
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
    wanted_years = set(years)
    results = []
    # NOTE(review): newer releases of the `arxiv` package are reported to
    # deprecate Search.results() in favour of arxiv.Client().results(search) —
    # confirm against the installed version.
    for seen, result in enumerate(tqdm(search.results()), start=1):
        if result.published.year in wanted_years:
            results.append({
                'title': result.title,
                'abstract': result.summary,
                'url': result.entry_id,
                'published': result.published,
                'source': 'arXiv'
            })
        # Pause after every 100 results before pulling more from the API.
        if seen % 100 == 0:
            sleep(20)
    return results


# Main loop to fetch data for each term in the first list
def fetch_and_merge_articles(query_terms_1, query_terms_2, dir_path='./data'):
    """Query arXiv once per term of ``query_terms_1`` and accumulate the results.

    For every ``term_1`` a query ``(term_1) AND (any of query_terms_2)`` is
    built, run through ``fetch_arxiv_papers`` and appended to the running
    result list. After each term the results accumulated so far are written to
    ``<dir_path>/0_arxiv_<term_1>.csv`` and the loop sleeps 30 seconds.

    Args:
        query_terms_1: Terms that each get their own arXiv query.
        query_terms_2: Terms OR-ed together in every query.
        dir_path: Directory for the per-term checkpoint files; created when
            missing.

    Returns:
        list[dict]: results of all queries concatenated (duplicates possible).

    NOTE(review): ``save_collected`` is not defined in the visible part of the
    notebook — it must be available before this function is called.
    """
    # Make sure the checkpoint directory exists before the first save.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    all_results = []

    for term_1 in query_terms_1:
        # create_search_query takes ONE argument: a list of term groups.
        # The single term is wrapped in its own group so it is not split
        # into characters.
        query = create_search_query([[term_1], query_terms_2])
        logging.info(f"Searching for: {query}")

        # Fetch articles for this query
        current_results = fetch_arxiv_papers(query)

        # Append the results
        all_results.extend(current_results)

        # Save progress after each term (cumulative snapshot)
        save_collected(all_results, os.path.join(dir_path, f'0_arxiv_{term_1}.csv'))
        sleep(30)

    return all_results

def remove_duplicates(articles):
    """Return ``articles`` as a DataFrame without repeated papers.

    Two rows count as the same paper when their title, abstract and url all
    match; the first occurrence is the one that is kept.
    """
    identity_columns = ['title', 'abstract', 'url']
    return pd.DataFrame(articles).drop_duplicates(subset=identity_columns)

# Load data using functions

In [None]:
def create_search_query(query_terms):
    """Build a boolean search string from groups of alternative terms.

    Terms inside one group are joined with ``OR``; the parenthesised groups
    are joined with ``AND``. Terms containing a space are wrapped in double
    quotes so they are searched as phrases.

    Args:
        query_terms: Sequence of term groups, e.g.
            ``[["genomic", "complex trait"], ["LLM"]]``. Any number of groups
            is accepted (every group is used, not only the first two).

    Returns:
        str: e.g. ``(genomic OR "complex trait") AND (LLM)``; an empty string
        when no groups are given.
    """
    # Helper function to format the query terms, adding quotes around multi-word terms
    def format_terms(terms):
        return " OR ".join([f"\"{term}\"" if " " in term else term for term in terms])

    # Parenthesise each OR-group and combine all of them with AND
    return " AND ".join(f"({format_terms(group)})" for group in query_terms)

### Prepare

In [None]:
# Date window handed to the bioRxiv/medRxiv fetch
start_date = "2023-01-01"
end_date = "2024-12-31"

# Two groups of alternative terms: a paper has to hit at least one term from
# EACH group. The first group is the genetics/medicine side, the second the
# language-model side.
query_terms_list = [
    [
        "genomic", "genetic", "inherited", "hereditary", "heredity",
        "inheritance", "heritability",
        # NOTE(review): "NGS" usually expands to "next generation sequencing";
        # confirm that "next genome sequencing" is the intended search term.
        "NGS", "next genome sequencing",
        "phenotype description", "variant interpretation", "complex trait",
        "medicine", "medical", "clinical decision",
        "diagnosis", "diagnostic", "clinical", "syndrome",
    ],
    [
        "LLM", "large language model",
        "NLP", "natural language processing",
        "GPT", "chatGPT", "transformer",
        "BERT", "Bidirectional Encoder Representation",
        "RAG", "retrieval-augmented generation", "retrieval augmented generation",
        "generative AI", "AI assistant", "prompt", "chatbot", "prompt engineering",
        "attention mechanism", "chain-of-thought", "chain of thought",
    ],
]

# Boolean query string built from both groups (used for the PubMed search)
query = create_search_query(query_terms_list)

### Load from ncbi

In [None]:
# now actually search
# Runs the PubMed search with the boolean `query` string and stores the
# resulting list of article dicts in `ncbi_papers`.
logging.info("JUST STARTED")

ncbi_papers = fetch_ncbi_papers(query)
logging.info(f"FINISHED ncbi, len: {len(ncbi_papers)}")
# NOTE(review): `save_collected` is not defined in the visible part of this
# notebook — confirm it is defined or imported before this cell runs.
save_collected(ncbi_papers, 'data/0_ncbi.csv')

### Load from biorxiv and medrxiv

In [None]:
# now actually search
# Downloads every bioRxiv and medRxiv paper in the start_date..end_date window
# and keeps those matching both term groups of `query_terms_list`.
logging.info("JUST STARTED")

bio_medrxiv_papers = fetch_rxiv_both_servers(start_date, end_date, query_terms_list)
logging.info(f"FINISHED biomedrxiv, len: {len(bio_medrxiv_papers)}")
# NOTE(review): `save_collected` is not defined in the visible part of this
# notebook — confirm it is defined or imported before this cell runs.
save_collected(bio_medrxiv_papers, 'data/0_bio_med.csv')

### Load from arxiv

Since arXiv limits the number of articles returned per query to `1000`, a broad search sometimes has to be split into several more specific queries (as implemented here):

In [None]:
# One arXiv query is issued per term of the first group, each combined with
# the whole second group; the results of all queries are concatenated, so
# `all_arxiv_papers` can contain the same paper more than once.
logging.info("Starting article collection...")
all_arxiv_papers = fetch_and_merge_articles(query_terms_list[0], query_terms_list[1], dir_path='./data')
logging.info(f"Finished fetching papers, total found: {len(all_arxiv_papers)}")

The per-term subsamples were saved under the file mask `0_arxiv_*.csv`; we can remove the duplicates and save everything together like this:

In [None]:
# Remove duplicates
# `remove_duplicates` returns a DataFrame with one row per distinct
# (title, abstract, url) combination.
unique_papers = remove_duplicates(all_arxiv_papers)
logging.info(f"Total unique papers: {len(unique_papers)}")

# Save the final result
# The DataFrame is converted back to a list of dicts before being handed to
# `save_collected`.
# NOTE(review): `save_collected` is not defined in the visible part of this
# notebook — confirm it is defined or imported before this cell runs.
save_collected(unique_papers.to_dict('records'), 'data/0_arxiv_unique.csv')

Alternatively (if the fetching was done in several runs), we can merge all the saved files together afterwards:

In [None]:
def process_arxiv_files(directory_path='./', pattern='0_arxiv_*.csv', output_file='0_arxiv_unique.csv'):
    """Merge the saved arXiv CSV files into one de-duplicated CSV.

    Every file in ``directory_path`` whose name matches ``pattern`` is read
    (its first column is treated as the index and discarded), the rows are
    concatenated, exact duplicate rows are dropped and the result is written
    to ``output_file`` inside ``directory_path``.

    Args:
        directory_path: Directory that holds the input files and receives the
            output file.
        pattern: Shell-style wildcard (case-sensitive) selecting the input
            file names.
        output_file: Name of the merged CSV. A file at that location is never
            used as input, so re-running the function does not re-ingest its
            own earlier output.

    Returns:
        tuple: ``(initial_row_count, final_row_count, output_file)``, or the
        string ``"No files found matching the pattern."`` when nothing matches.
    """
    output_path = os.path.normpath(os.path.join(directory_path, output_file))

    # Get a sorted list of all files that match the pattern, leaving out the
    # output file itself (the default output name matches the default pattern).
    file_list = sorted(
        file for file in os.listdir(directory_path)
        if fnmatch.fnmatchcase(file, pattern)
        and os.path.normpath(os.path.join(directory_path, file)) != output_path
    )

    # If no files found, return a message
    if not file_list:
        return "No files found matching the pattern."

    # Read all the CSV files and set the first column as the index
    dfs = [pd.read_csv(os.path.join(directory_path, file), index_col=0) for file in file_list]

    # Concatenate all the DataFrames into one
    combined_df = pd.concat(dfs, ignore_index=True)

    # Get initial number of rows
    initial_row_count = len(combined_df)
    print(f"Initial number of rows: {initial_row_count}")

    # Drop exact duplicate rows (rows with NaN values are kept)
    unique_df = combined_df.drop_duplicates()

    # Get final number of rows after cleaning
    final_row_count = len(unique_df)
    print(f"Final number of rows after cleaning: {final_row_count}")

    # Save the final DataFrame to a CSV file
    unique_df.to_csv(output_path, index=False)

    return initial_row_count, final_row_count, output_file

# Merge the CSV files saved under ./data into 0_arxiv_unique.csv. As the last
# expression of the cell, the function's return value is shown as the cell output.
process_arxiv_files(directory_path='./data', pattern='0_arxiv_*.csv', output_file='0_arxiv_unique.csv')