<a href="https://colab.research.google.com/github/ThaSchizNit/nsf-coa-coauthor-generator/blob/main/NSF_COA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
from Bio import Entrez
from datetime import datetime, timedelta
import time
from xml.etree import ElementTree as ET

def _element_text(elem):
    """Return the stripped text of an XML element, or "" if it is missing or empty."""
    if elem is None or elem.text is None:
        return ""
    return elem.text.strip()


def _extract_pub_year(root, default_year):
    """Return the publication year from the first usable <PubDate>, else default_year."""
    for pub_date in root.iter('PubDate'):
        year_elem = pub_date.find('Year')
        if year_elem is not None:
            try:
                return int(year_elem.text)
            except (TypeError, ValueError):
                # Empty or non-numeric <Year>: fall back to the default.
                return default_year

        # Some records carry only a free-text <MedlineDate> such as
        # "2023 Nov-Dec"; use its leading four-digit year when present.
        year_text = _element_text(pub_date.find('MedlineDate'))[:4]
        if len(year_text) == 4 and year_text.isdecimal():
            return int(year_text)
    return default_year


def _is_queried_author(author_name, last, fore, initials):
    """Return True if this author record appears to be the person searched for."""
    target = author_name.lower().replace(" ", "")

    # Full-name form, e.g. author_name="Schisler Jonathan".
    if target in f"{last}{fore}".lower().replace(" ", ""):
        return True

    # PubMed-style form, e.g. author_name="Schisler JC". The comparison is a
    # prefix match so that "Schisler J" also recognises "Schisler JC".
    if initials:
        return f"{last}{initials}".lower().replace(" ", "").startswith(target)
    return False


def get_pubmed_coauthors(author_name, email, months=48, max_pubs=500):
    """
    Fetch co-authors from PubMed publications in the last 'months' months.

    Args:
        author_name: Author to search for, e.g. "Schisler JC" (LastName Initials)
            or "Schisler Jonathan". Records matching this name are excluded
            from the result.
        email: Contact address assigned to Entrez.email (required by NCBI).
        months: Size of the look-back window, approximated as 30.5 days/month.
        max_pubs: Maximum number of PMIDs requested from the search.

    Returns:
        A list of pipe-delimited strings, sorted by (last name, fore name):
            A:|LastName|ForeName|Affiliation||Year
        Affiliation is "Unknown" when none is available; Year is the most
        recent publication year shared with that co-author. Returns an empty
        list when the search finds no publications.
    """
    Entrez.email = email  # Required by NCBI

    # Date range: last 'months' months
    end_date = datetime.now()
    start_date = end_date - timedelta(days=months * 30.5)  # Approximate
    date_filter = f'("{start_date.strftime("%Y/%m/%d")}":"{end_date.strftime("%Y/%m/%d")}"[Date - Publication])'

    # Search for publications
    query = f'({author_name}[Author]) AND {date_filter}'
    print(f"Searching PubMed for: {query}")

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_pubs)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()  # Release the connection even if parsing fails

    pmids = record['IdList']
    print(f"Found {len(pmids)} publications.")

    if not pmids:
        return []

    coauthors = {}  # (last, fore) -> (affiliation, latest_year)

    for i, pmid in enumerate(pmids, 1):
        print(f"Processing publication {i}/{len(pmids)} (PMID: {pmid})...")

        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            xml = handle.read()
        finally:
            handle.close()

        root = ET.fromstring(xml)

        # Falls back to the current year when the record has no usable date.
        pub_year = _extract_pub_year(root, end_date.year)

        for author in root.iter('Author'):
            last = _element_text(author.find('LastName'))
            fore = _element_text(author.find('ForeName'))

            # Skip collective names and records with missing/empty name parts.
            if not last or not fore:
                continue

            initials = _element_text(author.find('Initials'))

            # '|' is the output delimiter, so it must not appear inside a field.
            affiliation = _element_text(author.find('.//Affiliation')).replace("|", "/") or "Unknown"

            # Skip the author the search was run for.
            if _is_queried_author(author_name, last, fore, initials):
                continue

            key = (last, fore)
            if key in coauthors:
                existing_affil, existing_year = coauthors[key]
                new_affil = existing_affil
                # Take the new affiliation only if it is known and either fills
                # a gap or comes from a publication at least as recent, so an
                # older paper never overwrites a newer affiliation.
                if affiliation != "Unknown" and (existing_affil == "Unknown" or pub_year >= existing_year):
                    new_affil = affiliation
                coauthors[key] = (new_affil, max(existing_year, pub_year))
            else:
                coauthors[key] = (affiliation, pub_year)

        time.sleep(0.34)  # Respect NCBI rate limit (~3 requests/sec)

    # Format output using | as delimiter to preserve commas in affiliations
    # A:|LastName|ForeName|Full Affiliation Text||Year
    output = []
    for (last, fore), (affil, year) in sorted(coauthors.items(), key=lambda x: x[0]):
        output.append(f"A:|{last}|{fore}|{affil}||{year}")

    return output

In [None]:
# --- Settings: change these two values before running ---
author_name = "Schisler JC"  # Use PubMed format: LastName Initials (try variations if needed)
your_email = "your.actual.email@unc.edu"  # REPLACE - required by NCBI!

# Query PubMed; the result is kept in `coa_lines` for the export cell below.
coa_lines = get_pubmed_coauthors(author_name, your_email)

# Report the outcome, handling the empty case first.
if not coa_lines:
    print("No co-authors found. Try a different author name format.")
else:
    print("\n=== Copy-paste these into your COA Excel (Table 4) ===\n")
    print(*coa_lines, sep="\n")
    total_found = len(coa_lines)
    print(f"\nTotal unique co-authors found: {total_found}")
    print("\nNote: Manually review affiliations and years. Add any missing pubs/co-authors.")

Searching PubMed for: (Schisler JC[Author]) AND ("2022/01/05":"2026/01/08"[Date - Publication])
Found 23 publications.
Processing publication 1/23 (PMID: 40964318)...
Processing publication 2/23 (PMID: 40027613)...
Processing publication 3/23 (PMID: 39806097)...
Processing publication 4/23 (PMID: 39602262)...
Processing publication 5/23 (PMID: 39433125)...
Processing publication 6/23 (PMID: 38862781)...
Processing publication 7/23 (PMID: 38862573)...
Processing publication 8/23 (PMID: 38862484)...
Processing publication 9/23 (PMID: 38651896)...
Processing publication 10/23 (PMID: 38378768)...
Processing publication 11/23 (PMID: 38362342)...
Processing publication 12/23 (PMID: 38331556)...
Processing publication 13/23 (PMID: 38306481)...
Processing publication 14/23 (PMID: 38187602)...
Processing publication 15/23 (PMID: 40741039)...
Processing publication 16/23 (PMID: 37938797)...
Processing publication 17/23 (PMID: 37556555)...
Processing publication 18/23 (PMID: 36682829)...
Processi

In [None]:
import pandas as pd
from google.colab import files

# Turn the pipe-delimited lines produced by the search cell into a table,
# save it as a TSV, trigger a Colab download, and show a preview.
if 'coa_lines' in globals() and coa_lines:
    data = []
    for line in coa_lines:
        line = line.strip()
        if not line.startswith('A:|'):
            continue

        # Expected layout: A:|LastName|ForeName|Affiliation||Year
        parts = line.split('|')

        if len(parts) < 4:
            continue  # Malformed line

        last_name = parts[1].strip()
        fore_name = parts[2].strip()

        # The last two fields are the empty column and the year. Everything
        # between ForeName and them is the affiliation, so re-join it in case
        # the affiliation text itself contained a '|'.
        if len(parts) >= 6:
            affiliation = '|'.join(parts[3:-2]).strip()
        else:
            affiliation = parts[3].strip()

        if not affiliation or affiliation.lower() == "unknown":
            affiliation = "Unknown"

        # Skip rows with no name at all; the formatted name always contains
        # ", " so it cannot be used for this check.
        if not (last_name or fore_name):
            continue

        full_name = f"{last_name}, {fore_name}".strip()
        data.append(['A:', full_name, affiliation])

    if data:
        df = pd.DataFrame(data, columns=["Marker", "Name", "Organizational Affiliation"])

        # Sort, deduplicate, then renumber so the preview index has no gaps
        df = (
            df.sort_values('Name')
            .drop_duplicates(subset=['Name', 'Organizational Affiliation'])
            .reset_index(drop=True)
        )

        # Save as TSV (tab-separated, safe for Excel)
        df.to_csv("coa_coauthors.tsv", sep='\t', index=False)
        print("SUCCESS! Full affiliations preserved with all commas.")
        print("Download coa_coauthors.tsv and paste columns A:C into your NSF COA Table 4.")

        files.download('coa_coauthors.tsv')

        # Preview
        display(df.head(60))
        print(f"\nTotal unique co-authors: {len(df)}")
    else:
        print("No co-authors found.")
else:
    print("Run Cell 3 (the search) first after updating Cell 2.")

SUCCESS! Full affiliations preserved with all commas.
Download coa_coauthors.tsv and paste columns A:C into your NSF COA Table 4.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Marker,Name,Organizational Affiliation
0,A:,"A C Almeida, Eduardo","Space Biosciences Division, NASA Ames Research..."
1,A:,"Adkins, Joshua N","Biological Science Division, Pacific Northwest..."
2,A:,"Afolayan, Adeleye J","Department of Pediatrics, Children's Research ..."
3,A:,"Afsari, Fatemeh",Department of Medicine-Nephrology & Intelligen...
4,A:,"Agan, Brian","Infectious Disease Clinical Research Program, ..."
5,A:,"Al-Jaber, Maneera Yousef","Anti-Doping Laboratory Qatar, Doha, Qatar."
6,A:,"Al-Maadheed, Mohammed","Anti-Doping Laboratory Qatar, Doha, Qatar."
7,A:,"Albrecht, Lars A",University of North Carolina McAllister Heart ...
8,A:,"Albrecht, Yentli Soto","The Children's Hospital of Philadelphia, Phila..."
9,A:,"Allen, Noah","Department of Biomedical Engineering, Renssela..."



Total unique co-authors: 414
