In [1]:
import sys
import os
import csv
from datetime import datetime
from Bio import Entrez

In [2]:
# Contact address that Biopython's Entrez module attaches to its NCBI requests.
# NOTE(review): hard-coded personal address — consider reading it from configuration.
Entrez.email = "omer@sapir.com"

def search_protein_ncbi(term, number):
    """Search the NCBI ``protein`` database and return the matching accessions.

    Args:
        term: Query string handed to ``Entrez.esearch``.
        number: Upper bound on returned IDs (passed as ``retmax``).

    Returns:
        A 3-tuple ``(count, id_list, n_ids)``:
        ``count`` is the ``"Count"`` field of the parsed result, returned
        exactly as ``Entrez.read`` provides it (not converted to ``int``);
        ``id_list`` is the ``"IdList"`` field (accession IDs, because the
        search uses ``idtype="acc"``); ``n_ids`` is ``len(id_list)``.
    """
    handle = Entrez.esearch(db='protein', term=term, idtype="acc", retmax=number)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    id_list = record["IdList"]
    return record["Count"], id_list, len(id_list)

In [3]:
# Quick check of the search helper: request at most three accession IDs.
search_protein_ncbi('arabidopsis thaliana', 3)

('3404939', ['WP_347813942.1', 'WP_347813756.1', 'WP_347813549.1'], 3)

In [60]:
def download_protein_ncbi(term, number, folder='Data', log='Search_log.csv'):
    """Download GenBank protein records for ``term`` and log the search.

    Runs ``search_protein_ncbi(term, number)``, fetches every returned ID in
    GenBank text format, stores each record as ``<folder>/<term>_<id>.gb`` and
    appends one row describing the search to the CSV file ``log``.

    Args:
        term: Query string for the NCBI protein database; also used as the
            prefix of every output file name.
        number: Maximum number of records to download.
        folder: Directory that receives the ``.gb`` files; created if missing.
        log: Path of the CSV search log. A header row is written when the
            file does not exist yet or is empty.

    Returns:
        List of paths of the files written, in download order.
    """
    total_count, id_list, fetched_count = search_protein_ncbi(term, number)

    os.makedirs(folder, exist_ok=True)  # Ensure the directory exists

    filenames = []
    for doc_id in id_list:
        handle = Entrez.efetch(db="protein", id=doc_id, rettype="gb", retmode="text")
        try:
            data = handle.read()
        finally:
            # Close the handle even if reading the response fails.
            handle.close()

        filename = os.path.join(folder, f"{term}_{doc_id}.gb")
        with open(filename, "w") as file:
            file.write(data)
        filenames.append(filename)

    # Decide about the header before opening: append mode creates the file,
    # so the check would always succeed afterwards.
    needs_header = not os.path.isfile(log) or os.path.getsize(log) == 0
    with open(log, "a", newline="") as csvfile:
        fieldnames = ["date", "term", "max", "total"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow({
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "term": term,
            "max": fetched_count,
            "total": total_count
        })

    return filenames

In [63]:
# Download up to six records; the last two arguments name the data folder and the search log.
# NOTE(review): this call only succeeds if download_protein_ncbi accepts four positional arguments.
download_protein_ncbi('arabidopsis thaliana', 6, 'data_file', 'search_log')

In [28]:
def main():
    """Command-line entry point: ``TERM NUMBER DATA_FOLDER SEARCH_LOG``.

    Reads the four positional arguments from ``sys.argv`` and forwards them to
    ``download_protein_ncbi``.

    Raises:
        SystemExit: with a usage message when the argument count is wrong or
            NUMBER is not an integer.
    """
    usage = f"Usage: {sys.argv[0]} TERM NUMBER DATA_FOLDER SEARCH_LOG"
    if len(sys.argv) != 5:
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to be defined.
        sys.exit(usage)

    term, number_arg, folder, log = sys.argv[1:]
    try:
        count = int(number_arg)
    except ValueError:
        sys.exit(f"NUMBER must be an integer, got {number_arg!r}\n{usage}")

    download_protein_ncbi(term, count, folder, log)

In [None]:
# Run main() only when this code is executed directly, not when it is imported.
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so running this
# cell calls main() with the kernel's own argv — presumably meant for the exported script only.
if __name__ == "__main__":
    main()