In [1]:
from Bio import Entrez
import pandas as pd
from unipressed import IdMappingClient
import time
from tqdm import tqdm
tqdm.pandas()
import os
from pathlib import Path

In [2]:
# Contact e-mail that Biopython passes along with the Entrez requests.
Entrez.email = "john.doe@example.com"
# Number of gene names sent per Entrez query (see the chunking of the gene list below).
# Keep it at or below 10000: Entrez.esearch's retmax parameter cannot exceed 10000, see
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
CHUNKSIZE = 2000

In [3]:
# Organism used to restrict the Entrez gene search. Written in lowercase with
# underscores; the lookup function replaces underscores with spaces and
# capitalises the result before using it in the [Organism] filter.
species = 'arabidopsis_thaliana'

In [4]:
# Absolute path of the "output" directory located in the current working
# directory; the CSV files parsed further down are read from this folder.
input_folder = Path.cwd() / 'output'

In [5]:
def chunk_list(lst: list, chunksize: int):
    """Split ``lst`` into consecutive slices of at most ``chunksize`` items.

    Args:
        lst: The list to split; it is not modified.
        chunksize: Maximum number of items per chunk. Must be at least 1.

    Returns:
        A list of lists covering ``lst`` in order; only the last chunk may be
        shorter than ``chunksize``. An empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so non-positive sizes are rejected explicitly.
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")
    return [lst[start:start + chunksize] for start in range(0, len(lst), chunksize)]

In [9]:
def get_ncbi_gene_id_mapping(gene_names: list[str]) -> dict:
    """Map gene names to NCBI Gene IDs for the organism named by the global ``species``.

    One Entrez esearch query collects the Gene IDs matching any of the names,
    one efetch call downloads the corresponding records, and each record is
    then matched back to the input name found among the strings of its
    ``Gene-ref`` section.

    Args:
        gene_names: Gene names to look up. The whole list is sent as a single
            query and its length is used as ``retmax``.

    Returns:
        A dict mapping each matched gene name to the ``Gene-track_geneid``
        value of its record. An empty ``gene_names`` yields an empty dict
        without contacting NCBI.

    Raises:
        RuntimeError: If fewer records than gene names were retrieved.
        ValueError: If a fetched record contains none of the requested names.
    """
    # Nothing to look up: do not send a query made only of the organism filter.
    if not gene_names:
        return {}

    # Build the search query: gene names joined with OR, then the organism filter.
    # NOTE(review): this relies on Entrez evaluating Boolean operators left to
    # right, so that the organism filter applies to the whole OR group — confirm.
    search_term = " OR ".join([f"{gene}[Gene Name]" for gene in gene_names])
    formated_species_name = species.replace('_', ' ').capitalize()
    search_term += f" AND {formated_species_name}[Organism]"

    # Step 1: search the gene database for the IDs matching the query.
    with Entrez.esearch(db="gene", term=search_term, retmode="xml", retmax=len(gene_names)) as handle:
        search_results = Entrez.read(handle)

    gene_ids = search_results["IdList"]

    # Without any hit there is nothing to fetch; fail with the same error as the
    # record-count check below instead of sending an empty id list to efetch.
    if not gene_ids:
        raise RuntimeError('Did not fetch as many records as gene names!')

    # Step 2: fetch the full record of every gene ID found.
    with Entrez.efetch(db="gene", id=",".join(gene_ids), retmode="xml") as handle:
        gene_records = Entrez.read(handle)

    # Sanity check: every requested name is expected to come back with a record.
    if len(gene_records) < len(gene_names):
        raise RuntimeError('Did not fetch as many records as gene names!')

    # Step 3: map each record back to the requested gene name it mentions.
    # A set makes the membership test constant-time instead of a list scan
    # repeated for every string of every record.
    requested_names = set(gene_names)
    gene_mapping = {}
    for record in gene_records:

        gene_ref_string_values = extract_strings(record['Entrezgene_gene']['Gene-ref'])

        # When several strings of the record are requested names, the last one wins.
        gene_name = None
        for string in gene_ref_string_values:
            if string in requested_names:
                gene_name = string
        if gene_name is None:
            raise ValueError(f'Could not find gene name in {gene_ref_string_values}')

        gene_id = record['Entrezgene_track-info']['Gene-track']['Gene-track_geneid']
        gene_mapping[gene_name] = gene_id

    return gene_mapping

In [10]:
def extract_strings(d):
    """Return every string found in ``d``, in depth-first traversal order.

    ``d`` may be a dict, a list, a string or any other value. Dict values (not
    keys) and list items are visited recursively; values of any other type are
    ignored.
    """

    def _walk(node):
        # Yield the strings reachable from ``node``, visiting children in order.
        if isinstance(node, dict):
            for child in node.values():
                yield from _walk(child)
        elif isinstance(node, list):
            for child in node:
                yield from _walk(child)
        elif isinstance(node, str):
            yield node

    return list(_walk(d))

In [11]:
# Upper bound on how many genes, taken from the top of each file, are mapped;
# genes beyond it keep their original name. Set to None to map every gene.
MAX_GENES_TO_MAP = 3000

dfs = []
# Only CSV files are parsed (other directory entries would break read_csv), in
# sorted order so that the column order of the combined table does not depend
# on the order in which the directory happens to be listed.
for file in sorted(input_folder.glob('*.csv')):

    print(f'Parsing {file.name}')
    df = pd.read_csv(file, header=0, index_col=0)
    # The index (first CSV column) holds the gene names to convert.
    gene_ids = df.index.tolist()

    print('Mapping gene names to NCBI gene IDs')
    gene_ids = gene_ids[:MAX_GENES_TO_MAP]
    chunks = chunk_list(gene_ids, chunksize=CHUNKSIZE)
    mapping_dict = {}
    for chunk_gene_names in tqdm(chunks):
        # converting the gene names comprised in this chunk to NCBI gene IDs
        gene_mapping = get_ncbi_gene_id_mapping(chunk_gene_names)
        mapping_dict.update(gene_mapping)

    # Names without a mapping are kept unchanged in the index.
    df.index = df.index.map(lambda x: mapping_dict.get(x, x))

    dfs.append(df)

Parsing E-GEOD-51720_rnaseq.csv
Mapping Gene IDs to Uniprot IDs


100%|██████████| 2/2 [01:10<00:00, 35.33s/it]


{'AT1G09570': '837483', 'AT1G09530': '837479', 'AT1G04400': '839529', 'AT1G09700': '837498', 'AT1G01040': '839574', 'AT1G08090': '837327', 'AT1G01060': '839341', 'AT1G02280': '839248', 'AT1G08720': '837393', 'AT1G04250': '839568', 'AT1G02580': '839422', 'AT1G07890': '837304', 'AT1G04240': '839570', 'AT1G01370': '839104', 'AT1G08430': '837363', 'AT1G04750': '839419', 'AT1G02340': '839300', 'AT1G08060': '837322', 'AT1G02900': '839389', 'AT1G01510': '839401', 'AT1G06040': '837113', 'AT1G08550': '837377', 'AT1G05180': '839286', 'AT1G01720': '839265', 'AT1G01360': '838452', 'AT1G01183': '3766638', 'AT1G02860': '839559', 'AT1G08830': '837405', 'AT1G09200': '837440', 'AT1G04550': '839495', 'AT1G01580': '839411', 'AT1G09340': '837455', 'AT1G06680': '837178', 'AT1G05850': '837095', 'AT1G06390': '837150', 'AT1G06160': '837125', 'AT1G08260': '837346', 'AT1G06950': '837205', 'AT1G05200': '839268', 'AT1G02970': '839453', 'AT1G01480': '837082', 'AT1G08810': '837403', 'AT1G08560': '837378', 'AT1G0284

100%|██████████| 2/2 [00:34<00:00, 17.41s/it]

{'AT1G09570': '837483', 'AT1G09530': '837479', 'AT1G04400': '839529', 'AT1G09700': '837498', 'AT1G01040': '839574', 'AT1G08090': '837327', 'AT1G01060': '839341', 'AT1G02280': '839248', 'AT1G08720': '837393', 'AT1G04250': '839568', 'AT1G02580': '839422', 'AT1G07890': '837304', 'AT1G04240': '839570', 'AT1G01370': '839104', 'AT1G08430': '837363', 'AT1G04750': '839419', 'AT1G02340': '839300', 'AT1G08060': '837322', 'AT1G02900': '839389', 'AT1G01510': '839401', 'AT1G06040': '837113', 'AT1G08550': '837377', 'AT1G05180': '839286', 'AT1G01720': '839265', 'AT1G01360': '838452', 'AT1G01183': '3766638', 'AT1G02860': '839559', 'AT1G08830': '837405', 'AT1G09200': '837440', 'AT1G04550': '839495', 'AT1G01580': '839411', 'AT1G09340': '837455', 'AT1G06680': '837178', 'AT1G05850': '837095', 'AT1G06390': '837150', 'AT1G06160': '837125', 'AT1G08260': '837346', 'AT1G06950': '837205', 'AT1G05200': '839268', 'AT1G02970': '839453', 'AT1G01480': '837082', 'AT1G08810': '837403', 'AT1G08560': '837378', 'AT1G0284




In [12]:
# Put the per-file tables side by side (axis=1): rows are aligned on the index
# and each file contributes its own columns.
concat_df = pd.concat(dfs, axis=1)

In [13]:
# Show the combined table as the cell output.
concat_df

Unnamed: 0,SRR1016626,SRR1016627,SRR1016628,SRR1016629,SRR1016630,SRR1016631,SRR1016632,SRR1016633,SRR1016634,SRR1016635,...,SRR309177,SRR309178,SRR309179,SRR309180,SRR309181,SRR309182,SRR309183,SRR309184,SRR309185,SRR309186
839580,0,0,0,0,0,0,0,0,0,0,...,1,5,3,2,4,0,1,6,3,3
839569,0,0,1,0,0,0,0,0,0,0,...,6,2,8,4,9,2,3,13,17,10
839321,0,0,0,0,0,0,0,0,0,0,...,9,5,6,4,4,0,4,5,6,5
839574,2,0,0,0,0,0,0,0,0,2,...,32,44,24,20,41,14,34,46,39,30
6240410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATMG09730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG09740,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
ATMG09950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG09960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
