In [10]:
import pandas as pd
from unipressed import IdMappingClient
import time
from tqdm import tqdm
tqdm.pandas()
import os
from pathlib import Path

In [50]:
GENERAL_DB_SOURCES = [
    'Gene_Name',
    'GeneID',
    'EMBL-GenBank-DDBJ',
    'RefSeq_Nucleotide'
    'Ensembl',
    'GI_number',
    'Ensembl_Genomes',
    'Ensembl_Genomes_Transcript',
    'Ensembl_Transcript',
    'EMBL-GenBank-DDBJ_CDS'
]

"""
FAMILY_SPECIFIC_DB_SOURCES = [
    'PATRIC',
    'UCSC',
    'WBParaSite',
    'FlyBase'
]
"""

"""
SPECIES_SPECIFIC_DB_SOURCES = {
    'arabidopsis_thaliana ': 'Araport',
    'dictyBase': 'dictyostelium_discoideum',
    'HGNC',
    'MaizeGDB': 'zea_mays',
    'GeneCards',
    'MGI',
    'PomBase',
    'RGD',
    'SGD',
    'WormBase',
    'ZFIN': 'danio_rerio',
    'Xenbase': 'xenopus'
    'GeneWiki',
    'GeneTree'
}
"""

DB_SOURCE_TEST_SAMPLE_SIZE = 10
CHUNKSIZE = 2000

In [51]:
input_folder = Path(os.path.relpath('.')).absolute() / 'output'

In [58]:
def chunk_list(lst: list, chunksize: int):
    return [lst[i: i + chunksize] for i in range(0, len(lst), chunksize)]

def map_ids(ids: list[str], source: str, dest: str):
    request = IdMappingClient.submit(source=source, dest=dest, ids=ids)
    while request.get_status() != 'FINISHED':
        time.sleep(1)
    return {result_dict['from'] : result_dict['to'] for result_dict in request.each_result()}

def map_to_uniprot_ids(ids: list[str], source: str) -> dict:    
    return map_ids(ids, source, dest="UniProtKB")

def map_uniprot_ids_to_genbank_ids(ids: list[str]) -> dict:    
    return map_ids(ids, source="UniProtKB_AC-ID", dest="EMBL-GenBank-DDBJ")

def get_uniprot_id_mapping_source_db(gene_ids: list[str]):
    for db_source in GENERAL_DB_SOURCES:
        
        try:
            result = map_to_uniprot_ids(gene_ids, db_source)
        except KeyError:
            print(f'Got issue while trying to map {db_source} to destination database. Skipping...')
            continue
            
        if not result:
            continue
        else:
            print(f'Found mapping with {db_source} database')
            if len(result) < len(gene_ids):
                missing_ids = list(set(gene_ids) - set(result.keys()))
                print(f'Mapping with db {db_source} not complete! Missing ids: {missing_ids}')
            return db_source
    raise RuntimeError(f'Could not a find a suitable source database for {gene_ids}')

In [60]:
dfs = []
for file in input_folder.iterdir():
    
    print(f'Parsing {file.name}')
    df = pd.read_csv(file, header=0, index_col=0)
    gene_ids = df.index.tolist()
    
    print('Getting source db')
    first_gene_ids = gene_ids[:DB_SOURCE_TEST_SAMPLE_SIZE]
    source = get_uniprot_id_mapping_source_db(first_gene_ids)
    
    print('Mapping Gene IDs to Uniprot IDs')
    gene_ids = gene_ids[:300]
    
    chunks = chunk_list(gene_ids, chunksize=CHUNKSIZE)
    mapping_dict = {}
    for chunk in tqdm(chunks):
        # converting to uniprot IDs for all IDs comprised in this chunk
        mapping = map_to_uniprot_ids(chunk, source)
        mapping_dict.update(mapping)
        
    df.index = df.index.map(lambda x: mapping_dict.get(x, x))
    
    """
    print('Mapping Uniprot IDs to Genbank IDs')
    uniprot_ids = df.index.tolist()
    chunks = chunk_list(uniprot_ids, chunksize=CHUNKSIZE)
    mapping_dict = {}
    for chunk in tqdm(chunks):
        # converting to uniprot IDs for all IDs comprised in this chunk
        mapping = map_uniprot_ids_to_genbank_ids(chunk)
        mapping_dict.update(mapping)
        
    df.index = df.index.map(lambda x: mapping_dict.get(x, x))
    """
    
    dfs.append(df)
    

Parsing E-GEOD-51720_rnaseq.csv
Getting source db
Found mapping with Gene_Name database
Mapping with db Gene_Name not complete! Missing ids: ['AT1G01046']
Mapping Gene IDs to Uniprot IDs


100%|██████████| 1/1 [00:04<00:00,  4.14s/it]


Parsing E-GEOD-30720_rnaseq.csv
Getting source db
Found mapping with Gene_Name database
Mapping with db Gene_Name not complete! Missing ids: ['AT1G01046']
Mapping Gene IDs to Uniprot IDs


100%|██████████| 1/1 [00:04<00:00,  4.50s/it]


In [61]:
concat_df = pd.concat(dfs, axis=1)

In [62]:
concat_df

Unnamed: 0,SRR1016626,SRR1016627,SRR1016628,SRR1016629,SRR1016630,SRR1016631,SRR1016632,SRR1016633,SRR1016634,SRR1016635,...,SRR309177,SRR309178,SRR309179,SRR309180,SRR309181,SRR309182,SRR309183,SRR309184,SRR309185,SRR309186
Q0WV96,0,0,0,0,0,0,0,0,0,0,...,1,5,3,2,4,0,1,6,3,3
F4HQG4,0,0,1,0,0,0,0,0,0,0,...,6,2,8,4,9,2,3,13,17,10
C3VMM4,0,0,0,0,0,0,0,0,0,0,...,9,5,6,4,4,0,4,5,6,5
Q56Y50,2,0,0,0,0,0,0,0,0,2,...,32,44,24,20,41,14,34,46,39,30
AT1G01046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATMG09730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG09740,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
ATMG09950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATMG09960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
