In [1]:
import os
import re
os.chdir('..')

In [2]:
import pandas as pd
nodes = pd.read_csv('ncbi_taxdump/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('ncbi_taxdump/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
ncbi_table = names.merge(nodes, 'left', on='TaxID')
tax_levels_dict = {'superkingdom': 'k', 'kingdom':'k', 'phylum':'p', 'class':'c', 'order':'o', 'family':'f', 'genus':'g', 'species':'s'}

def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return row['Name']
    
ncbi_table['Name'] = ncbi_table.apply(lambda row: rename_taxa(row), axis=1)
ncbi_table.loc[~ncbi_table['Name'].str.contains('__'), 'TaxID'] = ''
ncbi_table = ncbi_table[['TaxID', 'Name']]
ncbi_table.to_csv('standardized_databases/NCBI_table.tsv', sep='\t', index=None)

  nodes = pd.read_csv('ncbi_taxdump/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


In [42]:
# Standardize Centrifuge

seqid2taxid = pd.read_csv('raw_databases/Centrifuge/seqid2taxid.map', delim_whitespace=True, header=None)
seqid2taxid.columns = ['seqid', 'taxid']
nodes = pd.read_csv('raw_databases/Centrifuge/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('raw_databases/Centrifuge/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
names = names[names['Name_type'] == 'scientific name']

id_parent_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['ParentID'].tolist())}
full_taxid_list = list(set(seqid2taxid['taxid'].tolist()))
print(len(full_taxid_list))

i = 0
while i < len(full_taxid_list):
    current_id = int(str(full_taxid_list[i]).split('|')[0])
    try:
        if id_parent_dict[current_id] != 1 and id_parent_dict[current_id] != 131567:
            if id_parent_dict[current_id] not in set(full_taxid_list):
                full_taxid_list.append(id_parent_dict[current_id])
            full_taxid_list[i] = str(id_parent_dict[current_id]) + '|' + str(full_taxid_list[i])
        else:
            i += 1
            if i % 1000 == 0:
                print(str(i/len(full_taxid_list) * 100) + '% done')
    except: # TaxID not found
        del full_taxid_list[i]

taxa_table = names.merge(nodes, 'left', on='TaxID')
def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return ''
taxa_table['Name'] = taxa_table.apply(lambda row: rename_taxa(row), axis=1)


taxid_to_name_dict = {key: value for key, value in zip(taxa_table['TaxID'].tolist(), taxa_table['Name'].tolist())}
taxa_list = ['|'.join([taxid_to_name_dict[int(single_taxid)].replace(' ', '_') for single_taxid in str(taxid).split('|') if taxid_to_name_dict[int(single_taxid)] != '']) for taxid in full_taxid_list]
taxa_list
        
centrifuge = pd.DataFrame.from_dict({'Taxa':taxa_list, 'TaxID':full_taxid_list})
centrifuge = centrifuge[centrifuge.TaxID.str.split('|').str.get(0).str.contains('2157$') | centrifuge.TaxID.str.split('|').str.get(0).str.contains('2$') | centrifuge.TaxID.str.split('|').str.get(0).str.contains('10239$')]
centrifuge.drop_duplicates()
centrifuge.to_csv('standardized_databases/Centrifuge.tsv', sep='\t', index=None)


  nodes = pd.read_csv('raw_databases/Centrifuge/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


21194
4.464883689779882% done
8.495815810713223% done
12.36450562585006% done
15.893829220805022% done
19.296079036739737% done
22.621875353466802% done
25.770349372307923% done
28.87356985599307% done
31.767322014754157% done
34.5339641537452% done
37.254038676465605% done
39.95471798628221% done
42.51144538914323% done
44.99871432244793% done
47.86826653050804% done
50.25125628140703% done
52.37699109591152% done
54.59177483925755% done
56.73843581091169% done
58.85468777588135% done
60.98800569221387% done
63.2783961802859% done
65.61305414503337% done
67.94632240529981% done
70.30964367072588% done
72.68051323623962% done
75.1105794642112% done
77.44433688286544% done
79.87000468203476% done
82.20529402093494% done
84.53315881326353% done
86.85267614808382% done
89.21812479723154% done
91.51100823599074% done
93.53037064749739% done
95.31626466149487% done
97.26603575184016% done
99.02796236937431% done


In [43]:
rank_prefixes = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']
conversion_dict = {key.replace(' ', '_').replace('.', ''): str(value) for key, value in zip(ncbi_table['Name'], ncbi_table['TaxID'])}

# Standardize GTDB-Tk

df = pd.read_csv('raw_databases/GTDB-Tk/gtdb_taxonomy.tsv', sep='\t', header=None)
df.columns = ['Junk', 'Taxa']
df = df[['Taxa']]
df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace(".__", "").str.lower().str.replace('_[a-z]\|', '|').str.replace(' ', '_').str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit(';', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
gtdbtk = df.copy()
gtdbtk['Taxa'] = gtdbtk['Taxa'].str.replace(';', '|').str.replace('d__', 'k__').str.replace(' ', '_')
gtdbtk = gtdbtk[gtdbtk.TaxID.str.split('|').str.get(0).str.contains('2157$') | gtdbtk.TaxID.str.split('|').str.get(0).str.contains('2$') | gtdbtk.TaxID.str.split('|').str.get(0).str.contains('10239$')]
gtdbtk = gtdbtk.drop_duplicates()
gtdbtk.to_csv('standardized_databases/GTDBTk.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace(".__", "").str.lower().str.replace('_[a-z]\|', '|').str.replace(' ', '_').str.replace('candidatus_', '')


In [44]:
# Standardize Kraken

seqid2taxid = pd.read_csv('raw_databases/Kraken 2 Bracken 2/seqid2taxid.map', sep='\t', header=None)
seqid2taxid.columns = ['seqid', 'taxid']
nodes = pd.read_csv('raw_databases/Kraken 2 Bracken 2/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('raw_databases/Kraken 2 Bracken 2/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
names = names[names['Name_type'] == 'scientific name']

id_parent_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['ParentID'].tolist())}
full_taxid_list = list(set(seqid2taxid['taxid'].tolist()))
print(len(full_taxid_list))

i = 0
while i < len(full_taxid_list):
    current_id = int(str(full_taxid_list[i]).split('|')[0])
    try:
        if id_parent_dict[current_id] != 1 and id_parent_dict[current_id] != 131567:
            if id_parent_dict[current_id] not in set(full_taxid_list):
                full_taxid_list.append(id_parent_dict[current_id])
            full_taxid_list[i] = str(id_parent_dict[current_id]) + '|' + str(full_taxid_list[i])
        else:
            i += 1
            if i % 1000 == 0:
                print(str(i/len(full_taxid_list) * 100) + '% done')
    except: # TaxID not found
        del full_taxid_list[i]

taxa_table = names.merge(nodes, 'left', on='TaxID')
def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return ''
taxa_table['Name'] = taxa_table.apply(lambda row: rename_taxa(row), axis=1)


taxid_to_name_dict = {key: value for key, value in zip(taxa_table['TaxID'].tolist(), taxa_table['Name'].tolist())}
taxa_list = ['|'.join([taxid_to_name_dict[int(single_taxid)].replace(' ', '_') for single_taxid in str(taxid).split('|') if taxid_to_name_dict[int(single_taxid)] != '']) for taxid in full_taxid_list]
taxa_list
        
kraken = pd.DataFrame.from_dict({'Taxa':taxa_list, 'TaxID':full_taxid_list})
kraken = kraken[kraken.TaxID.str.split('|').str.get(0).str.contains('2157$') | kraken.TaxID.str.split('|').str.get(0).str.contains('2$') | kraken.TaxID.str.split('|').str.get(0).str.contains('10239$')]
kraken.drop_duplicates()
kraken.to_csv('standardized_databases/KrakenBracken.tsv', sep='\t', index=None)


  nodes = pd.read_csv('raw_databases/Kraken 2 Bracken 2/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


23639
4.037956793862306% done
7.731859125526733% done
11.26041588469334% done
14.671361502347418% done
17.9057441627274% done
21.085184143941525% done
24.132938012824933% done
27.062683941679914% done
30.001000033334446% done
32.784735427185105% done
35.47128438296088% done
38.04089396100808% done
40.67075459892379% done
43.200543092541736% done
45.61072764314167% done
48.11161895597787% done
50.76748491907066% done
52.99105040037683% done
55.17802172271592% done
57.441553219599065% done
59.61505706012604% done
61.662649251639664% done
63.86405286832898% done
65.91776758493779% done
68.11989100817438% done
70.3786914977127% done
72.55723960012898% done
74.80230818550973% done
77.066170608557% done
79.25186241876683% done
81.48673869042925% done
83.70170803797964% done
85.87935252173008% done
88.07149332987954% done
90.28297263136173% done
92.4000924000924% done
94.15956228527801% done
95.88211546225273% done
97.87436946319673% done
99.46042718253474% done


In [4]:
# Standardize MetaPhlAn 2
df = pd.read_csv('raw_databases/MetaPhlAn 2/MPA2Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan2 = df.copy()
# Select ABV
metaphlan2 = metaphlan2[metaphlan2.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan2.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan2.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan2 = metaphlan2.drop_duplicates()
metaphlan2.to_csv('standardized_databases/MetaPhlAn2.tsv', sep='\t', index=None)


  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [5]:
# Standardize MetaPhlAn 3

df = pd.read_csv('raw_databases/MetaPhlAn 3/MPA3Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan3 = df.copy()
metaphlan3 = metaphlan3[metaphlan3.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan3.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan3 = metaphlan3.drop_duplicates()
metaphlan3.to_csv('standardized_databases/MetaPhlAn3.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [6]:
# Standardize MetaPhlAn 4

df = pd.read_csv('raw_databases/MetaPhlAn 4/MPA4Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan4 = df.copy()
metaphlan4 = metaphlan4[metaphlan4.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan4.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan4.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan4 = metaphlan4.drop_duplicates()
metaphlan4.to_csv('standardized_databases/MetaPhlAn4.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [10]:
# Standardize Metaxa 2

df = pd.read_csv('raw_databases/Metaxa 2/blast.taxonomy.txt', sep='\t', header=None)
df.columns = ['Junk', 'Taxa']
df = df[['Taxa']]
df['Taxa'] = df['Taxa'].str.strip(';').str.replace(' ', '_')
df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace('\\.', '').str.replace(' ', '_').str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')

df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit(';', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaxa2 = df.copy()
metaxa2 = metaxa2[metaxa2.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaxa2.TaxID.str.split('|').str.get(0).str.contains('2$') | metaxa2.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaxa2 = metaxa2.drop_duplicates()
metaxa2['Taxa'] = 'k__' + metaxa2['Taxa'].astype(str)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|p__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|c__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|o__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|f__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|g__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|s__', 1)
metaxa2.to_csv('standardized_databases/Metaxa2.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace('\\.', '').str.replace(' ', '_').str.lower().str.replace('candidatus_', '')


In [72]:
# Standardize mOTUs 3

df = pd.read_csv('raw_databases/mOTUs3/db_mOTU_taxonomy_CAMI.tsv', skiprows=1, sep='\t', header=None)
df = df.iloc[:, [27, 28]]
df.columns = ['Assigned TaxID', 'Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    df.iloc[i, 2] = '|'.join([str(num) if num != "NA" else prefix + '__' + name.replace('.', '') for num, name, prefix in zip(df.iloc[i, 0].split("|"), df.iloc[i, 2].split("|"), current_rank_prefixes)])
df = df[['Taxa', 'TaxID']]
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
mOTUs3 = df.copy()
mOTUs3 = mOTUs3[mOTUs3.TaxID.str.split('|').str.get(0).str.contains('2157$') | mOTUs3.TaxID.str.split('|').str.get(0).str.contains('2$') | mOTUs3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
mOTUs3 = mOTUs3.drop_duplicates()
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace('|', ';')
mOTUs3['Taxa'] = 'k__' + mOTUs3['Taxa'].astype(str)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|p__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|c__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|o__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|f__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|g__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|s__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(' ', '_')
mOTUs3.to_csv('standardized_databases/mOTUs3.tsv', sep='\t', index=None)

  df = pd.read_csv('raw_databases/mOTUs3/db_mOTU_taxonomy_CAMI.tsv', skiprows=1, sep='\t', header=None)
  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()
  mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace('|', ';')


In [63]:
# Standardize PhyloPhlAn 3

df = pd.read_csv('raw_databases/PhyloPhlAn 3/SGB.Jul20.txt', skiprows=1, sep='\t')
df = df.loc[:, ['Assigned taxonomy', 'Assigned taxonomic ID']]
df.columns = ['Taxa', 'Assigned TaxID']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    df.iloc[i, 2] = '|'.join([str(num) if num != "" else prefix + '__' + name for num, name, prefix in zip(df.iloc[i, 1].split("|"), df.iloc[i, 2].split("|"), current_rank_prefixes)])

df = df[['Taxa', 'TaxID']]
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 1].rsplit('|', 1)[0] not in df['TaxID'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
phylophlan3 = df.copy()
phylophlan3 = phylophlan3[phylophlan3.TaxID.str.split('|').str.get(0).str.contains('2157$') | phylophlan3.TaxID.str.split('|').str.get(0).str.contains('2$') | phylophlan3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
phylophlan3 = phylophlan3.drop_duplicates()
phylophlan3.to_csv('standardized_databases/PhyloPhlAn3.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()


In [73]:
def get_taxa_list(prefix, file_name):
    tool_names = ['centrifuge', 'gtdbtk', 'kraken', 'metaphlan2', 'metaphlan3', 'metaphlan4', 'metaxa2', 'mOTUs3', 'phylophlan3']
    for_df = {}
    for df, name in zip([centrifuge, gtdbtk, kraken, metaphlan2, metaphlan3, metaphlan4, metaxa2, mOTUs3, phylophlan3],
                       tool_names):
        taxa_list = df['Taxa'].tolist()
        tax_id_list = df['TaxID'].tolist()
        tmp = [tax_id for taxa, tax_ids in zip(taxa_list, tax_id_list) for name, tax_id in zip(str(taxa).split('|'), str(tax_ids).split('|')) if prefix in name]
        tmp = list(set(tmp))
        if prefix == 'k__':
            allowed_kingdoms = [2, 2157, 10239]
            tmp = [item for item in tmp if int(item) in allowed_kingdoms]
        tmp = [str(item).lower() for item in tmp]
        for_df[name] = tmp
    for_df['key'] = sorted(list(set([item for sub_list in for_df.values() for item in sub_list])))
    for tool_name in tool_names:
        for_df[tool_name] = [str(tax_id).lower() if tax_id in for_df[tool_name] else '' for tax_id in for_df['key']]
    pd.DataFrame.from_dict(for_df).to_csv(file_name, sep='\t', index=False)
    


In [74]:
get_taxa_list('k__', 'taxa_lists/kingdom.tsv')
get_taxa_list('p__', 'taxa_lists/phylum.tsv')
get_taxa_list('c__', 'taxa_lists/class.tsv')
get_taxa_list('o__', 'taxa_lists/order.tsv')
get_taxa_list('f__', 'taxa_lists/family.tsv')
get_taxa_list('g__', 'taxa_lists/genus.tsv')
get_taxa_list('s__', 'taxa_lists/species.tsv')