In [1]:
import os
import re
os.chdir('..')

In [2]:
import pandas as pd
nodes = pd.read_csv('ncbi_taxdump/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('ncbi_taxdump/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
ncbi_table = names.merge(nodes, 'left', on='TaxID')
tax_levels_dict = {'superkingdom': 'k', 'kingdom':'k', 'phylum':'p', 'class':'c', 'order':'o', 'family':'f', 'genus':'g', 'species':'s'}

def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return row['Name']
    
ncbi_table['Name'] = ncbi_table.apply(lambda row: rename_taxa(row), axis=1)
ncbi_table.loc[~ncbi_table['Name'].str.contains('__'), 'TaxID'] = ''
ncbi_table = ncbi_table[['TaxID', 'Name']]
ncbi_table.to_csv('standardized_databases/NCBI_table.tsv', sep='\t', index=None)

  nodes = pd.read_csv('ncbi_taxdump/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


In [3]:
# Standardize Centrifuge

seqid2taxid = pd.read_csv('raw_databases/Centrifuge/seqid2taxid.map', delim_whitespace=True, header=None)
seqid2taxid.columns = ['seqid', 'taxid']
nodes = pd.read_csv('raw_databases/Centrifuge/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('raw_databases/Centrifuge/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
names = names[names['Name_type'] == 'scientific name']

id_parent_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['ParentID'].tolist())}
rank_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['Rank'].tolist())}
full_taxid_list = list(set(seqid2taxid['taxid'].tolist()))
print(len(full_taxid_list))

i = 0

while i < len(full_taxid_list):
    current_id = int(str(full_taxid_list[i]).split('|')[0])
    try:
        if id_parent_dict[current_id] != 1 and id_parent_dict[current_id] != 131567:
            parent_id = id_parent_dict[current_id]
            while rank_dict[parent_id] not in tax_levels_dict.keys():
                parent_id = id_parent_dict[parent_id]
                if parent_id == 1 or parent_id == 131567:
                    raise ValueError("Broken ID")
            if parent_id not in set(full_taxid_list):
                full_taxid_list.append(parent_id)
            full_taxid_list[i] = str(parent_id) + '|' + str(full_taxid_list[i])
        else:
            i += 1
            if i % 1000 == 0:
                print(str(i/len(full_taxid_list) * 100) + '% done')
    except: # TaxID not found
        del full_taxid_list[i]

taxa_table = names.merge(nodes, 'left', on='TaxID')
def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return ''
taxa_table['Name'] = taxa_table.apply(lambda row: rename_taxa(row), axis=1)


taxid_to_name_dict = {key: value for key, value in zip(taxa_table['TaxID'].tolist(), taxa_table['Name'].tolist())}
taxa_list = ['|'.join([taxid_to_name_dict[int(single_taxid)].replace(' ', '_') for single_taxid in str(taxid).split('|') if taxid_to_name_dict[int(single_taxid)] != '']) for taxid in full_taxid_list]
        
centrifuge = pd.DataFrame.from_dict({'Taxa':taxa_list, 'TaxID':full_taxid_list})
centrifuge = centrifuge.astype(str)
centrifuge = centrifuge[centrifuge.TaxID.str.split('|').str.get(0).str.contains('2157$') | centrifuge.TaxID.str.split('|').str.get(0).str.contains('2$') | centrifuge.TaxID.str.split('|').str.get(0).str.contains('10239$')]
centrifuge.drop_duplicates()
centrifuge.to_csv('standardized_databases/Centrifuge.tsv', sep='\t', index=None)


  nodes = pd.read_csv('raw_databases/Centrifuge/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


21194
4.504098729844158% done
8.637443316778233% done
12.619358095318217% done
16.305233980107616% done
19.87360387932748% done
23.348120476301656% done
26.648393482564337% done
29.918845132577882% done
32.985156679494224% done
35.92599245554158% done
38.85141101260905% done
41.72316678835924% done
44.47181171319102% done
47.112666576928255% done
50.12028869286287% done
52.69745076081944% done
54.973483378605614% done
57.32666645434568% done
59.66025057305241% done
61.94251734390485% done
64.2339338696357% done
66.64848980581054% done
69.14797667007396% done
71.66960312957266% done
74.2170105388155% done
76.81626141164652% done
79.39074950748332% done
82.03685798833905% done
84.49144888267342% done
87.00696055684455% done
89.55913792107239% done
92.08368104514977% done
94.44762449914138% done
96.45663706771823% done
98.78076315195304% done


In [4]:
rank_prefixes = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']
conversion_dict = {key.replace(' ', '_').replace('.', ''): str(value) for key, value in zip(ncbi_table['Name'], ncbi_table['TaxID'])}

# Standardize GTDB-Tk

df = pd.read_csv('raw_databases/GTDB-Tk/gtdb_taxonomy.tsv', sep='\t', header=None)
df.columns = ['Junk', 'Taxa']
df = df[['Taxa']]
df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace(".__", "").str.lower().str.replace('_[a-z]\|', '|').str.replace(' ', '_').str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit(';', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit(';', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    if i % 1000 == 0:
        print(str(i/len(df.index) * 100) + '% done')
    
gtdbtk = df.copy()
gtdbtk['Taxa'] = gtdbtk['Taxa'].str.replace(';', '|').str.replace('d__', 'k__').str.replace(' ', '_')
gtdbtk = gtdbtk[gtdbtk.TaxID.str.split('|').str.get(0).str.contains('2157$') | gtdbtk.TaxID.str.split('|').str.get(0).str.contains('2$') | gtdbtk.TaxID.str.split('|').str.get(0).str.contains('10239$')]
gtdbtk = gtdbtk.drop_duplicates()
gtdbtk.to_csv('standardized_databases/GTDBTk.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace(".__", "").str.lower().str.replace('_[a-z]\|', '|').str.replace(' ', '_').str.replace('candidatus_', '')


1.5140962359567574% done
3.006162633398467% done
4.484036828889155% done
5.947513196044904% done
7.3942620526471465% done
8.8519075860848% done
10.309582020088957% done
11.770764364010889% done
13.215664968208984% done
14.672653109135195% done
16.115563238935202% done
17.472081070456166% done
18.913217429257294% done
20.329040033688123% done
21.74417256175345% done
23.15752909164592% done
24.52890081666811% done
25.917180210793067% done
27.29924280521272% done
28.664383070815358% done
29.886006233367013% done
31.157059906528822% done
32.30654699197955% done
33.58099315787265% done
34.798585785473676% done
35.9931336175868% done
37.20853315693732% done
38.46470862983213% done
39.75271072363641% done
41.059892696813755% done
42.27407235684772% done
43.50367742023193% done
44.77004477004477% done
46.034905289952206% done
47.263446450515175% done
48.51555867013463% done
49.79342457641945% done
51.008765453642425% done
52.13137105505875% done
53.18441696582901% done
54.1475719436337% done
5

In [5]:
# Standardize Kraken

seqid2taxid = pd.read_csv('raw_databases/Kraken 2 Bracken 2/seqid2taxid.map', sep='\t', header=None)
seqid2taxid.columns = ['seqid', 'taxid']
nodes = pd.read_csv('raw_databases/Kraken 2 Bracken 2/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]
names = pd.read_csv('raw_databases/Kraken 2 Bracken 2/names.dmp', sep='\t', header=None).iloc[:, [0,2,6]]
nodes.columns = ['TaxID', 'ParentID', 'Rank']
names.columns = ['TaxID', 'Name', 'Name_type']
names['Name'] = names['Name'].str.lower().str.replace('candidatus ', '')
names = names[names['Name_type'] == 'scientific name']

id_parent_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['ParentID'].tolist())}
rank_dict = {key: value for key, value in zip(nodes['TaxID'].tolist(), nodes['Rank'].tolist())}
full_taxid_list = list(set(seqid2taxid['taxid'].tolist()))
print(len(full_taxid_list))

i = 0

while i < len(full_taxid_list):
    current_id = int(str(full_taxid_list[i]).split('|')[0])
    try:
        if id_parent_dict[current_id] != 1 and id_parent_dict[current_id] != 131567:
            parent_id = id_parent_dict[current_id]
            while rank_dict[parent_id] not in tax_levels_dict.keys():
                parent_id = id_parent_dict[parent_id]
                if parent_id == 1 or parent_id == 131567:
                    raise ValueError("Broken ID")
            if parent_id not in set(full_taxid_list):
                full_taxid_list.append(parent_id)
            full_taxid_list[i] = str(parent_id) + '|' + str(full_taxid_list[i])
        else:
            i += 1
            if i % 1000 == 0:
                print(str(i/len(full_taxid_list) * 100) + '% done')
    except: # TaxID not found
        del full_taxid_list[i]

taxa_table = names.merge(nodes, 'left', on='TaxID')
def rename_taxa(row):
    if row['Rank'] in tax_levels_dict.keys():
        return tax_levels_dict[row['Rank']] + '__' + row['Name']
    else:
        return ''
taxa_table['Name'] = taxa_table.apply(lambda row: rename_taxa(row), axis=1)


taxid_to_name_dict = {key: value for key, value in zip(taxa_table['TaxID'].tolist(), taxa_table['Name'].tolist())}
taxa_list = ['|'.join([taxid_to_name_dict[int(single_taxid)].replace(' ', '_') for single_taxid in str(taxid).split('|') if taxid_to_name_dict[int(single_taxid)] != '']) for taxid in full_taxid_list]
taxa_list
        
kraken = pd.DataFrame.from_dict({'Taxa':taxa_list, 'TaxID':full_taxid_list})
kraken = kraken.astype(str)
kraken = kraken[kraken.TaxID.str.split('|').str.get(0).str.contains('2157$') | kraken.TaxID.str.split('|').str.get(0).str.contains('2$') | kraken.TaxID.str.split('|').str.get(0).str.contains('10239$')]
kraken.drop_duplicates()
kraken.to_csv('standardized_databases/KrakenBracken.tsv', sep='\t', index=None)


  nodes = pd.read_csv('raw_databases/Kraken 2 Bracken 2/nodes.dmp', sep='\t', header=None).iloc[:, [0,2,4]]


23639
4.068679306697046% done
7.858854964831624% done
11.507921285818405% done
15.055139448229138% done
18.437258010988604% done
21.776213116539033% done
24.98126405196103% done
28.0849569949096% done
31.181789834736513% done
34.135518006485746% done
37.00339758468732% done
39.793076004775166% done
42.60479140038672% done
45.323577972741106% done
47.941702889286624% done
50.61529214513934% done
53.41880341880342% done
55.83299730140513% done
58.22327092207274% done
60.6501698204755% done
63.011972274732194% done
65.2644694295292% done
67.63114561279698% done
69.84459577440195% done
72.23972028780304% done
74.52846414034283% done
76.94061324518408% done
79.42135867252873% done
81.83306055646482% done
84.30281571404485% done
86.67449533076106% done
89.07198129488393% done
91.49130832570906% done
93.8474702586326% done
95.81953075806938% done
98.00985543546324% done


In [6]:
# Standardize MetaPhlAn 2
df = pd.read_csv('raw_databases/MetaPhlAn 2/MPA2Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit('|', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan2 = df.copy()
# Select ABV
metaphlan2 = metaphlan2[metaphlan2.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan2.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan2.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan2 = metaphlan2.drop_duplicates()
metaphlan2.to_csv('standardized_databases/MetaPhlAn2.tsv', sep='\t', index=None)


  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [7]:
# Standardize MetaPhlAn 3

df = pd.read_csv('raw_databases/MetaPhlAn 3/MPA3Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit('|', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan3 = df.copy()
metaphlan3 = metaphlan3[metaphlan3.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan3.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan3 = metaphlan3.drop_duplicates()
metaphlan3.to_csv('standardized_databases/MetaPhlAn3.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [8]:
# Standardize MetaPhlAn 4

df = pd.read_csv('raw_databases/MetaPhlAn 4/MPA4Taxonomy.csv', header=None)
df.columns = ['Taxa']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')
                
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit('|', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaphlan4 = df.copy()
metaphlan4 = metaphlan4[metaphlan4.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaphlan4.TaxID.str.split('|').str.get(0).str.contains('2$') | metaphlan4.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaphlan4 = metaphlan4.drop_duplicates()
metaphlan4.to_csv('standardized_databases/MetaPhlAn4.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower().str.replace('candidatus_', '')


In [9]:
# Standardize Metaxa 2

df = pd.read_csv('raw_databases/Metaxa 2/blast.taxonomy.txt', sep='\t', header=None)
df.columns = ['Junk', 'Taxa']
df = df[['Taxa']]
df['Taxa'] = df['Taxa'].str.strip(';').str.replace(' ', '_')
df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace('\\.', '').str.replace(' ', '_').str.lower().str.replace('candidatus_', '')
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    replace_name = ''
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 1)]
    for rank_prefix, name in zip(current_rank_prefixes, full_name.split('|')):
        name = rank_prefix + '__' + name
        if name in conversion_dict.keys():
            replace_name += '|' + conversion_dict[name]
        else:
            replace_name += '|' + name
    df.iloc[i, 1] = replace_name.strip('|')

df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit(';', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit(';', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
metaxa2 = df.copy()
metaxa2 = metaxa2[metaxa2.TaxID.str.split('|').str.get(0).str.contains('2157$') | metaxa2.TaxID.str.split('|').str.get(0).str.contains('2$') | metaxa2.TaxID.str.split('|').str.get(0).str.contains('10239$')]
metaxa2 = metaxa2.drop_duplicates()
metaxa2['Taxa'] = 'k__' + metaxa2['Taxa'].astype(str)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|p__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|c__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|o__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|f__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|g__', 1)
metaxa2['Taxa'] = metaxa2['Taxa'].str.replace(';', '|s__', 1)
metaxa2.to_csv('standardized_databases/Metaxa2.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(";", "|").str.replace('\\.', '').str.replace(' ', '_').str.lower().str.replace('candidatus_', '')


In [10]:
# Standardize mOTUs 3

df = pd.read_csv('raw_databases/mOTUs3/db_mOTU_taxonomy_CAMI.tsv', skiprows=1, sep='\t', header=None)
df = df.iloc[:, [27, 28]]
df.columns = ['Assigned TaxID', 'Taxa']
df['TaxID'] = df['Taxa'].str.lower()
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 2)]
    df.iloc[i, 2] = '|'.join([str(num) if num != "NA" else prefix + '__' + name.replace('.', '') for num, name, prefix in zip(df.iloc[i, 0].split("|"), df.iloc[i, 2].split("|"), current_rank_prefixes)])
df = df[['Taxa', 'TaxID']]
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit('|', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    
mOTUs3 = df.copy()
mOTUs3 = mOTUs3[mOTUs3.TaxID.str.split('|').str.get(0).str.contains('2157$') | mOTUs3.TaxID.str.split('|').str.get(0).str.contains('2$') | mOTUs3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
mOTUs3 = mOTUs3.drop_duplicates()
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace('|', ';')
mOTUs3['Taxa'] = 'k__' + mOTUs3['Taxa'].astype(str)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|p__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|c__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|o__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|f__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|g__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(';', '|s__', 1)
mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace(' ', '_')
mOTUs3.to_csv('standardized_databases/mOTUs3.tsv', sep='\t', index=None)

  df = pd.read_csv('raw_databases/mOTUs3/db_mOTU_taxonomy_CAMI.tsv', skiprows=1, sep='\t', header=None)
  mOTUs3['Taxa'] = mOTUs3['Taxa'].str.replace('|', ';')


In [11]:
# Standardize PhyloPhlAn 3

df = pd.read_csv('raw_databases/PhyloPhlAn 3/SGB.Jul20.txt', skiprows=1, sep='\t')
df = df.loc[:, ['Assigned taxonomy', 'Assigned taxonomic ID']]
df.columns = ['Taxa', 'Assigned TaxID']
df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()
for i in range(len(df.index)):
    full_name = df.iloc[i, 1]
    current_rank_prefixes = rank_prefixes[0:(full_name.count('|') + 2)]
    nums = df.iloc[i, 1].split("|")
    nums.append('')
    df.iloc[i, 2] = '|'.join([str(num) if num != "" else prefix + '__' + name for num, name, prefix in zip(nums, df.iloc[i, 2].split("|"), current_rank_prefixes)])

df = df[['Taxa', 'TaxID']]
df = df.sort_values(by=['Taxa'])

i = 0
while i < len(df.index):
    if '|' in df.iloc[i, 1] and df.iloc[i, 0].rsplit('|', 1)[0] not in df['Taxa'].tolist():
        df.loc[len(df.index)] = [df.iloc[i, 0].rsplit('|', 1)[0], df.iloc[i, 1].rsplit('|', 1)[0]]        
    i += 1
    if i % 1000 == 0:
        print(str(i/len(df.index) * 100) + '% done')
    
phylophlan3 = df.copy()
phylophlan3 = phylophlan3[phylophlan3.TaxID.str.split('|').str.get(0).str.contains('2157$') | phylophlan3.TaxID.str.split('|').str.get(0).str.contains('2$') | phylophlan3.TaxID.str.split('|').str.get(0).str.contains('10239$')]
phylophlan3 = phylophlan3.drop_duplicates()
phylophlan3.to_csv('standardized_databases/PhyloPhlAn3.tsv', sep='\t', index=None)

  df['TaxID'] = df['Taxa'].str.replace(".__", "").str.lower()


0.9948367970234483% done
1.9767338426717536% done
2.9479104228286186% done
3.906936766228439% done
4.85550031075202% done
5.7904438375201455% done
6.7179147592587265% done
7.629365427530566% done
8.536874555371117% done
9.415309292910273% done
10.284122250165948% done
11.145062273035451% done
11.997821933864314% done
12.83979603066877% done
13.673281496403927% done
14.489866149861442% done
15.309384652792163% done
16.13235702698585% done
16.92032308911667% done
17.694730509254345% done
18.470794156192554% done
19.23144165879926% done
19.99339348736939% done
20.749219742882584% done
21.495576210415894% done
22.23647637374385% done
23.00241099344857% done
23.733841915660097% done
24.472160807412532% done
25.17581108071365% done
25.871925622386726% done
26.55623698121976% done
27.27295266903032% done
27.98031502542917% done
28.594070406771078% done
29.246417313879046% done
29.947389720760825% done
30.63355018662281% done
31.30443158376344% done
31.958837017920917% done
32.5854573488154% d

In [12]:
def get_taxa_list(prefix, file_name):
    tool_names = ['centrifuge', 'gtdbtk', 'kraken', 'metaphlan2', 'metaphlan3', 'metaphlan4', 'metaxa2', 'mOTUs3', 'phylophlan3']
    for_df = {}
    for df, name in zip([centrifuge, gtdbtk, kraken, metaphlan2, metaphlan3, metaphlan4, metaxa2, mOTUs3, phylophlan3],
                       tool_names):
        taxa_list = df['Taxa'].tolist()
        tax_id_list = df['TaxID'].tolist()
        tmp = [tax_id for taxa, tax_ids in zip(taxa_list, tax_id_list) for name, tax_id in zip(str(taxa).split('|'), str(tax_ids).split('|')) if prefix in name]
        tmp = list(set(tmp))
        if prefix == 'k__':
            allowed_kingdoms = [2, 2157, 10239]
            tmp = [item for item in tmp if int(item) in allowed_kingdoms]
        tmp = [str(item).lower() for item in tmp]
        for_df[name] = tmp
    for_df['key'] = sorted(list(set([item for sub_list in for_df.values() for item in sub_list])))
    for tool_name in tool_names:
        for_df[tool_name] = [str(tax_id).lower() if tax_id in for_df[tool_name] else '' for tax_id in for_df['key']]
    pd.DataFrame.from_dict(for_df).to_csv(file_name, sep='\t', index=False)
    


In [13]:
get_taxa_list('k__', 'taxa_lists/kingdom.tsv')
get_taxa_list('p__', 'taxa_lists/phylum.tsv')
get_taxa_list('c__', 'taxa_lists/class.tsv')
get_taxa_list('o__', 'taxa_lists/order.tsv')
get_taxa_list('f__', 'taxa_lists/family.tsv')
get_taxa_list('g__', 'taxa_lists/genus.tsv')
get_taxa_list('s__', 'taxa_lists/species.tsv')