In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
# Populate os.environ from a .env file, if one is present (python-dotenv).
load_dotenv()

# Base directory of the data set; expected to contain train/ and test/ sub-folders.
root = os.environ.get("root_folder")
if root is None:
    # Fail here with an actionable message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" further down.
    raise RuntimeError(
        'Environment variable "root_folder" is not set; '
        "define it in the environment or in a .env file."
    )

# Taxonomy tables of the training and the test split.
tax_train = pd.read_csv(root + '/train/taxtable.csv')

tax_test = pd.read_csv(root + '/test/taxtable.csv')

In [9]:
# NOTE: df is an alias of tax_train (no copy), so the column added below is
# also visible through tax_train.
df = tax_train

# Prepend a constant "Root" column so that every branch starts at one common
# root node. The guard keeps the cell idempotent: the original call passed
# allow_duplicates=True, which silently added another "Root" column on every
# re-run and thereby produced a root -> root edge in the tree built later.
if "Root" not in df.columns:
    df.insert(0, "Root", ["root"] * df.shape[0])
df

Unnamed: 0,Root,Domain,Phylum,Class,Order,Family,Genus,Species
0,root,k__Archaea,p__,c__,o__,f__,g__,s__
1,root,k__Archaea,p__Candidatus_Korarchaeota,c__,o__,f__,g__,s__
2,root,k__Archaea,p__Crenarchaeota,c__Thermoprotei,o__,f__,g__,s__
3,root,k__Archaea,p__Crenarchaeota,c__Thermoprotei,o__Acidilobales,f__Acidilobaceae,g__Acidilobus,s__Acidilobus_saccharovorans
4,root,k__Archaea,p__Crenarchaeota,c__Thermoprotei,o__Acidilobales,f__Caldisphaeraceae,g__Caldisphaera,s__Caldisphaera_lagunensis
...,...,...,...,...,...,...,...,...
256,root,k__Archaea,p__Euryarchaeota,c__Thermoplasmata,o__Thermoplasmatales,f__Picrophilaceae,g__Picrophilus,s__Picrophilus_torridus
257,root,k__Archaea,p__Euryarchaeota,c__Thermoplasmata,o__Thermoplasmatales,f__Thermoplasmataceae,g__Thermoplasma,s__Thermoplasma_acidophilum
258,root,k__Archaea,p__Euryarchaeota,c__Thermoplasmata,o__Thermoplasmatales,f__Thermoplasmataceae,g__Thermoplasma,s__Thermoplasma_volcanium
259,root,k__Archaea,p__Thaumarchaeota,c__,o__,f__,g__,s__


Convert the DataFrame into a tree-like dictionary

In [10]:
# Adjacency map of the taxonomy tree: parent name -> {child name: 0}.
# The inner dicts serve as insertion-ordered sets; the value 0 is a placeholder.
# NOTE: nodes are keyed by name only, so placeholder names that occur under
# several parents (e.g. "g__") share a single entry.
node_to_children = {}

# Iterate over the dataframe row-wise, assuming that every row stands for one
# complete branch of the tree.
for row in df.itertuples():
    # Drop the index at position 0 and every cell that carries no name.
    # pd.read_csv turns empty cells into NaN by default, so comparing against
    # "" alone would let missing values through as tree nodes.
    row_list = [
        element for element in row[1:]
        if not pd.isna(element) and element != ""
    ]
    # Register every consecutive (parent, child) pair of the branch.
    for parent, child in zip(row_list, row_list[1:]):
        # setdefault leaves existing entries untouched: a known parent keeps
        # its children and a known connection is not added twice.
        node_to_children.setdefault(parent, {}).setdefault(child, 0)

In [11]:
def newickify(node_to_children, root_node) -> str:
    """Render a parent -> children mapping as a Newick string.

    Parameters
    ----------
    node_to_children : mapping
        Maps a node name to a mapping whose keys are the names of its
        children. Names that do not appear as a key are rendered as leaves.
    root_node :
        Name of the node at which rendering starts.

    Returns
    -------
    str
        The tree in the form ``(child,child,...)name`` nested recursively and
        terminated by ``;``. Children are emitted in the iteration order of
        their mapping.

    Raises
    ------
    ValueError
        If a node is reachable from itself (the mapping contains a cycle).
    """
    # Nodes on the path from the root to the node currently being rendered.
    # Only this path is tracked (not every visited node), because a node that
    # is shared by several parents is legitimate and is simply rendered once
    # per parent; only revisiting a node on the active path is a cycle.
    active_path = set()

    def newick_render_node(name) -> str:
        if name not in node_to_children:
            # Leaf: rendered as the bare name.
            return F'{name}'

        if name in active_path:
            # Without this check a cycle ends in a RecursionError.
            raise ValueError(
                F"Error: The tree may not be circular! Node {name!r} is its own ancestor."
            )

        # Inner node: render all children, then append the node's own name.
        active_path.add(name)
        try:
            children_strings = ",".join(
                newick_render_node(child) for child in node_to_children[name].keys()
            )
        finally:
            active_path.discard(name)
        return F'({children_strings}){name}'

    return newick_render_node(root_node) + ';'


# Serialise the taxonomy, starting at the artificial "root" node, and show the
# resulting Newick string.
taxonomy_newick = newickify(node_to_children, 'root')
print(taxonomy_newick)

(((((((s__)g__)f__)o__)c__)p__,(((((s__)g__)f__)o__)c__)p__Candidatus_Korarchaeota,(((((s__)g__)f__)o__,(((s__Acidilobus_saccharovorans)g__Acidilobus)f__Acidilobaceae,((s__Caldisphaera_lagunensis)g__Caldisphaera)f__Caldisphaeraceae)o__Acidilobales,(((s__)g__,(s__Aeropyrum_camini)g__Aeropyrum,(s__Desulfurococcus_amylolyticus,s__Desulfurococcus_mucosus)g__Desulfurococcus,(s__Ignicoccus_hospitalis)g__Ignicoccus,(s__Staphylothermus_hellenicus,s__Staphylothermus_marinus)g__Staphylothermus,(s__Thermogladius_cellulolyticus)g__Thermogladius,(s__Thermosphaera_aggregans)g__Thermosphaera)f__Desulfurococcaceae,((s__Hyperthermus_butylicus)g__Hyperthermus,(s__Pyrodictium_delaneyi,s__Pyrodictium_occultum)g__Pyrodictium,(s__Pyrolobus_fumarii)g__Pyrolobus)f__Pyrodictiaceae)o__Desulfurococcales,(((s__)g__,(s__Acidianus_hospitalis)g__Acidianus,(s__Metallosphaera_cuprina,s__Metallosphaera_sedula,s__Metallosphaera_yellowstonensis)g__Metallosphaera,(s__,s__Sulfolobus_acidocaldarius,s__Sulfolobus_islandicus,