In [1]:
import os
import pickle
from collections import defaultdict

In [2]:
# Absolute path of the project root.
project_dir = "/d/hpc/home/tcurk/tkopac/genome-embedding"
# Dataset folder used throughout this notebook: <project_dir>/data/deepmicrobes_mag_reads
data_dir = os.path.join(os.path.join(project_dir, "data"), "deepmicrobes_mag_reads")

## Taxonomy data preparation

Just like with our other taxonomy datasets, dictionaries of the organisms' taxa and of the taxa's one-hot encoding indices have to be prepared before the data can be used to train taxonomy classifiers.

In [3]:
# Taxonomic ranks handled in this notebook, in the order they are processed.
ranks = ["phylum", "class", "order", "family", "genus", "species"]

# organism id -> {rank: taxon}
organism_taxa = defaultdict(dict)

# rank -> {taxon: index}; every rank gets its own, initially empty, mapping
taxa_index = dict((rank_name, {}) for rank_name in ranks)

# rank -> next unused index for that rank
rank_counters = dict.fromkeys(ranks, 0)

In [4]:
# Parse the MAG metadata table in a single pass and fill in:
#   organism_taxa - organism id -> {rank: taxon}
#   taxa_index    - rank -> {taxon: index}, indices assigned in order of first appearance
with open(os.path.join(data_dir, "metadata_gut-derived_MAGs.txt"), "rt") as in_file:
    # the first two lines are not data rows (presumably headers - confirm against the file)
    in_file.readline()
    in_file.readline()
    for line in in_file:
        # a blank line (e.g. at the end of the file) has no columns; skip it
        # instead of failing with an IndexError on fields[3] below
        if not line.strip():
            continue

        fields = line.strip().split("\t")
        org_id = fields[0]
        species_id = fields[3]
        phylum, cls, order, family, genus = fields[7:12]

        # if any of organism's taxa are unknown, skip it
        # (only phylum..genus are checked; species_id is taken as is)
        if "Unassigned" in (phylum, cls, order, family, genus):
            continue

        # store organism's taxa at all available ranks
        organism_taxa[org_id] = {"phylum": phylum, "class": cls, "order": order,
                                 "family": family, "genus": genus, "species": species_id}

        # add new taxa to the taxa index
        for rank in ranks:
            taxon = organism_taxa[org_id][rank]
            if taxon not in taxa_index[rank]:
                # a taxon seen for the first time gets the next free index of its rank
                taxa_index[rank][taxon] = rank_counters[rank]
                rank_counters[rank] += 1

# turn the defaultdict into a plain dict, so that looking up an unknown organism
# raises a KeyError instead of silently adding an empty entry
organism_taxa = dict(organism_taxa)

In [5]:
# number of organisms kept, i.e. those with no "Unassigned" taxon from phylum to genus
len(organism_taxa)

2550

In [6]:
# report how many distinct taxa occur among the kept organisms at each rank
for rank in ranks:
    print(f"No. of \"{rank}\" taxa: {len({taxa[rank] for taxa in organism_taxa.values()})}")

No. of "phylum" taxa: 8
No. of "class" taxa: 13
No. of "order" taxa: 17
No. of "family" taxa: 29
No. of "genus" taxa: 47
No. of "species" taxa: 159


The `organism_taxa` dictionary holds each organism's taxa at the different taxonomic ranks.

In [7]:
# example entry: the taxa of a single organism, keyed by rank
organism_taxa["MAG-GUT1024"]

{'phylum': 'Firmicutes',
 'class': 'Bacilli',
 'order': 'Lactobacillales',
 'family': 'Lactobacillaceae',
 'genus': 'Lactobacillus',
 'species': '228'}

The `taxa_index` dictionary maps each taxon of a rank to an integer index, which is used for one-hot encoding the target classes.

In [8]:
# phylum -> index mapping; indices follow the order in which the phyla first appear in the metadata file
taxa_index["phylum"]

{'Actinobacteria': 0,
 'Firmicutes': 1,
 'Bacteroidetes': 2,
 'Verrucomicrobia': 3,
 'Proteobacteria': 4,
 'Tenericutes': 5,
 'Spirochaetes': 6,
 'Fusobacteria': 7}

In [9]:
# persist both lookup dictionaries next to the dataset, one pickle file per dictionary
for pkl_name, pkl_obj in (("organism_taxa.pkl", organism_taxa),
                          ("taxa_index.pkl", taxa_index)):
    with open(os.path.join(data_dir, pkl_name), "wb") as out_file:
        pickle.dump(pkl_obj, out_file)