# Large Dataset Training

In [1]:
import os
import json
from stelaro.data import synthetic

# Root of all locally stored data; every other directory is derived from it.
DATA_DIRECTORY = "../data/"
SUMMARY_DIRECTORY = f"{DATA_DIRECTORY}ncbi_genome_summaries/"
NCBI_TAXONOMY_DIRECTORY = f"{DATA_DIRECTORY}ncbi_taxonomy/"
# Version 1 of the dataset and the sub-directory holding its compressed form.
DATASET_V1_DIRECTORY = f"{DATA_DIRECTORY}version_1/"
DATASET_V1_COMPRESSED_DIRECTORY = f"{DATASET_V1_DIRECTORY}compressed/"


def mkdir(path: str) -> None:
    """Create a directory (and any missing parents) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If `path` already exists but is not a directory.
    """
    # `exist_ok=True` avoids the check-then-create race of testing
    # `os.path.exists` first, and surfaces the case where a regular file
    # occupies the path instead of silently doing nothing.
    os.makedirs(path, exist_ok=True)


# Make sure the output directory tree exists before any later cell writes to it.
for _required_directory in (
    DATA_DIRECTORY,
    DATASET_V1_DIRECTORY,
    DATASET_V1_COMPRESSED_DIRECTORY,
):
    mkdir(_required_directory)

## 1. Compress The Dataset

In [2]:
# Value forwarded to `synthetic.compress_dataset`.
# NOTE(review): presumably the read length in nucleotides — confirm against
# the `compress_dataset` signature.
LENGTH = 1500

# Mapping stored next to the dataset splits; passed unchanged to the
# compression routine.
with open("../datasets/version_1_splits/map.json", "r") as f:
    index_to_taxonomic_label = json.load(f)

# Only the validation split is compressed here. Add "test" and "train" to the
# tuple to compress the other splits as well.
for dataset_name in ("validate",):
    with open(f"../datasets/version_1_splits/{dataset_name}.json", "r") as f:
        dataset = json.load(f)
    # Each split gets its own sub-directory under the compressed directory.
    directory = DATASET_V1_COMPRESSED_DIRECTORY + dataset_name + "/"
    mkdir(directory)
    synthetic.compress_dataset(
        dataset,
        index_to_taxonomic_label,
        DATASET_V1_DIRECTORY + "genomes/",
        LENGTH,
        directory,
    )

100%|██████████| 27/27 [06:18<00:00, 14.00s/it]


## 2. Sample Data

In [2]:
# Count the reads available in the compressed validation split, then ask the
# library for random identifiers based on that count.
# NOTE(review): no seed is set in this notebook; whether `ids` is reproducible
# across runs depends on `synthetic.get_random_identifiers` — confirm.
n = synthetic.get_n_reads_in_compressed_dataset(f"{DATASET_V1_COMPRESSED_DIRECTORY}validate/")
ids = synthetic.get_random_identifiers(n)

In [None]:
# Draw one sample from the compressed validation split.
# NOTE(review): the positional arguments are presumably (directory, number of
# reads to sample, read length, identifiers, starting offset) — confirm against
# the `synthetic.sample_compressed_dataset` signature.
x, y = synthetic.sample_compressed_dataset(
    f"{DATASET_V1_COMPRESSED_DIRECTORY}validate/", 10, 1500, ids, 0
)

0 0 353601 353601
1 0 769004 769004
2 0 107359 107359
3 0 345987 345987
4 0 530637 530637
5 0 1243784 1243784
6 0 25346 25346
7 0 1209618 1209618
8 0 1228558 1228558
9 0 665434 665434
0 1000000 353601 -646399
1 1000000 769004 -230996
2 1000000 107359 -892641
3 1000000 345987 -654013
4 1000000 530637 -469363
5 1000000 1243784 243784
6 1000000 25346 -974654
7 1000000 1209618 209618
8 1000000 1228558 228558
9 1000000 665434 -334566
[ 7 16  2  6  7 22  1 21 22 12]
[[ 69  89 139 ...  98  58 235]
 [129 239 121 ... 217  91 160]
 [114 227 143 ...  66  18 223]
 ...
 [148 174  85 ...  90  22 134]
 [227  82 140 ... 158 177 253]
 [191   7 243 ... 169 253 244]]


In [None]:
# Load the training split description.
with open("../datasets/version_1_splits/train.json", "r") as f:
    dataset = json.load(f)

# Inspect the last entry of the split: print its first field, then gather the
# references found in every element of its second field.
taxon = dataset[-1]
print(taxon[0])
references = []
for _genus, genus_references in taxon[1]:
    # Each element unpacks into exactly two items; only the second is used.
    references.extend(genus_references)
print(synthetic.evaluate_n_nucleotides(references))


['Viruses', 'Monodnaviria']
5158834
