# Imports

In [1]:
# Mount Google Drive at /content/drive so the Drive paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# Experiment selection: dataset name, bin count and version.
# bin_count is only meaningful for the simulated ('simlord') dataset;
# for any other dataset `bins` is set to None.
dataset = 'simlord'
bin_count = 10
version = 2

if dataset == 'simlord':
    bins = bin_count
else:
    bins = None

In [3]:
# Directory holding the dataset for this experiment.
# The "/bin_<n>" segment is inserted only when `bins` is set; the identity
# check (`is not None`) is the idiomatic way to test for None.
result_path = f"/content/drive/MyDrive/FYP/FYP/datasets/{dataset}{'/bin_'+str(bins) if bins is not None else ''}/v{version}"
result_path

'/content/drive/MyDrive/FYP/FYP/datasets/simlord/bin_10/v2'

In [4]:
# List the entries of the dataset directory.
os.listdir(result_path)

['sim_10.fasta',
 'sim_10_labels.npy',
 'train_indices_simlord_v2.npy',
 'test_indices_simlord_v2.npy']

In [5]:
# Directory holding the Kraken2 output for this experiment.
# The "/bin_<n>" segment is inserted only when `bins` is set; the identity
# check (`is not None`) is the idiomatic way to test for None.
kraken_path = f"/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/{dataset}{'/bin_'+str(bins) if bins is not None else ''}/v{version}"
kraken_path

'/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/simlord/bin_10/v2'

In [6]:
# List the entries of the Kraken2 output directory.
os.listdir(kraken_path)

['output.txt',
 'report.txt',
 'kraken2.csv',
 'kraken_final.csv',
 'kraken_minimap.csv',
 'train_simlord_v2.csv',
 'test_simlord_v2.csv',
 'train_indices_simlord_v2.npy',
 'test_indices_simlord_v2.npy',
 'train_vecs_simlord.npy',
 'test_vecs_simlord.npy',
 'ground_truth.txt']

In [7]:
# NOTE(review): `filepath` is not referenced anywhere else in the visible notebook,
# and it points under "fyp code/" rather than the "FYP/FYP/" root used by
# result_path and kraken_path — looks like a stale leftover; confirm before use.
filepath = f'/content/drive/MyDrive/fyp code/Tools Outputs/kraken2/output fa/{dataset}'

# Read Data

In [8]:
# Load kraken_minimap.csv from the Kraken2 output directory and preview
# the first five rows.
result_df = pd.read_csv(kraken_path + "/kraken_minimap.csv")
result_df.head(5)

Unnamed: 0,seq_id,name_minimap,taxon,name,name_kraken,genus,ground_truth
0,seq1,Lactobacillus_fermentum_complete_genome,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus,Limosilactobacillus fermentum
1,seq2,Staphylococcus_aureus_chromosome,G,Staphylococcus,unknown,Staphylococcus,Staphylococcus aureus
2,seq3,Staphylococcus_aureus_chromosome,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus,Staphylococcus aureus
3,seq4,Lactobacillus_fermentum_complete_genome,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus,Limosilactobacillus fermentum
4,seq5,Salmonella_enterica_complete_genome,S1,Salmonella enterica subsp. enterica,Salmonella enterica,Salmonella,Salmonella enterica


In [9]:
# Size of the loaded table as (rows, columns).
result_df.shape

(500000, 7)

In [10]:
# Frequency of each `name_kraken` value, most frequent first; show the top 20.
result_df['name_kraken'].value_counts().iloc[:20]

unknown                          54525
Cryptococcus neoformans          49987
Limosilactobacillus fermentum    49949
Enterococcus faecalis            48970
Bacillus spizizenii              48701
Salmonella enterica              45495
Saccharomyces cerevisiae         44524
Staphylococcus aureus            44487
Pseudomonas aeruginosa           44143
Escherichia coli                 32584
Listeria monocytogenes           23428
Listeria innocua                  2112
Actinomyces oris                  1397
Bacillus wiedmannii                977
Listeria welshimeri                560
Shigella flexneri                  463
Listeria seeligeri                 420
Escherichia albertii               416
Listeria ivanovii                  399
Escherichia fergusonii             295
Name: name_kraken, dtype: int64

# Filter

In [18]:
# Boolean NumPy mask: True for rows whose `name_kraken` is anything other than
# the literal 'unknown'.
filter_1 = result_df['name_kraken'].ne('unknown').to_numpy()
# Disabled alternative mask (Kraken name matches the ground truth):
# filter_2 = (result_df['name_kraken'] == result_df['ground_truth']).to_numpy()

In [33]:
# Split on the mask: rows where filter_1 is True go to train_df, the rest
# (name_kraken == 'unknown') go to test_df. Original row labels are kept.
train_df = result_df.loc[filter_1]
test_df = result_df.loc[~filter_1]

(train_df.shape, test_df.shape)

((445475, 7), (54525, 7))

In [34]:
# Sanity check: the two splits together should account for every row.
len(train_df) + len(test_df)

500000

In [35]:
# Row labels (carried over from result_df) that landed in the training split.
train_df.index

Int64Index([     0,      2,      3,      4,      5,      6,      8,      9,
                10,     11,
            ...
            499989, 499990, 499991, 499992, 499994, 499995, 499996, 499997,
            499998, 499999],
           dtype='int64', length=445475)

In [36]:
# Row labels (carried over from result_df) that landed in the test split.
test_df.index

Int64Index([     1,      7,     20,     30,     39,     57,     62,     69,
                91,     94,
            ...
            499913, 499918, 499926, 499945, 499946, 499955, 499961, 499982,
            499985, 499993],
           dtype='int64', length=54525)

In [37]:
# Show the dataset directory again; the index files below are saved under it.
result_path

'/content/drive/MyDrive/FYP/FYP/datasets/simlord/bin_10/v2'

In [38]:
# Write the row labels of each split to .npy files under result_path.
# The pandas Index is converted to a plain NumPy array explicitly before saving.
np.save(f"{result_path}/train_indices_{dataset}_v{version}.npy", train_df.index.to_numpy())
np.save(f"{result_path}/test_indices_{dataset}_v{version}.npy", test_df.index.to_numpy())