⚠️ If you are mounting your Google Drive in Colab, run the following cell.

In [1]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be
# reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
! cp /content/drive/MyDrive/FYP/FYP/test/output/report.txt ./output/report.txt

# Imports

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
# Directory that holds the files generated with the kraken2 tool; every later
# read and write in this notebook is made relative to it.
filepath = "output"
filepath

'output'

In [5]:
# Taxonomic level to consider for the analysis.
# Allowed values: 'G' (Genus) or 'S' (Species).
level = "S"

In [6]:
# Load the kraken2 classification table (one row per seq_id) and preview it.
kraken_csv_path = f'{filepath}/kraken2.csv'
kraken_df = pd.read_csv(kraken_csv_path)
kraken_df.head()

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name
0,08628297-d792-4b1b-8d58-40e7232f28d0,C,1637,G,Listeria
1,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,C,96241,S,Bacillus spizizenii
2,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,C,28901,S,Salmonella enterica
3,108c2d07-003c-468a-a896-20ed550cabe3,C,287,S,Pseudomonas aeruginosa
4,580a4fd7-b45b-4397-b86f-44edd7302ebc,C,1280,S,Staphylococcus aureus


# Get taxonomic levels

In [7]:
# Names given to the six columns of the header-less kraken output report.
# NOTE(review): per the Kraken 2 report format, 'count' looks like the reads in the
# whole clade and 'coverage' the reads assigned directly to the taxon — confirm.
columns = [
    'percentage',
    'count',
    'coverage',
    'taxon',        # rank code, e.g. 'G', 'S', 'S1'
    'taxonomy_id',
    'name',
]

In [14]:
# Parse the report file; it has no header row, so the column names are supplied.
report_path = f'{filepath}/report.txt'
df = pd.read_table(report_path, header=None, names=columns)

In [15]:
# (rows, columns) of the parsed report — a quick check that it loaded as expected.
df.shape

(1421, 6)

In [16]:
# Preview the first rows of the parsed report.
df.head()

Unnamed: 0,percentage,count,coverage,taxon,taxonomy_id,name
0,0.06,99,99,U,0,unclassified
1,99.94,179145,8,R,1,root
2,99.94,179136,12,R1,131567,cellular organisms
3,95.87,171833,14,D,2,Bacteria
4,57.71,103434,6,D1,1783272,Terrabacteria group


### Species Map

In [17]:
# Lookup from a taxon name (species or anything ranked below it) to its species name.
species_map = dict()

In [18]:
# Fill the name -> species lookup from the report.
#   * a species row (rank code 'S') maps to itself;
#   * a row ranked below a species ('S1', 'S2', ...) maps to the most recent
#     species row, i.e. the species it is listed under.
# This relies on the report listing a taxon's descendants directly after it.
species = None  # species whose sub-rows are being read; None outside any species
for taxon, raw_name in zip(df['taxon'], df['name']):
  name = raw_name.strip()  # drop the padding around the report's names
  if taxon == 'S':
    species = name
    species_map[name] = species
  elif re.match(r'^S\d', taxon):
    # Only record sub-species rows that actually have a parent species; this
    # avoids a NameError / stale label if such a row ever appears on its own.
    if species is not None:
      species_map[name] = species
  else:
    # Any other rank means we have left the previous species' subtree.
    species = None

In [19]:
# Inspect the resulting name -> species lookup.
species_map

{'Bacillus spizizenii': 'Bacillus spizizenii',
 'Bacillus spizizenii TU-B-10': 'Bacillus spizizenii',
 'Bacillus spizizenii ATCC 6633 = JCM 2499': 'Bacillus spizizenii',
 'Bacillus spizizenii str. W23': 'Bacillus spizizenii',
 'Bacillus subtilis': 'Bacillus subtilis',
 'Bacillus subtilis BSn5': 'Bacillus subtilis',
 'Bacillus subtilis subsp. subtilis': 'Bacillus subtilis',
 'Bacillus subtilis subsp. natto': 'Bacillus subtilis',
 'Bacillus velezensis': 'Bacillus velezensis',
 'Bacillus velezensis AS43.3': 'Bacillus velezensis',
 'Bacillus velezensis TrigoCor1448': 'Bacillus velezensis',
 'Bacillus velezensis NJN-6': 'Bacillus velezensis',
 'Bacillus amyloliquefaciens': 'Bacillus amyloliquefaciens',
 'Bacillus siamensis': 'Bacillus siamensis',
 'Bacillus vallismortis': 'Bacillus vallismortis',
 'Bacillus halotolerans': 'Bacillus halotolerans',
 'Bacillus mojavensis': 'Bacillus mojavensis',
 'Bacillus inaquosorum': 'Bacillus inaquosorum',
 'Bacillus tequilensis': 'Bacillus tequilensis',
 

### Genus Map

In [20]:
# Lookup from a taxon name (genus or anything ranked below it) to its genus name.
genus_map = dict()

In [21]:
# Fill the name -> genus lookup from the report.
#   * a genus row (rank code 'G') maps to itself;
#   * rows ranked below a genus ('G1', 'G2', ...) and all species-level rows
#     ('S', 'S1', ...) map to the most recent genus row.
# This relies on the report listing a taxon's descendants directly after it.
genus = None  # genus whose subtree is being read; None outside any genus
for taxon, raw_name in zip(df['taxon'], df['name']):
  name = raw_name.strip()  # drop the padding around the report's names
  if taxon == 'G':
    genus = name
    genus_map[name] = genus
  elif re.match(r'^G\d', taxon) or taxon.startswith('S'):
    # Rows with no enclosing genus are left out of the map (they end up as
    # 'unknown' downstream) instead of inheriting an unrelated genus.
    if genus is not None:
      genus_map[name] = genus
  else:
    # A rank above genus (family, order, ..., or their numbered sub-ranks) ends
    # the previous genus' subtree, so forget it. Without this reset, a species
    # listed under e.g. an "unclassified <family>" node would be assigned the
    # last genus seen before it.
    # NOTE(review): a species attached directly to a family and listed right
    # after a genus subtree cannot be told apart by rank code alone; the name
    # indentation depth could disambiguate — confirm whether this occurs.
    genus = None

In [22]:
# Inspect the resulting name -> genus lookup.
genus_map

{'Bacillus': 'Bacillus',
 'Bacillus subtilis group': 'Bacillus',
 'Bacillus spizizenii': 'Bacillus',
 'Bacillus spizizenii TU-B-10': 'Bacillus',
 'Bacillus spizizenii ATCC 6633 = JCM 2499': 'Bacillus',
 'Bacillus spizizenii str. W23': 'Bacillus',
 'Bacillus subtilis': 'Bacillus',
 'Bacillus subtilis BSn5': 'Bacillus',
 'Bacillus subtilis subsp. subtilis': 'Bacillus',
 'Bacillus subtilis subsp. natto': 'Bacillus',
 'Bacillus amyloliquefaciens group': 'Bacillus',
 'Bacillus velezensis': 'Bacillus',
 'Bacillus velezensis AS43.3': 'Bacillus',
 'Bacillus velezensis TrigoCor1448': 'Bacillus',
 'Bacillus velezensis NJN-6': 'Bacillus',
 'Bacillus amyloliquefaciens': 'Bacillus',
 'Bacillus siamensis': 'Bacillus',
 'Bacillus vallismortis': 'Bacillus',
 'Bacillus mojavensis subgroup': 'Bacillus',
 'Bacillus halotolerans': 'Bacillus',
 'Bacillus mojavensis': 'Bacillus',
 'Bacillus inaquosorum': 'Bacillus',
 'Bacillus tequilensis': 'Bacillus',
 'Bacillus stercoris': 'Bacillus',
 'Bacillus atrophaeu

# Kraken Labels

In [23]:
# Trim surrounding whitespace from the names so they match the lookup keys,
# which were stripped the same way. The vectorised .str accessor leaves missing
# names as NaN instead of raising like a per-element x.strip() would.
kraken_df['name'] = kraken_df['name'].str.strip()

In [24]:
# Attach a species and a genus label to every row by looking its name up in the
# corresponding map; names missing from a map are labelled 'unknown'.
for label_column, lookup in (('species', species_map), ('genus', genus_map)):
  kraken_df[label_column] = kraken_df['name'].map(lookup).fillna('unknown')
kraken_df

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name,species,genus
0,08628297-d792-4b1b-8d58-40e7232f28d0,C,1637,G,Listeria,unknown,Listeria
1,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,C,96241,S,Bacillus spizizenii,Bacillus spizizenii,Bacillus
2,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,C,28901,S,Salmonella enterica,Salmonella enterica,Salmonella
3,108c2d07-003c-468a-a896-20ed550cabe3,C,287,S,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas
4,580a4fd7-b45b-4397-b86f-44edd7302ebc,C,1280,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus
...,...,...,...,...,...,...,...
179239,283ad856-e124-4bae-bdaa-1eb62d3d6486,C,1639,S,Listeria monocytogenes,Listeria monocytogenes,Listeria
179240,a6d5187d-cc1b-4fbb-9387-87aad260d47c,C,562,S,Escherichia coli,Escherichia coli,Escherichia
179241,a6741036-5494-4df6-a42c-7174d639d50c,C,287,S,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas
179242,9a8fa5ea-bb75-44de-9789-4289e804c35f,C,287,S,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas


In [25]:
# Show the directory the results below are written to.
filepath

'output'

In [26]:
# Split the row index by label availability: rows with a species label form the
# training set, rows labelled 'unknown' form the test set.
has_species = kraken_df['species'] != 'unknown'
train_idx = kraken_df[has_species].index
test_idx = kraken_df[~has_species].index
train_idx.shape, test_idx.shape

((173603,), (5641,))

In [31]:
# Species label of every row (including 'unknown') as a NumPy array.
species_column = kraken_df['species']
labels = species_column.to_numpy()
labels.shape

(179244,)

In [32]:
# Persist the labelled table and the index/label arrays into the working directory.
# index=False is the documented way to omit the row index from the CSV.
kraken_df.to_csv(f'{filepath}/kraken_final.csv', index=False)
np.save(f'{filepath}/train_idx.npy', train_idx)
np.save(f'{filepath}/test_idx.npy', test_idx)
# NOTE(review): labels is presumably an object-dtype array of strings, which
# np.save stores via pickle — loading it would then need allow_pickle=True.
np.save(f'{filepath}/labels.npy', labels)