In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re

In [3]:
# Mention dataset, bin count and version for experiment
# Note: bin_count is only applicable for simulated datasets. For real datasets, it is assign None.
dataset = 'simlord'
bin_count = 10
version = 2

bins = bin_count if dataset == 'simlord' else None

In [4]:
# Define the path to the csv file generated using the kraken2 tool
filepath = f"/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/{dataset}{'/bin_'+str(bins) if bins != None else '/'}/v{version}"
filepath

'/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/simlord/bin_10/v2'

In [5]:
# Specify the taxonomic level to be considered for the analysis
# Options: 'G', 'S' for Genus, Species respectively
level = 'S' 

In [6]:
kraken_df = pd.read_csv(f'{filepath}/kraken2.csv')
kraken_df.head()

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name
0,seq1,C,1613,S,Limosilactobacillus fermentum
1,seq2,C,1279,G,Staphylococcus
2,seq3,C,1280,S,Staphylococcus aureus
3,seq4,C,1613,S,Limosilactobacillus fermentum
4,seq5,C,59201,S1,Salmonella enterica subsp. e...


# Get taxonomic levels

In [7]:
# columns of kraken output report
columns = ['percentage', 'count', 'coverage', 'taxon', 'taxonomy_id', 'name']

In [8]:
# read the report file
df = pd.read_table(f'{filepath}/report.txt',header=None,names=columns)

In [9]:
df.shape

(2303, 6)

In [10]:
df.head()

Unnamed: 0,percentage,count,coverage,taxon,taxonomy_id,name
0,0.03,158,158,U,0,unclassified
1,99.97,499842,197,R,1,root
2,99.93,499644,346,R1,131567,cellular organisms
3,80.63,403161,236,D,2,Bacteria
4,50.6,253002,118,D1,1783272,Terrabacteria group


### Species Map

In [11]:
species_map = { }

In [12]:
idx = 0
while idx < df.shape[0]:
  taxon = df.iloc[idx,-3]
  name = df.iloc[idx,-1].strip()
  if taxon == 'S':
    species = name
    species_map[name] = species
  if bool(re.match(r'^S\d', taxon)):
    # print(name)
    species_map[name] = species
  idx+=1

In [13]:
species_map

{'Bacillus spizizenii': 'Bacillus spizizenii',
 'Bacillus spizizenii TU-B-10': 'Bacillus spizizenii',
 'Bacillus spizizenii ATCC 6633 = JCM 2499': 'Bacillus spizizenii',
 'Bacillus spizizenii str. W23': 'Bacillus spizizenii',
 'Bacillus subtilis': 'Bacillus subtilis',
 'Bacillus subtilis subsp. subtilis': 'Bacillus subtilis',
 'Bacillus subtilis subsp. subtilis str. 168': 'Bacillus subtilis',
 'Bacillus subtilis subsp. subtilis str. BSP1': 'Bacillus subtilis',
 'Bacillus subtilis subsp. subtilis NCIB 3610 = ATCC 6051 = DSM 10': 'Bacillus subtilis',
 'Bacillus subtilis PY79': 'Bacillus subtilis',
 'Bacillus velezensis': 'Bacillus velezensis',
 'Bacillus velezensis NJN-6': 'Bacillus velezensis',
 'Bacillus velezensis TrigoCor1448': 'Bacillus velezensis',
 'Bacillus amyloliquefaciens': 'Bacillus amyloliquefaciens',
 'Bacillus amyloliquefaciens KHG19': 'Bacillus amyloliquefaciens',
 'Bacillus amyloliquefaciens DSM 7 = ATCC 23350': 'Bacillus amyloliquefaciens',
 'Bacillus halotolerans': 'Ba

### Genus Map

In [14]:
genus_map = { }

In [15]:
idx = 0
while idx < df.shape[0]:
  taxon = df.iloc[idx,-3]
  name = df.iloc[idx,-1].strip()
  if taxon == 'G':
    genus = name
    genus_map[name] = genus
  if bool(re.match(r'^G\d', taxon)) or taxon.startswith('S'):
    # print(name)
    genus_map[name] = genus
  idx+=1

In [16]:
genus_map

{'Bacillus': 'Bacillus',
 'Bacillus subtilis group': 'Bacillus',
 'Bacillus spizizenii': 'Bacillus',
 'Bacillus spizizenii TU-B-10': 'Bacillus',
 'Bacillus spizizenii ATCC 6633 = JCM 2499': 'Bacillus',
 'Bacillus spizizenii str. W23': 'Bacillus',
 'Bacillus subtilis': 'Bacillus',
 'Bacillus subtilis subsp. subtilis': 'Bacillus',
 'Bacillus subtilis subsp. subtilis str. 168': 'Bacillus',
 'Bacillus subtilis subsp. subtilis str. BSP1': 'Bacillus',
 'Bacillus subtilis subsp. subtilis NCIB 3610 = ATCC 6051 = DSM 10': 'Bacillus',
 'Bacillus subtilis PY79': 'Bacillus',
 'Bacillus amyloliquefaciens group': 'Bacillus',
 'Bacillus velezensis': 'Bacillus',
 'Bacillus velezensis NJN-6': 'Bacillus',
 'Bacillus velezensis TrigoCor1448': 'Bacillus',
 'Bacillus amyloliquefaciens': 'Bacillus',
 'Bacillus amyloliquefaciens KHG19': 'Bacillus',
 'Bacillus amyloliquefaciens DSM 7 = ATCC 23350': 'Bacillus',
 'Bacillus mojavensis subgroup': 'Bacillus',
 'Bacillus halotolerans': 'Bacillus',
 'Bacillus mojave

# Kraken Labels

In [17]:
kraken_df['name'] = kraken_df['name'].apply(lambda x:x.strip())

In [18]:
# Label species classified up to the specified level as it as, else label as unknown
kraken_df['species'] = kraken_df['name'].map(species_map).fillna('unknown')
kraken_df['genus'] = kraken_df['name'].map(genus_map).fillna('unknown')
kraken_df

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name,species,genus
0,seq1,C,1613,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
1,seq2,C,1279,G,Staphylococcus,unknown,Staphylococcus
2,seq3,C,1280,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus
3,seq4,C,1613,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
4,seq5,C,59201,S1,Salmonella enterica subsp. enterica,Salmonella enterica,Salmonella
...,...,...,...,...,...,...,...
499995,seq499996,C,559292,S1,Saccharomyces cerevisiae S288C,Saccharomyces cerevisiae,Saccharomyces
499996,seq499997,C,40410,S1,Cryptococcus neoformans var. neoformans,Cryptococcus neoformans,Cryptococcus
499997,seq499998,C,1351,S,Enterococcus faecalis,Enterococcus faecalis,Enterococcus
499998,seq499999,C,287,S,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas


In [19]:
filepath

'/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/simlord/bin_10/v2'

In [20]:
kraken_df.to_csv(f'{filepath}/kraken_final.csv',index=None)