⚠️ If you are mounting your Google Drive in Colab, run the following cell.

In [52]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports 

In [53]:
import pandas as pd
import numpy as np

In [55]:
# Directory holding the Kraken2 results: `kraken_final.csv` is read from it and
# the merged table is written back into it at the end of the notebook.
# The value below is a placeholder -- replace it with your own directory.
kraken_path = "output"
kraken_path

'/content/drive/MyDrive/fyp code/Tools Outputs/kraken2/output fa/simlord/bin_10/v2'

In [None]:
# Directory holding the ground-truth labels (`ground_truth.csv` is read from it).
# The value below is a placeholder -- replace it with your own directory.
label_path = "label_path"
label_path

'/content/drive/MyDrive/fyp code/Tools Outputs/minimap2/simlord/bin_10/v2'

1. Kraken2 output

In [56]:
# Load the Kraken2 classification table from `kraken_path` and preview it.
kraken_df = pd.read_csv(f'{kraken_path}/kraken_final.csv')
kraken_df.head()

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name,species,genus
0,seq1,C,1613,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
1,seq2,C,1279,G,Staphylococcus,unknown,Staphylococcus
2,seq3,C,1280,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus
3,seq4,C,1613,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
4,seq5,C,59201,S1,Salmonella enterica subsp. enterica,Salmonella enterica,Salmonella


In [57]:
# Label the species column as Kraken2's prediction so it stays unambiguous
# once it sits next to the ground-truth column.
kraken_df = kraken_df.rename(columns={'species': 'name_kraken'})

In [58]:
# Sanity check: (rows, columns) of the Kraken2 table.
kraken_df.shape

(500000, 7)

2. Ground Truth

In [60]:
# Load the ground-truth labels from `label_path` and preview them.
truth_df = pd.read_csv(f'{label_path}/ground_truth.csv')
truth_df.head()

Unnamed: 0,seq_id,name
0,seq1,Lactobacillus_fermentum_complete_genome
1,seq2,Staphylococcus_aureus_chromosome
2,seq3,Staphylococcus_aureus_chromosome
3,seq4,Lactobacillus_fermentum_complete_genome
4,seq5,Salmonella_enterica_complete_genome


In [61]:
# (rows, columns) of the label table before de-duplication.
truth_df.shape

(532408, 2)

In [62]:
# Give the label column an explicit name: 'name' -> 'ground_truth'.
truth_df = truth_df.rename(columns={'name': 'ground_truth'})

In [64]:
# Keep a single row per seq_id (pandas keeps the first occurrence by default).
truth_df = truth_df.drop_duplicates(subset=['seq_id'])

In [65]:
# (rows, columns) of the label table after dropping duplicate seq_ids.
truth_df.shape

(500000, 2)

### Merge outputs

In [66]:
# Left-join Kraken2 predictions onto the labels by seq_id: every labelled read
# is kept, and reads absent from the Kraken2 table get NaN in its columns.
combined_df = truth_df.merge(kraken_df, how='left', on='seq_id')

In [67]:
# Inspect the column names of the merged table.
combined_df.columns

Index(['seq_id', 'name_minimap', 'status', 'taxonomy_id', 'taxon', 'name',
       'name_kraken', 'genus'],
      dtype='object')

In [68]:
# (rows, columns) of the merged table.
combined_df.shape

(500000, 8)

In [69]:
# Discard the Kraken2 columns that are not used in the rest of this notebook.
combined_df = combined_df.drop(columns=['status', 'taxonomy_id'])

In [70]:
# Preview the first 50 merged rows.
combined_df.head(50)

Unnamed: 0,seq_id,name_minimap,taxon,name,name_kraken,genus
0,seq1,Lactobacillus_fermentum_complete_genome,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
1,seq2,Staphylococcus_aureus_chromosome,G,Staphylococcus,unknown,Staphylococcus
2,seq3,Staphylococcus_aureus_chromosome,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus
3,seq4,Lactobacillus_fermentum_complete_genome,S,Limosilactobacillus fermentum,Limosilactobacillus fermentum,Limosilactobacillus
4,seq5,Salmonella_enterica_complete_genome,S1,Salmonella enterica subsp. enterica,Salmonella enterica,Salmonella
5,seq6,BS.pilon.polished.v3.ST170922,S,Bacillus spizizenii,Bacillus spizizenii,Bacillus
6,seq7,Staphylococcus_aureus_chromosome,S,Staphylococcus aureus,Staphylococcus aureus,Staphylococcus
7,seq8,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria
8,seq9,Enterococcus_faecalis_complete_genome,S,Enterococcus faecalis,Enterococcus faecalis,Enterococcus
9,seq10,Salmonella_enterica_complete_genome,S1,Salmonella enterica subsp. enterica,Salmonella enterica,Salmonella


In [71]:
# Trim leading/trailing whitespace from both name columns.
# The vectorised `.str.strip()` passes missing values through as NaN, so reads
# that are absent from the Kraken2 table (NaN after the left merge) do not
# raise, whereas calling `.strip()` on each element fails on a float NaN.
combined_df['name_kraken'] = combined_df['name_kraken'].str.strip()
combined_df['ground_truth'] = combined_df['ground_truth'].str.strip()

In [72]:
# Number of reads per ground-truth label, before any label renaming.
combined_df[['ground_truth']].value_counts()

name_minimap                           
Listeria_monocytogenes_complete_genome     50039
Cryptococcus neoformans                    50009
BS.pilon.polished.v3.ST170922              50000
Lactobacillus_fermentum_complete_genome    50000
Pseudomonas_aeruginosa_complete_genome     50000
Salmonella_enterica_complete_genome        50000
Enterococcus_faecalis_complete_genome      49961
Staphylococcus_aureus_chromosome           49879
Saccharomyces cerevisiae                   49217
Escherichia_coli_chromosome                48907
Escherichia_coli_plasmid                    1093
None                                         774
Staphylococcus_aureus_plasmid1               121
dtype: int64

In [73]:
# The 40 most frequent Kraken2 species assignments (counts in descending order).
combined_df[['name_kraken']].value_counts(sort=True).head(40)

name_kraken                  
unknown                          54525
Cryptococcus neoformans          49987
Limosilactobacillus fermentum    49949
Enterococcus faecalis            48970
Bacillus spizizenii              48701
Salmonella enterica              45495
Saccharomyces cerevisiae         44524
Staphylococcus aureus            44487
Pseudomonas aeruginosa           44143
Escherichia coli                 32584
Listeria monocytogenes           23428
Listeria innocua                  2112
Actinomyces oris                  1397
Bacillus wiedmannii                977
Listeria welshimeri                560
Shigella flexneri                  463
Listeria seeligeri                 420
Escherichia albertii               416
Listeria ivanovii                  399
Escherichia fergusonii             295
Shigella dysenteriae               278
Escherichia marmotae               212
Bacillus subtilis                  207
Staphylococcus argenteus           191
Shigella boydii                   

## Mapping subspecies

Some ground-truth labels name a sub-species, strain, or individual sequence (for example a single chromosome or plasmid). Because classification is evaluated at species level, these labels must be renamed to the species name.\
List those species names below.

In [78]:
# Species-level names (underscore-separated). Any ground-truth label that
# contains one of these as a substring is collapsed to it in the next step.
genomes = [
    'Saccharomyces_cerevisiae',
    'Cryptococcus_neoformans',
    'Escherichia_coli',
    'Staphylococcus_aureus'
]

# Distinct ground-truth labels. Sorted so the order is reproducible across
# kernel restarts (iteration order of a set of strings is not), and with
# missing labels dropped because the substring match below needs strings.
all_species = sorted(combined_df['ground_truth'].dropna().unique())

In [79]:
# Map every full label to the species name it contains. `genomes` is the outer
# iteration, so a label matching several names ends up with the last match.
mapping = {
    species: genome
    for genome in genomes
    for species in all_species
    if genome in species
}

In [80]:
# Show the label -> species mapping so it can be checked by eye.
mapping

{'CP006167.2_Saccharomyces_cerevisiae_YJM1307_chromosome_XVI_sequence': 'Saccharomyces_cerevisiae',
 'CP005636.2_Saccharomyces_cerevisiae_YJM1307_chromosome_XV_sequence': 'Saccharomyces_cerevisiae',
 'CP005031.2_Saccharomyces_cerevisiae_YJM1307_chromosome_IX_sequence': 'Saccharomyces_cerevisiae',
 'CP006075.2_Saccharomyces_cerevisiae_YJM1307_chromosome_VIII_sequence': 'Saccharomyces_cerevisiae',
 'CP006391.1_Saccharomyces_cerevisiae_YJM1307_chromosome_XII_sequence': 'Saccharomyces_cerevisiae',
 'CP004640.2_Saccharomyces_cerevisiae_YJM1307_chromosome_II_genomic_sequence': 'Saccharomyces_cerevisiae',
 'CP006287.1_Saccharomyces_cerevisiae_YJM1307_chromosome_III': 'Saccharomyces_cerevisiae',
 'CP005144.1_Saccharomyces_cerevisiae_YJM1307_chromosome_X_sequence': 'Saccharomyces_cerevisiae',
 'CP005339.2_Saccharomyces_cerevisiae_YJM1307_chromosome_XI_sequence': 'Saccharomyces_cerevisiae',
 'CP004458.2_Saccharomyces_cerevisiae_YJM1307_chromosome_I_genomic_sequence': 'Saccharomyces_cerevisiae',


In [81]:
# Replace each label that appears as a key in `mapping` with its species name;
# labels that are not keys are left unchanged.
combined_df['ground_truth'] = combined_df['ground_truth'].replace(mapping)

In [82]:
# Number of reads per ground-truth label after collapsing to species names.
combined_df[['ground_truth']].value_counts()

ground_truth                                   
BS.pilon.polished.v3.ST170922                      50000
Cryptococcus_neoformans                            50000
Enterococcus_faecalis_complete_genome_2.845Mb      50000
Escherichia_coli                                   50000
Lactobacillus_fermentum_complete_genome_1.905Mb    50000
Listeria_monocytogenes_complete_genome_2.992Mb     50000
Pseudomonas_aeruginosa_complete_genome_6,792Mb     50000
Saccharomyces_cerevisiae                           50000
Salmonella_enterica_complete_genome_4.760Mb        50000
Staphylococcus_aureus                              50000
dtype: int64

In [84]:
# The 20 most frequent Kraken2 species assignments, for comparison with the labels.
combined_df[['name_kraken']].value_counts().head(20)

name_kraken                  
unknown                          54525
Cryptococcus neoformans          49987
Limosilactobacillus fermentum    49949
Enterococcus faecalis            48970
Bacillus spizizenii              48701
Salmonella enterica              45495
Saccharomyces cerevisiae         44524
Staphylococcus aureus            44487
Pseudomonas aeruginosa           44143
Escherichia coli                 32584
Listeria monocytogenes           23428
Listeria innocua                  2112
Actinomyces oris                  1397
Bacillus wiedmannii                977
Listeria welshimeri                560
Shigella flexneri                  463
Listeria seeligeri                 420
Escherichia albertii               416
Listeria ivanovii                  399
Escherichia fergusonii             295
dtype: int64

# Mapping names

This is done when the Kraken2 names and the ground-truth label names do not match.\
For example, **Lactobacillus_fermentum_complete_genome** and **Limosilactobacillus fermentum** are the same species.


In [85]:
# Ground-truth label -> species name in the spelling used by Kraken2.
# Keys are matched exactly, so each one must equal a label as it appears in
# `combined_df['ground_truth']` at this point in the notebook.
kraken_map = {
    'Lactobacillus_fermentum_complete_genome_1.905Mb' : 'Limosilactobacillus fermentum',
    'Enterococcus_faecalis_complete_genome_2.845Mb' : 'Enterococcus faecalis',
    'Escherichia_coli': 'Escherichia coli',
    'Listeria_monocytogenes_complete_genome_2.992Mb' : 'Listeria monocytogenes',
    'Pseudomonas_aeruginosa_complete_genome_6,792Mb' : 'Pseudomonas aeruginosa',
    'Salmonella_enterica_complete_genome_4.760Mb' : 'Salmonella enterica',
    'Staphylococcus_aureus' : 'Staphylococcus aureus',
    # NOTE(review): in the run shown in this notebook Kraken2 reports most of
    # these reads as 'Bacillus spizizenii', so they will not match
    # 'Bacillus subtilis' -- confirm which name is intended.
    'BS.pilon.polished.v3.ST170922' : 'Bacillus subtilis',
    'Saccharomyces_cerevisiae' : 'Saccharomyces cerevisiae',
    'Cryptococcus_neoformans' : 'Cryptococcus neoformans'
}

In [86]:
# Rewrite the ground-truth labels into Kraken2's species spelling; labels that
# are not keys of `kraken_map` are left unchanged.
combined_df['ground_truth'] = combined_df['ground_truth'].replace(kraken_map)

In [87]:
# Count the reads for every (ground truth, Kraken2 prediction) pair and hold
# the counts in a DataFrame for inspection.
mapping_df = (
    combined_df
    .loc[:, ['ground_truth', 'name_kraken']]
    .value_counts()
    .to_frame()
)
mapping_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0
ground_truth,name_kraken,Unnamed: 2_level_1
Cryptococcus neoformans,Cryptococcus neoformans,49986
Limosilactobacillus fermentum,Limosilactobacillus fermentum,49949
Enterococcus faecalis,Enterococcus faecalis,48945
Bacillus subtilis,Bacillus spizizenii,48701
Salmonella enterica,Salmonella enterica,45315
...,...,...
Pseudomonas aeruginosa,Pseudomonas koreensis,1
Pseudomonas aeruginosa,Pseudomonas juntendi,1
Pseudomonas aeruginosa,Pseudomonas iranensis,1
Pseudomonas aeruginosa,Pseudomonas graminis,1


In [88]:
# Rows whose Kraken2 species is the literal string 'unknown'.
combined_df.loc[combined_df['name_kraken'].eq('unknown')]

Unnamed: 0,seq_id,name_minimap,taxon,name,name_kraken,genus,ground_truth
1,seq2,Staphylococcus_aureus_chromosome,G,Staphylococcus,unknown,Staphylococcus,Staphylococcus aureus
7,seq8,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria,Listeria monocytogenes
20,seq21,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria,Listeria monocytogenes
30,seq31,Pseudomonas_aeruginosa_complete_genome,G,Pseudomonas,unknown,Pseudomonas,Pseudomonas aeruginosa
39,seq40,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria,Listeria monocytogenes
...,...,...,...,...,...,...,...
499955,seq499956,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria,Listeria monocytogenes
499961,seq499962,Escherichia_coli_chromosome,F,Enterobacteriaceae,unknown,unknown,Escherichia coli
499982,seq499983,Listeria_monocytogenes_complete_genome,G,Listeria,unknown,Listeria,Listeria monocytogenes
499985,seq499986,Pseudomonas_aeruginosa_complete_genome,P,Pseudomonadota,unknown,unknown,Pseudomonas aeruginosa


In [89]:
# Final (rows, columns) of the merged table before it is written to disk.
combined_df.shape

(500000, 7)

In [90]:
# Write the merged table into the Kraken2 output directory without the row
# index. `index` is documented as a bool, so pass False explicitly rather than
# relying on None being falsy.
combined_df.to_csv(f'{kraken_path}/kraken_minimap.csv', index=False)