In [7]:
# Attach Google Drive to the Colab runtime so that paths under /content/drive resolve.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re

In [9]:
# Experiment selection: dataset name, bin count and version.
# Note: bin_count only applies to the simulated ('simlord') dataset; for any
# other dataset, bins is set to None.
dataset = "simlord"
bin_count = 10
version = 2

if dataset == "simlord":
    bins = bin_count
else:
    bins = None

In [10]:
# Directory holding the Kraken2 outputs for the selected experiment.
# With a bin count the layout is <dataset>/bin_<bins>/v<version>; when bins is
# None the bin level is left out. The None branch used to emit '/' right before
# the literal '/v', which produced '<dataset>//v<version>'.
result_path = f"/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/{dataset}{'/bin_' + str(bins) if bins is not None else ''}/v{version}"
result_path

'/content/drive/MyDrive/FYP/FYP/Tools Outputs/kraken2/output fa/simlord/bin_10/v2'

## Analysis of report

In [11]:
# Labels applied, in file order, to the columns of the Kraken2 report.
# NOTE(review): per the Kraken2 manual the third report field is the number of
# fragments assigned directly to the taxon, so 'coverage' may be a misnomer — confirm.
columns = [
    'percentage',
    'count',
    'coverage',
    'taxon',
    'taxonomy_id',
    'name',
]

In [12]:
# Load the tab-separated Kraken2 report. The file is read without a header row,
# so the labels in `columns` are applied instead.
# NOTE(review): the Kraken2 manual says the name field is space-indented by
# taxonomy depth; nothing here strips it — confirm whether that matters downstream.
df = pd.read_table(
    f'{result_path}/report.txt',
    header=None,
    names=columns,
)

In [13]:
# Show the (rows, columns) dimensions of the report table.
df.shape

(2303, 6)

In [14]:
# Preview the first rows of the report to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,percentage,count,coverage,taxon,taxonomy_id,name
0,0.03,158,158,U,0,unclassified
1,99.97,499842,197,R,1,root
2,99.93,499644,346,R1,131567,cellular organisms
3,80.63,403161,236,D,2,Bacteria
4,50.6,253002,118,D1,1783272,Terrabacteria group


## Analysis of classification result

In [15]:
# Labels for the per-sequence Kraken2 classification output (one row per
# sequence) — distinct from the report columns.
columns_ = [
    'status',
    'seq_id',
    'taxonomy_id',
    'length',
    'mapping',
]

In [16]:
# Load the tab-separated per-sequence classification output; the file is read
# without a header row, so the labels in `columns_` are applied.
results_df = pd.read_csv(
    f'{result_path}/output.txt',
    sep='\t',
    header=None,
    names=columns_,
)

In [17]:
# Preview the first classification rows.
results_df.head()

Unnamed: 0,status,seq_id,taxonomy_id,length,mapping
0,C,seq1,1613,5000,0:78 1613:4 0:3 1613:2 0:13 1613:5 0:9 1613:2 ...
1,C,seq2,1279,5000,0:187 1279:5 0:489 1279:1 0:444 1279:2 0:52 12...
2,C,seq3,1280,5000,0:121 1279:3 0:24 1279:2 0:178 1279:6 0:12 127...
3,C,seq4,1613,5000,0:490 1613:5 0:2 1613:5 0:79 2:5 0:114 1613:5 ...
4,C,seq5,59201,5000,0:61 590:3 0:57 590:1 0:233 543:2 0:6 590:2 0:...


In [18]:
# Show the (rows, columns) dimensions of the classification table.
results_df.shape

(500000, 5)

In [19]:
# Attach the report's `taxon` and `name` columns to every classified sequence,
# joining on taxonomy_id.
# The left join keeps every row of results_df; a taxonomy_id missing from the
# report yields NaN for taxon/name. validate='many_to_one' makes pandas raise
# MergeError if taxonomy_id is duplicated in the report, which would otherwise
# silently duplicate sequence rows.
merged_df = pd.merge(
    results_df[['seq_id', 'status', 'taxonomy_id']],
    df[['taxonomy_id', 'taxon', 'name']],
    on='taxonomy_id',
    how='left',
    validate='many_to_one',
)

In [20]:
# Preview the first 20 merged rows to check that taxon and name were attached.
merged_df.head(20)

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name
0,seq1,C,1613,S,Limosilactobacillus fermentum
1,seq2,C,1279,G,Staphylococcus
2,seq3,C,1280,S,Staphylococcus aureus
3,seq4,C,1613,S,Limosilactobacillus fermentum
4,seq5,C,59201,S1,Salmonella enterica subsp. e...
5,seq6,C,96241,S,Bacillus spizizenii
6,seq7,C,1280,S,Staphylococcus aureus
7,seq8,C,1637,G,Listeria
8,seq9,C,1351,S,Enterococcus faecalis
9,seq10,C,59201,S1,Salmonella enterica subsp. e...


In [21]:
# Save the merged per-sequence table as kraken2.csv under result_path,
# leaving out the DataFrame index.
merged_df.to_csv(
    f'{result_path}/kraken2.csv',
    index=False,
)

# Analysis of Results

In [22]:
# Labels applied, in file order, to the columns of the Kraken2 report
# (same list as declared in the earlier report-analysis section).
columns = [
    'percentage',
    'count',
    'coverage',
    'taxon',
    'taxonomy_id',
    'name',
]

In [23]:
# Load the tab-separated Kraken2 report again, applying the labels in
# `columns` because the file is read without a header row.
df = pd.read_table(
    f'{result_path}/report.txt',
    header=None,
    names=columns,
)

In [24]:
# Preview the first rows of the report.
df.head()

Unnamed: 0,percentage,count,coverage,taxon,taxonomy_id,name
0,0.03,158,158,U,0,unclassified
1,99.97,499842,197,R,1,root
2,99.93,499644,346,R1,131567,cellular organisms
3,80.63,403161,236,D,2,Bacteria
4,50.6,253002,118,D1,1783272,Terrabacteria group


In [25]:
# Show the (rows, columns) dimensions of the report table.
df.shape

(2303, 6)

In [None]:
# Labels for the per-sequence Kraken2 classification output (one row per
# sequence) — distinct from the report columns.
columns_ = [
    'status',
    'seq_id',
    'taxonomy_id',
    'length',
    'mapping',
]

In [None]:
# Load the tab-separated per-sequence classification output (no header row).
# NOTE(review): this cell reads 'classification_results.txt', whereas the earlier
# section reads 'output.txt' from the same result_path — confirm which file
# name is correct for the selected dataset.
results_df = pd.read_csv(
    f'{result_path}/classification_results.txt',
    sep='\t',
    header=None,
    names=columns_,
)

In [None]:
# Preview the first classification rows.
results_df.head()

Unnamed: 0,status,seq_id,taxonomy_id,length,mapping
0,C,4d4262d4-c552-4b8b-a09f-fc9f58c6e283,1279,503,0:55 1279:3 0:223 1279:5 0:20 1279:1 0:32 1279...
1,C,cd4133d1-f92f-44d0-a85c-3ae93f036256,1613,272,0:69 1613:1 0:25 1613:1 0:5 1613:1 0:37 1613:6...
2,C,fcf98c1d-38ee-4b11-a243-460abb9d6733,1613,390,0:52 2:1 0:7 2:5 0:103 1613:1 0:5 1613:2 0:66 ...
3,U,91694d0d-293e-418e-ab8b-fd2ecb53610b,0,235,0:201
4,C,a955709e-96d8-450d-8991-d9cce436d288,1279,237,0:99 1279:5 0:20 1279:5 0:2 1279:2 0:70


In [None]:
# Attach the report's `taxon` and `name` columns to every classified sequence,
# joining on taxonomy_id.
# The left join keeps every row of results_df; a taxonomy_id missing from the
# report yields NaN for taxon/name. validate='many_to_one' makes pandas raise
# MergeError if taxonomy_id is duplicated in the report, which would otherwise
# silently duplicate sequence rows.
merged_df = pd.merge(
    results_df[['seq_id', 'status', 'taxonomy_id']],
    df[['taxonomy_id', 'taxon', 'name']],
    on='taxonomy_id',
    how='left',
    validate='many_to_one',
)

In [None]:
# Preview the first 20 merged rows to check that taxon and name were attached.
merged_df.head(20)

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name
0,4d4262d4-c552-4b8b-a09f-fc9f58c6e283,C,1279,G,Staphylococcus
1,cd4133d1-f92f-44d0-a85c-3ae93f036256,C,1613,S,Limosilactobacillus fermentum
2,fcf98c1d-38ee-4b11-a243-460abb9d6733,C,1613,S,Limosilactobacillus fermentum
3,91694d0d-293e-418e-ab8b-fd2ecb53610b,U,0,U,unclassified
4,a955709e-96d8-450d-8991-d9cce436d288,C,1279,G,Staphylococcus
5,0dfc8a31-5140-473e-b707-d6dd314f3347,C,1613,S,Limosilactobacillus fermentum
6,e6353177-62b7-4f0f-9464-b80b81851aa3,U,0,U,unclassified
7,9f6f6c5f-80e7-4335-aae2-b7f4666c4e3a,C,33958,F,Lactobacillaceae
8,1c09cf9d-c569-4489-818e-8c0555afc2cb,C,543,F,Enterobacteriaceae
9,43edd670-7e2a-4ea7-8d2f-6879a5f7d93a,C,1613,S,Limosilactobacillus fermentum


In [None]:
# Save the merged per-sequence table as kraken2.csv under result_path,
# leaving out the DataFrame index.
# NOTE: this is the same target path as the earlier save cell, so with an
# unchanged result_path the file written there is overwritten.
merged_df.to_csv(
    f'{result_path}/kraken2.csv',
    index=False,
)