⚠️ If you are running this notebook in Colab and want to mount your Google Drive, run the following cell.

In [1]:
# Mount Google Drive when running inside Google Colab.
# The google.colab package is only importable inside a Colab runtime, so the
# import is guarded: outside Colab the mount is skipped instead of aborting
# a "Restart & Run All" with ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re

In [4]:
# Directory that holds the Kraken2 classification results
# (a relative path, resolved against the notebook's working directory).
# Plain string literal: there is nothing to interpolate, so no f-prefix.
result_path = "output"
result_path

'output'

## Analysis of report

In [5]:
# Names for the six tab-separated fields of the Kraken2 report file.
# Field meanings follow the Kraken2 manual's standard report format
# (NOTE(review): confirm against the Kraken2 version that produced the file).
columns = [
    'percentage',   # % of reads in the clade rooted at this taxon
    'count',        # number of reads in the clade rooted at this taxon
    'coverage',     # number of reads assigned directly to this taxon
    'taxon',        # rank code, e.g. U, R, D, G, S (optionally with a numeric suffix)
    'taxonomy_id',  # taxonomy ID; used later as the merge key
    'name',         # scientific name
]

In [6]:
# Read the Kraken2 report: tab-separated, no header row, so column names are
# supplied explicitly.
# Kraken2 pads each scientific name with leading spaces to show its depth in
# the taxonomy tree. That padding is stripped here so `name` holds clean
# labels for display, grouping, and the CSV written later.
df = (
    pd.read_table(f'{result_path}/report.txt', header=None, names=columns)
    .assign(name=lambda report: report['name'].str.strip())
)

In [7]:
# (rows, columns) of the parsed report table
df.shape

(1421, 6)

In [8]:
# Preview the first five rows of the report table
df.head()

Unnamed: 0,percentage,count,coverage,taxon,taxonomy_id,name
0,0.06,99,99,U,0,unclassified
1,99.94,179145,8,R,1,root
2,99.94,179136,12,R1,131567,cellular organisms
3,95.87,171833,14,D,2,Bacteria
4,57.71,103434,6,D1,1783272,Terrabacteria group


## Analysis of classification result

In [9]:
# Names for the five tab-separated fields of Kraken2's per-read classification
# output (one row per sequence), as opposed to the summary report.
# Field meanings follow the Kraken2 manual's standard output format
# (NOTE(review): confirm against the Kraken2 version that produced the file).
columns_ = [
    'status',       # classification flag ('C' in the rows previewed below)
    'seq_id',       # sequence / read identifier
    'taxonomy_id',  # taxonomy ID assigned to the read; used later as the merge key
    'length',       # sequence length
    'mapping',      # space-separated "taxid:count" pairs
]

In [10]:
# Load the per-read classification table (tab-separated, no header row).
results_df = pd.read_csv(
    f'{result_path}/output.txt',
    sep='\t',
    header=None,
    names=columns_,
)

In [11]:
# Preview the first five per-read classification rows
results_df.head()

Unnamed: 0,status,seq_id,taxonomy_id,length,mapping
0,C,08628297-d792-4b1b-8d58-40e7232f28d0,1637,5435,0:138 1637:5 0:30 1637:1 0:34 1637:1 0:8 1637:...
1,C,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,96241,5331,0:120 1386:5 96241:3 0:61 1386:5 0:5 1386:5 0:...
2,C,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,28901,5034,0:102 1:1 0:16 590:5 0:30 1:5 0:3 1:2 0:81 289...
3,C,108c2d07-003c-468a-a896-20ed550cabe3,287,5091,0:1169 287:1 0:695 286:4 0:890 135621:4 0:93 1...
4,C,580a4fd7-b45b-4397-b86f-44edd7302ebc,1280,5061,0:49 1279:5 0:181 1279:1 0:35 1279:3 0:5 1279:...


In [12]:
# (rows, columns) of the per-read classification table
results_df.shape

(179244, 5)

In [13]:
# Annotate every read with the rank code (`taxon`) and scientific name of the
# taxonomy_id it was assigned.
# - how='left' keeps every row of results_df; a read whose taxonomy_id has no
#   row in the report gets NaN for taxon/name.
# - validate='many_to_one' makes pandas raise MergeError if a taxonomy_id
#   occurs more than once in the report, which would otherwise silently
#   duplicate reads in the merged table.
merged_df = results_df[['seq_id', 'status', 'taxonomy_id']].merge(
    df[['taxonomy_id', 'taxon', 'name']],
    on='taxonomy_id',
    how='left',
    validate='many_to_one',
)

In [14]:
# Preview the first 20 annotated reads
merged_df.head(20)

Unnamed: 0,seq_id,status,taxonomy_id,taxon,name
0,08628297-d792-4b1b-8d58-40e7232f28d0,C,1637,G,Listeria
1,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,C,96241,S,Bacillus spizizenii
2,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,C,28901,S,Salmonella enterica
3,108c2d07-003c-468a-a896-20ed550cabe3,C,287,S,Pseudomonas aeruginosa
4,580a4fd7-b45b-4397-b86f-44edd7302ebc,C,1280,S,Staphylococcus aureus
5,ec8812f3-39cc-430f-952a-0d458027d892,C,1280,S,Staphylococcus aureus
6,f74001aa-1fa4-4609-a71e-32c3a56036b9,C,1280,S,Staphylococcus aureus
7,15f27e72-3921-403e-beb5-57b94870a405,C,559292,S1,Saccharomyces cere...
8,5e7c6d52-427b-4a79-b3ab-42d129fabce5,C,1280,S,Staphylococcus aureus
9,2a76382b-53f7-4a30-b3c9-aa03bb8466f0,C,96241,S,Bacillus spizizenii


In [15]:
# Write the annotated per-read table next to the Kraken2 results,
# leaving out the DataFrame index column.
merged_df.to_csv(
    path_or_buf=f'{result_path}/kraken2.csv',
    index=False,
)