⚠️ Run the following cell only if you are working in Google Colab and need to mount your Google Drive.

In [1]:
# Mount Google Drive when the notebook runs inside Google Colab.
# The import is guarded so that "Restart & Run All" also works on a kernel
# where the google.colab package is not installed; in that case `drive`
# is set to None and nothing is mounted.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Directory (relative to the working directory) that holds the input and output files.
path = 'output'

In [5]:
# Column names assigned to the header-less Kraken output report, in file order.
columns_ = [
    'status',
    'seq_id',
    'taxonomy_id',
    'length',
    'mapping',
]

In [6]:
# Read the tab-separated report; the file has no header row, so the
# column names come from `columns_`.
results_df = pd.read_csv(
    f'{path}/output.txt',
    sep='\t',
    header=None,
    names=columns_,
)

In [7]:
# Display the table that was just loaded from output.txt.
results_df

Unnamed: 0,status,seq_id,taxonomy_id,length,mapping
0,C,08628297-d792-4b1b-8d58-40e7232f28d0,1637,5435,0:138 1637:5 0:30 1637:1 0:34 1637:1 0:8 1637:...
1,C,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,96241,5331,0:120 1386:5 96241:3 0:61 1386:5 0:5 1386:5 0:...
2,C,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,28901,5034,0:102 1:1 0:16 590:5 0:30 1:5 0:3 1:2 0:81 289...
3,C,108c2d07-003c-468a-a896-20ed550cabe3,287,5091,0:1169 287:1 0:695 286:4 0:890 135621:4 0:93 1...
4,C,580a4fd7-b45b-4397-b86f-44edd7302ebc,1280,5061,0:49 1279:5 0:181 1279:1 0:35 1279:3 0:5 1279:...
...,...,...,...,...,...
179239,C,283ad856-e124-4bae-bdaa-1eb62d3d6486,1639,5787,0:67 1637:3 0:105 1637:2 0:13 1637:2 0:5 1637:...
179240,C,a6d5187d-cc1b-4fbb-9387-87aad260d47c,562,30950,0:109 543:1 0:76 543:3 0:42 543:1 0:101 543:5 ...
179241,C,a6741036-5494-4df6-a42c-7174d639d50c,287,18026,0:320 286:3 0:8 286:5 0:3 286:2 0:112 286:8 0:...
179242,C,9a8fa5ea-bb75-44de-9789-4289e804c35f,287,29085,0:47 287:4 0:5 287:3 0:1 286:1 0:10 286:7 0:5 ...


In [9]:
# Load the row positions stored in train_idx.npy; the next cell passes them
# to `.iloc` to subset results_df.
train_idx = np.load(f'{path}/train_idx.npy')
# Show how many positions were loaded.
train_idx.shape

(173603,)

In [10]:
# Keep only the rows at the positions listed in train_idx.
# NOTE: this rebinds results_df, so re-running the cell without reloading
# the data would index into the already-subset frame.
results_df = results_df.iloc[train_idx]

In [11]:
# Display the frame after selecting the train_idx rows.
results_df

Unnamed: 0,status,seq_id,taxonomy_id,length,mapping
1,C,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,96241,5331,0:120 1386:5 96241:3 0:61 1386:5 0:5 1386:5 0:...
2,C,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,28901,5034,0:102 1:1 0:16 590:5 0:30 1:5 0:3 1:2 0:81 289...
3,C,108c2d07-003c-468a-a896-20ed550cabe3,287,5091,0:1169 287:1 0:695 286:4 0:890 135621:4 0:93 1...
4,C,580a4fd7-b45b-4397-b86f-44edd7302ebc,1280,5061,0:49 1279:5 0:181 1279:1 0:35 1279:3 0:5 1279:...
5,C,ec8812f3-39cc-430f-952a-0d458027d892,1280,5222,0:75 1279:7 0:32 1279:5 0:9 1279:2 0:5 1279:1 ...
...,...,...,...,...,...
179239,C,283ad856-e124-4bae-bdaa-1eb62d3d6486,1639,5787,0:67 1637:3 0:105 1637:2 0:13 1637:2 0:5 1637:...
179240,C,a6d5187d-cc1b-4fbb-9387-87aad260d47c,562,30950,0:109 543:1 0:76 543:3 0:42 543:1 0:101 543:5 ...
179241,C,a6741036-5494-4df6-a42c-7174d639d50c,287,18026,0:320 286:3 0:8 286:5 0:3 286:2 0:112 286:8 0:...
179242,C,9a8fa5ea-bb75-44de-9789-4289e804c35f,287,29085,0:47 287:4 0:5 287:3 0:1 286:1 0:10 286:7 0:5 ...


In [12]:
def _parse_kmer_mapping(mapping):
    """Sum the counts per key in a space-separated ``key:count`` mapping string.

    Parameters
    ----------
    mapping : str
        Space-separated tokens of the form ``key:count``,
        e.g. ``"0:138 1637:5 0:30"``.

    Returns
    -------
    dict
        Maps each key (kept as a string) to the sum of its counts.

    Raises
    ------
    ValueError
        If a token is not of the form ``key:count`` with an integer count.
    """
    counts = {}
    # str.split() with no argument splits on any whitespace run and never
    # yields empty tokens, so leading/trailing/double spaces are harmless.
    # (The previous `mappings = mappings.pop()` rebound the list to the popped
    # empty string, so rows with a trailing space produced an empty dict.)
    for token in mapping.split():
        # NOTE(review): the Kraken 2 manual describes a '|:|' token that
        # separates the two mates of a paired read; it carries no count.
        if token == '|:|':
            continue
        left_val, right_val = token.split(':')
        # Adding the count to the corresponding key in the dictionary
        counts[left_val] = counts.get(left_val, 0) + int(right_val)
    return counts


mapping_list = []

# Iterate over the three needed columns directly instead of iterrows(),
# which builds a Series object for every row.
for seq_id, taxonomic_id, mapping in zip(
    results_df['seq_id'], results_df['taxonomy_id'], results_df['mapping']
):
    mapping_dict = _parse_kmer_mapping(mapping)

    # Calculate the weight: the share of the summed counts that belongs to the
    # row's own taxonomy id (dict keys are strings, hence the str() lookup).
    total_mapping_sum = sum(mapping_dict.values())
    weight = mapping_dict.get(str(taxonomic_id), 0) / total_mapping_sum if total_mapping_sum > 0 else 0

    mapping_list.append([seq_id, taxonomic_id, mapping_dict, weight])

In [13]:
# Turn the list of [seq_id, taxonomic_id, mapping, weight] records into a
# DataFrame, one row per record.
weights_df = pd.DataFrame(
    mapping_list,
    columns=['seq_id', 'taxonomic_id', 'mapping', 'weight'],
)

In [14]:
# Sanity check: number of records built by the loop.
len(mapping_list)

173603

In [15]:
# Inspect the first record: [seq_id, taxonomic_id, mapping dict, weight].
mapping_list[0]

['86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b',
 96241,
 {'0': 5041,
  '1386': 161,
  '96241': 72,
  '1385': 2,
  '653685': 16,
  '2685834': 3,
  '186817': 2},
 0.013592599584670568]

In [16]:
# Display the per-sequence weights table.
weights_df

Unnamed: 0,seq_id,taxonomic_id,mapping,weight
0,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,96241,"{'0': 5041, '1386': 161, '96241': 72, '1385': ...",0.013593
1,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,28901,"{'0': 4835, '1': 101, '590': 5, '28901': 23, '...",0.004600
2,108c2d07-003c-468a-a896-20ed550cabe3,287,"{'0': 5032, '287': 6, '286': 10, '135621': 9}",0.001186
3,580a4fd7-b45b-4397-b86f-44edd7302ebc,1280,"{'0': 4802, '1279': 212, '1280': 13}",0.002586
4,ec8812f3-39cc-430f-952a-0d458027d892,1280,"{'0': 4931, '1279': 235, '1280': 22}",0.004241
...,...,...,...,...
173598,283ad856-e124-4bae-bdaa-1eb62d3d6486,1639,"{'0': 5517, '1637': 221, '1639': 4, '1239': 5,...",0.000695
173599,a6d5187d-cc1b-4fbb-9387-87aad260d47c,562,"{'0': 29441, '543': 1295, '1202450': 2, '2': 1...",0.001714
173600,a6741036-5494-4df6-a42c-7174d639d50c,287,"{'0': 16958, '286': 877, '57480': 1, '287': 70...",0.003891
173601,9a8fa5ea-bb75-44de-9789-4289e804c35f,287,"{'0': 27464, '287': 181, '286': 1263, '312306'...",0.006230


In [17]:
# Rescale the raw weights with min-max scaling (MinMaxScaler, not a
# z-score standardization). The output column keeps the name
# 'weight_standardized' because later cells read it under that name.
scaler = MinMaxScaler()

# The scaler is given a 2-D array, so reshape the weights into a single column.
weights = weights_df['weight'].to_numpy().reshape(-1, 1)

# Fit on the weights and store the rescaled values next to the originals.
weights_df['weight_standardized'] = scaler.fit_transform(weights)

In [18]:
# Display the table again, now including the rescaled weight column.
weights_df

Unnamed: 0,seq_id,taxonomic_id,mapping,weight,weight_standardized
0,86fd9e27-f495-4b6e-8124-0c3dbdcd2c9b,96241,"{'0': 5041, '1386': 161, '96241': 72, '1385': ...",0.013593,0.176872
1,62cf08b0-4463-479f-b041-f4cdbaa1c3ed,28901,"{'0': 4835, '1': 101, '590': 5, '28901': 23, '...",0.004600,0.059857
2,108c2d07-003c-468a-a896-20ed550cabe3,287,"{'0': 5032, '287': 6, '286': 10, '135621': 9}",0.001186,0.015439
3,580a4fd7-b45b-4397-b86f-44edd7302ebc,1280,"{'0': 4802, '1279': 212, '1280': 13}",0.002586,0.033650
4,ec8812f3-39cc-430f-952a-0d458027d892,1280,"{'0': 4931, '1279': 235, '1280': 22}",0.004241,0.055180
...,...,...,...,...,...
173598,283ad856-e124-4bae-bdaa-1eb62d3d6486,1639,"{'0': 5517, '1637': 221, '1639': 4, '1239': 5,...",0.000695,0.009047
173599,a6d5187d-cc1b-4fbb-9387-87aad260d47c,562,"{'0': 29441, '543': 1295, '1202450': 2, '2': 1...",0.001714,0.022307
173600,a6741036-5494-4df6-a42c-7174d639d50c,287,"{'0': 16958, '286': 877, '57480': 1, '287': 70...",0.003891,0.050626
173601,9a8fa5ea-bb75-44de-9789-4289e804c35f,287,"{'0': 27464, '287': 181, '286': 1263, '312306'...",0.006230,0.081072


In [19]:
# Extract the rescaled weights as a NumPy array.
sample_weights = weights_df['weight_standardized'].to_numpy()
# Show the array's shape.
sample_weights.shape

(173603,)

In [20]:
# Write the weights array to sample_weights.npy inside the `path` directory.
np.save(f'{path}/sample_weights.npy',sample_weights)