In [2]:


import pandas as pd


decoded_consensus_df = pd.read_csv("decoded_consensus.tsv", sep='\t') # This is the decoded consensus 
squiggle_database_df = pd.read_pickle("squiggle_database.pkl") # All the reads with the squiggles
read_id_barcoded_df = pd.read_pickle("read_id_barcoded.pkl") # Read ids and their barcodes

print(decoded_consensus_df.head())
print(squiggle_database_df.head())
print(read_id_barcoded_df.head())

""" For Each Unique ONT Barcode and HW_Address pair - how many read ids do we have?
ont_unique = read_id_barcoded_df['ONT_Barcode'].unique()
hw_address_unique = read_id_barcoded_df['HW_Address'].unique()

for i in ont_unique:
    for j in hw_address_unique:
        df = read_id_barcoded_df.loc[(read_id_barcoded_df['ONT_Barcode'] == i) & (read_id_barcoded_df['HW_Address'] == j)]
        print(df.shape[0])

About 8000 reads per unique pair of ONT and HW, can select like 100 to begin with to see how our model does 
"""


    

   ONT_Barcode                     HW_address   Payload1   Payload2  \
0            1  barcode_external01_internal01  [1,3,4,5]  [1,2,7,8]   
1            1  barcode_external02_internal01  [3,4,7,8]  [2,3,4,8]   
2            1  barcode_external03_internal01  [1,3,5,8]  [2,4,5,6]   
3            1  barcode_external04_internal01  [1,4,5,8]  [3,4,7,8]   
4            1  barcode_external05_internal01  [2,4,5,6]  [3,4,6,7]   

    Payload3   Payload4   Payload5   Payload6   Payload7   Payload8  
0  [1,4,5,6]  [1,4,7,8]  [1,5,7,8]  [1,3,5,6]  [1,3,5,7]  [1,5,6,7]  
1  [2,4,6,7]  [1,6,7,8]  [1,2,5,8]  [1,2,7,8]  [1,5,6,8]  [1,5,6,7]  
2  [1,5,6,8]  [1,2,4,6]  [2,5,6,8]  [2,4,7,8]  [1,4,5,7]  [2,5,7,8]  
3  [1,4,7,8]  [1,4,7,8]  [1,5,6,7]  [1,2,4,7]  [1,2,3,4]  [2,4,6,7]  
4  [4,5,6,8]  [1,4,5,8]  [4,5,6,8]  [1,2,4,6]  [2,4,7,8]  [4,5,7,8]  
                                read_id  \
0  56523433-648a-4f12-9690-63485bfcfe7a   
1  c30c3c1c-c233-47c0-b271-bfa557b5cf52   
2  c44ca947-aa22-4e4a-9c

" For Each Unique ONT Barcode and HW_Address pair - how many read ids do we have?\nont_unique = read_id_barcoded_df['ONT_Barcode'].unique()\nhw_address_unique = read_id_barcoded_df['HW_Address'].unique()\n\nfor i in ont_unique:\n    for j in hw_address_unique:\n        df = read_id_barcoded_df.loc[(read_id_barcoded_df['ONT_Barcode'] == i) & (read_id_barcoded_df['HW_Address'] == j)]\n        print(df.shape[0])\n\nAbout 8000 reads per unique pair of ONT and HW, can select like 100 to begin with to see how our model does \n"

Let us see whether we can continue to use this for at least this session and do some analysis

Firstly, let's select two squiggles of the same set of ONT and HW and see how much they differ

In [90]:

import numpy as np
import matplotlib.pyplot as plt

def get_label(ONT_Barcode, HW_Address):
    label_row = decoded_consensus_df.loc[(decoded_consensus_df['ONT_Barcode'] == ONT_Barcode) & (decoded_consensus_df['HW_address'] == HW_Address)]
    payload_columns = label_row.columns[2:]
    label = [label_row[i].to_numpy()[0] for i in payload_columns]
    label_str = ""
    for i in label:
        label_str += " " +i
    
    return label_str.replace('[', '').replace(']', '').replace(',', '')

def get_read_id(ONT_Barcode, HW_Address, sample_length=10):
    read_id_rows = read_id_barcoded_df.loc[(read_id_barcoded_df['ONT_Barcode'] == ONT_Barcode) & (read_id_barcoded_df['HW_Address'] == HW_Address)]
    read_ids = read_id_rows['read_id'].to_numpy()
    samples = [np.random.choice(len(read_ids)) for i in range(sample_length)]
    return read_ids[samples]

def get_squiggle(read_id):
    return squiggle_database_df.loc[squiggle_database_df['read_id'] == read_id]['squiggle'].to_numpy()
    
print(get_label(ONT_Barcode, HW_Address))

labels = []
ont_unique = read_id_barcoded_df['ONT_Barcode'].unique()
hw_address_unique = read_id_barcoded_df['HW_Address'].unique()

onts = []
hws = []
squiggles = []

for i in ont_unique:
    for j in hw_address_unique:
        label = get_label(i,j)
        
        read_id = get_read_id(i,j, sample_length=1)
        squiggle = get_squiggle(read_id[0])
        squiggles.append(squiggle)
        onts.append(i)
        hws.append(j)
    break

dataset_df = pd.DataFrame()
dataset_df['ONT_Barcode'] = onts
dataset_df['HW_Address'] = hws
dataset_df['squiggle'] = squiggles
dataset_df['label'] = label

print(dataset_df.head())

 3478 2348 2467 1678 1258 1278 1568 1567
   ONT_Barcode                     HW_Address  \
0            1  barcode_external01_internal01   
1            1  barcode_external01_internal02   
2            1  barcode_external01_internal03   
3            1  barcode_external01_internal04   
4            1  barcode_external01_internal05   

                                            squiggle  \
0  [[539, 409, 426, 422, 444, 430, 430, 451, 446,...   
1  [[529, 528, 521, 556, 521, 527, 504, 519, 545,...   
2  [[526, 529, 528, 556, 538, 540, 547, 550, 526,...   
3  [[429, 448, 459, 466, 488, 465, 505, 516, 506,...   
4  [[550, 559, 563, 572, 552, 544, 569, 570, 578,...   

                                      label  
0   2357 1278 2367 1378 2678 3578 1578 1238  
1   2357 1278 2367 1378 2678 3578 1578 1238  
2   2357 1278 2367 1378 2678 3578 1578 1238  
3   2357 1278 2367 1378 2678 3578 1578 1238  
4   2357 1278 2367 1378 2678 3578 1578 1238  


Looks like it can be quite noisy but it represents the thing that we want. Let's just arbitary select one, make a dataset - do some cleaning and train something

Port this code to vs and get my first dataset hell yeah