In [1]:

# First we need to examine the csv file 


## Creating the empirical dataset from the Helixworks data

Seems like the motif caller identified data is in that excel file, from which we can extract the read ids and use that to get the fast5 data. 
We seem to be limited by the performance of the motif search algorithm, so it would be interesting to compare the performance on unseen data, which is hard to create, but maybe by rerunning the motif search and seeing what we get, or evaluating the motif caller on stuff its not trained on to see if it identifies anything new. 
Given the base error problem with motif calling, this should be the case, let us see if we can prove it though.


### Examining the motif sequences from the result of motif calling

In [57]:

import pandas as pd

df = pd.read_csv(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\data\helixworks_1280_fast5_2024-02-09_1638\motif_search_barcoded.csv")

In [58]:

def library_motif_segregation():
    library_motifs = df['library_motif'].to_numpy()
    library_motifs_arr = []
    no_motifs = []
    for i in library_motifs:
        motifs_arr = i.split('|')
        library_motifs_arr.append(motifs_arr)
        no_motifs.append(len(motifs_arr))
    return library_motifs_arr, no_motifs


library_motifs_arr, motifs_discovered = library_motif_segregation()
df['library_motifs_arr'] = library_motifs_arr
df['motifs_discovered'] = motifs_discovered


df['ONT_Barcode'] = df['ONT_Barcode'].apply(lambda x: int(x[-2:]))
df.head()

Unnamed: 0,read_id,ONT_Barcode,HW_Address,orientation,start_end,library_motif,library_motifs_arr,motifs_discovered
0,0117ec74-7ef1-4e8c-b169-c8ca9a576de4,1,barcode_external01_internal01,+|+|+,75-124|370-419|566-615,ltm8_2x1|ltm8_5x4|ltm8_9x1,"[ltm8_2x1, ltm8_5x4, ltm8_9x1]",3
1,01575c8a-4a38-4554-9d88-5acbc61059a4,1,barcode_external01_internal01,+|+,176-225|275-324,ltm8_6x1|ltm8_8x7,"[ltm8_6x1, ltm8_8x7]",2
2,01ecbbe8-550d-433b-a2af-354c33c91dde,1,barcode_external01_internal01,+,108-157,ltm8_1x1,[ltm8_1x1],1
3,062cd1bc-9132-4144-bd56-5aea4e9a5529,1,barcode_external01_internal01,+|+|+|+,154-203|251-300|301-350|351-400,ltm8_2x1|ltm8_4x2|ltm8_5x5|ltm8_6x4,"[ltm8_2x1, ltm8_4x2, ltm8_5x5, ltm8_6x4]",4
4,06382a54-201e-46cf-beeb-c8d574b30e85,1,barcode_external01_internal01,+|+|+|+|+,149-198|199-248|299-348|349-398|497-546,ltm8_2x1|ltm8_3x4|ltm8_5x5|ltm8_6x8|ltm8_9x3,"[ltm8_2x1, ltm8_3x4, ltm8_5x5, ltm8_6x8, ltm8_...",5


## Encoded csv

In [59]:

import re

encoded_file_path = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\data\helixworks_1280_fast5_2024-02-09_1638\EIC01-01-1280-T1_encoded.tsv"


with open(encoded_file_path, 'r') as f:
    lines = f.readlines()

lines = [line.strip() for line in lines]

ONT_barcodes = []
HW_addresses = []
cycles = []
payloads = []

for i, line in enumerate(lines):
    if i==0:
        continue

    ONT_barcodes.append(int(line[:2]))

    matches = re.findall(r'"(.*?)"', line)
    
    local_payload = []
    for j, match in enumerate(matches):
        if j == 0:
            HW_addresses.append(match)
        else:
            local_payload.append(eval(match))

    payloads.append(local_payload)
    

encoded_df = pd.DataFrame({'ONT_Barcode': ONT_barcodes, 'HW_Address': HW_addresses, 'Payload': payloads})

## Creating dataset

In [60]:

import numpy as np
ont_barcodes = np.arange(1,77, 4)
test_barcodes = [5, 29, 53]
train_barcodes = [barcode for barcode in ont_barcodes if barcode not in test_barcodes]

In [61]:
encoded_train_df = encoded_df[encoded_df['ONT_Barcode'].isin(train_barcodes)]
encoded_test_df = encoded_df[encoded_df['ONT_Barcode'].isin(test_barcodes)]

In [7]:
train_barcodes

[np.int64(1),
 np.int64(9),
 np.int64(13),
 np.int64(17),
 np.int64(21),
 np.int64(25),
 np.int64(33),
 np.int64(37),
 np.int64(41),
 np.int64(45),
 np.int64(49),
 np.int64(57),
 np.int64(61),
 np.int64(65),
 np.int64(69),
 np.int64(73)]

In [62]:

import matplotlib.pyplot as plt
from tqdm import tqdm
# for each barcode and hw address that uniquely define a cycle - select best quality of reads 

def get_best_quality_reads(ont_barcode, hw_address, n_motifs=6):

    df_subset = df[(df['ONT_Barcode'] == ont_barcode) & (df['HW_Address'] == hw_address) & (df['motifs_discovered'] > n_motifs)]

    read_ids = df_subset['read_id'].to_numpy()
    library_motifs_arr = df_subset['library_motifs_arr'].to_numpy()
    return read_ids, library_motifs_arr


In [None]:

# Iterating over all the barcodes and hw addresses to get the best quality reads and their corresponding read ids
training_arr = []
for barcode in tqdm(train_barcodes):
    for hw_address in tqdm(encoded_df['HW_Address'].unique()):
        read_ids, library_motifs_arr = get_best_quality_reads(barcode, hw_address)
        payload = encoded_train_df[(encoded_train_df['ONT_Barcode'] == barcode) & (
            encoded_train_df['HW_Address'] == hw_address)]['Payload'].to_numpy()[0]
        for i, read_id in enumerate(read_ids):
            training_arr.append([
                barcode, hw_address, payload, library_motifs_arr[i], read_id])

In [9]:

empirical_train_df = pd.DataFrame(training_arr, columns=['ONT_Barcode', 'HW_Address', 'Payload', 'Library_Motifs', 'read_id'])

In [10]:
empirical_train_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[ltm8_2x1, ltm8_3x3, ltm8_3x5, ltm8_4x1, ltm8_...",0804c886-cd0a-4ece-87ee-adb529974699
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[ltm8_3x5, ltm8_4x1, ltm8_4x1, ltm8_6x7, ltm8_...",0f041c54-7071-49d3-8ae2-7a1bf25525ab
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[ltm8_1x1, ltm8_2x1, ltm8_4x2, ltm8_5x7, ltm8_...",1361a5db-d135-4e98-bb49-7a53c8d72991
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[ltm8_1x1, ltm8_2x1, ltm8_2x1, ltm8_3x4, ltm8_...",1b4284d2-bee2-4a15-abab-aad861447308
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[ltm8_1x1, ltm8_3x2, ltm8_4x2, ltm8_5x1, ltm8_...",1cc7b7bb-cd90-485f-b744-cf846d566675


In [63]:
test_arr = []
for barcode in test_barcodes:
    for hw_address in encoded_df['HW_Address'].unique():
        read_ids, library_motifs_arr = get_best_quality_reads(barcode, hw_address, n_motifs=0)
        payload = encoded_test_df[(encoded_test_df['ONT_Barcode'] == barcode) & (
            encoded_test_df['HW_Address'] == hw_address)]['Payload'].to_numpy()[0]
        for i, read_id in enumerate(read_ids):
            test_arr.append([
                barcode, hw_address, payload, library_motifs_arr[i], read_id])

In [64]:
empirical_test_df = pd.DataFrame(test_arr, columns=['ONT_Barcode', 'HW_Address', 'Payload', 'Library_Motifs', 'read_id'])

In [65]:
empirical_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x6, ltm8_5x4, ltm8_6x1]",0073a9a2-8ee0-4332-9722-72837b77b29c
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_4x8, ltm8_5x4]",00e4308c-baf1-49b1-848a-9c7ff35971e5
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_6x8, ltm8_7x1]",0182d415-b83b-4eeb-9bcd-4f217aba5e8c


In [28]:
empirical_test_df['ONT_Barcode'].unique()

array([ 5, 29, 53])

In [30]:
sampled_test_df = empirical_test_df.sample(frac=0.1, random_state=42)

In [32]:
len(sampled_test_df)

7090

In [24]:
len(empirical_test_df)

70902

In [33]:
empirical_train_df.to_csv("datasets\empirical\paper\empirical_train_read_ids.csv", index=False)
encoded_test_df.to_csv("datasets\empirical\paper\empirical_test_read_ids.csv", index=False)

In [34]:
target_read_ids_train = empirical_train_df['read_id'].to_numpy().tolist()
target_read_ids_test = empirical_test_df['read_id'].to_numpy().tolist()

In [35]:
total_read_ids = target_read_ids_train + target_read_ids_test
len(total_read_ids)

28046

## Extracting the fast5 from the files

In [13]:
# Loading the read id dataframes
import pandas as pd

empirical_train_df_loaded = pd.read_csv("datasets\empirical\paper\empirical_train_read_ids.csv")
empirical_test_df_loaded = pd.read_csv("datasets\empirical\paper\empirical_test_read_ids.csv")
empirical_train_df_loaded.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_2x1', 'ltm8_3x3', 'ltm8_3x5', 'ltm8_4x1...",0804c886-cd0a-4ece-87ee-adb529974699
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_3x5', 'ltm8_4x1', 'ltm8_4x1', 'ltm8_6x7...",0f041c54-7071-49d3-8ae2-7a1bf25525ab
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_4x2', 'ltm8_5x7...",1361a5db-d135-4e98-bb49-7a53c8d72991
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_2x1', 'ltm8_3x4...",1b4284d2-bee2-4a15-abab-aad861447308
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_3x2', 'ltm8_4x2', 'ltm8_5x1...",1cc7b7bb-cd90-485f-b744-cf846d566675


In [14]:
target_read_ids_train = empirical_train_df_loaded['read_id'].to_numpy().tolist()
target_read_ids_test = empirical_test_df['read_id'].to_numpy().tolist()
total_read_ids = target_read_ids_train
len(total_read_ids)

23939

In [69]:
target_read_ids = empirical_test_df['read_id'].to_numpy().tolist()

In [67]:

import numpy as np

# Adding placeholder for squiggle
#empirical_train_df_loaded['squiggle'] = [np.zeros(3) for _ in range(empirical_train_df_loaded.shape[0])]
#empirical_test_df['squiggle'] = [np.zeros(3) for _ in range(empirical_test_df.shape[0])]
empirical_test_df['squiggle'] = [np.zeros(3) for _ in range(empirical_test_df.shape[0])]

In [68]:
empirical_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.0, 0.0, 0.0]"
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x6, ltm8_5x4, ltm8_6x1]",0073a9a2-8ee0-4332-9722-72837b77b29c,"[0.0, 0.0, 0.0]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_4x8, ltm8_5x4]",00e4308c-baf1-49b1-848a-9c7ff35971e5,"[0.0, 0.0, 0.0]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.0, 0.0, 0.0]"
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_6x8, ltm8_7x1]",0182d415-b83b-4eeb-9bcd-4f217aba5e8c,"[0.0, 0.0, 0.0]"


In [70]:

import os
from ont_fast5_api.fast5_interface import get_fast5_file

fast5_path = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\data\helixworks_1280_fast5_2024-02-09_1638\1280_FAST5"

total_files = len(os.listdir(fast5_path))
file_number = 0
read_ids_found = []

squiggles = {}

# Trying to see if I can get 10 percent of the reads and have some form of a database
for file in os.listdir(fast5_path):
    file_number+=1

    # Printing the progress
    if file_number % 10 == 0:
        print(file_number/total_files)

    filepath = os.path.join(fast5_path, file)
    with get_fast5_file(filepath, mode="r") as f5:
        read_ids = f5.get_read_ids()
        read_ids = list(set(read_ids).intersection(target_read_ids))

        for read_id in read_ids:
            read = f5.get_read(read_id)
            squiggle = read.get_raw_data()
            squiggles[read_id] = squiggle

0.013966480446927373
0.027932960893854747
0.04189944134078212
0.055865921787709494
0.06983240223463687
0.08379888268156424
0.09776536312849161
0.11173184357541899
0.12569832402234637
0.13966480446927373
0.15363128491620112
0.16759776536312848
0.18156424581005587
0.19553072625698323
0.20949720670391062
0.22346368715083798
0.23743016759776536
0.25139664804469275
0.26536312849162014
0.27932960893854747
0.29329608938547486
0.30726256983240224
0.32122905027932963
0.33519553072625696
0.34916201117318435
0.36312849162011174
0.3770949720670391
0.39106145251396646
0.40502793296089384
0.41899441340782123
0.4329608938547486
0.44692737430167595
0.46089385474860334
0.4748603351955307
0.4888268156424581
0.5027932960893855
0.5167597765363129
0.5307262569832403
0.5446927374301676
0.5586592178770949
0.5726256983240223
0.5865921787709497
0.6005586592178771
0.6145251396648045
0.6284916201117319
0.6424581005586593
0.6564245810055865
0.6703910614525139
0.6843575418994413
0.6983240223463687
0.71229050279329

In [71]:


# seperate the squiggles into train and test squiggles
#train_squiggles = {k:squiggles[k] for k in squiggles.keys() if k in target_read_ids_train}
test_squiggles = {k:squiggles[k] for k in squiggles.keys() if k in target_read_ids}

In [72]:
len(test_squiggles)#, len(test_squiggles)

69048

In [73]:
sampled_test_df = empirical_test_df

In [74]:
# get subset of dataframe with only those read ids
#empirical_train_df_subset = empirical_train_df_loaded[empirical_train_df_loaded['read_id'].isin(train_squiggles.keys())]
sampled_test_df = sampled_test_df[sampled_test_df['read_id'].isin(test_squiggles.keys())]

In [75]:
# Add the squiggles to the dataframe - matching by the read id
#empirical_train_df_subset['squiggle'] = empirical_train_df_subset['read_id'].apply(lambda x: train_squiggles[x])
sampled_test_df['squiggle'] = sampled_test_df['read_id'].apply(lambda x: test_squiggles[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_test_df['squiggle'] = sampled_test_df['read_id'].apply(lambda x: test_squiggles[x])


In [76]:
sampled_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[648, 482, 468, 487, 489, 488, 481, 576, 653, ..."
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x6, ltm8_5x4, ltm8_6x1]",0073a9a2-8ee0-4332-9722-72837b77b29c,"[613, 512, 537, 530, 506, 509, 499, 493, 494, ..."
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_4x8, ltm8_5x4]",00e4308c-baf1-49b1-848a-9c7ff35971e5,"[611, 452, 447, 451, 438, 459, 450, 445, 446, ..."
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[511, 555, 550, 521, 679, 684, 682, 659, 669, ..."
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_6x8, ltm8_7x1]",0182d415-b83b-4eeb-9bcd-4f217aba5e8c,"[480, 536, 547, 484, 519, 506, 521, 513, 509, ..."


In [77]:
len(sampled_test_df)

69048

Cool - that is guessing cycle position - let us just part into motifs

In [78]:
Motifs = sampled_test_df['Library_Motifs'].to_numpy()

motifs_updated = []
for motif in Motifs:
    local_motifs = [int(i[-1]) for i in motif]
    motifs_updated.append(local_motifs)

In [79]:
sampled_test_df['Motifs'] = motifs_updated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_test_df['Motifs'] = motifs_updated


In [39]:
Motifs = empirical_test_df_subset['Library_Motifs'].to_numpy()

motifs_updated = []
for motif in Motifs:
    local_motifs = [int(i[-1]) for i in motif]
    motifs_updated.append(local_motifs)

empirical_test_df_subset['Motifs'] = motifs_updated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empirical_test_df_subset['Motifs'] = motifs_updated


In [80]:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize the squiggle data
sampled_test_df['squiggle'] = sampled_test_df['squiggle'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).reshape(-1))

sampled_test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_test_df['squiggle'] = sampled_test_df['squiggle'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).reshape(-1))


Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.5791722622390428, 0.5589061164692956, 0.557...","[1, 8, 4, 1, 8, 8, 4]"
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x6, ltm8_5x4, ltm8_6x1]",0073a9a2-8ee0-4332-9722-72837b77b29c,"[0.5748992796972286, 0.5625686729337077, 0.565...","[1, 6, 4, 1]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_4x8, ltm8_5x4]",00e4308c-baf1-49b1-848a-9c7ff35971e5,"[0.9999999999999999, 0.6249999999999999, 0.613...","[8, 4]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.694006309148265, 0.7634069400630915, 0.7555...","[1, 1, 7, 8, 4, 4, 7]"
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_6x8, ltm8_7x1]",0182d415-b83b-4eeb-9bcd-4f217aba5e8c,"[0.5516483516483517, 0.6747252747252748, 0.698...","[8, 1]"


In [40]:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize the squiggle data
empirical_test_df_subset['squiggle'] = empirical_test_df_subset['squiggle'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).reshape(-1))

empirical_test_df_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empirical_test_df_subset['squiggle'] = empirical_test_df_subset['squiggle'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).reshape(-1))


Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.5791722622390428, 0.5589061164692956, 0.557...","[1, 8, 4, 1, 8, 8, 4]"
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.694006309148265, 0.7634069400630915, 0.7555...","[1, 1, 7, 8, 4, 4, 7]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x6, ltm8_5x6, ltm8_6x5, ltm8_...",03022dc4-6086-481a-bb42-c935846ab94c,"[0.7538994800693242, 0.48526863084922006, 0.45...","[1, 6, 6, 5, 8, 4, 5]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x3, ltm8_4x5, ltm8_5x1, ltm8_...",1a7bac06-9b7b-4cb5-88f2-bdfc19772c4e,"[0.5326492537313433, 0.396455223880597, 0.3973...","[1, 3, 5, 1, 6, 7, 1]"
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x3, ltm8_4x4, ltm8_5x4, ltm8_...",2c9c7e7b-51b6-468b-a3cf-75e48085c59f,"[0.5729459162495422, 0.5515810035404712, 0.553...","[1, 3, 4, 4, 2, 4, 3]"


In [41]:
squiggles = empirical_test_df_subset['squiggle'].to_numpy()
print(len(squiggles[3]))

8400


In [None]:
empirical_test

In [42]:
empirical_test_df_subset.to_pickle("empirical_test_dataset_v3.pkl")

### Loading and checking integrity of the saved data

In [46]:
empirical_test = pd.read_pickle("empirical_test_dataset_v3.pkl")

#pd.read_csv("empirical_train_dataset_v3.csv")


In [47]:
empirical_test.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.5791722622390428, 0.5589061164692956, 0.557...","[1, 8, 4, 1, 8, 8, 4]"
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.694006309148265, 0.7634069400630915, 0.7555...","[1, 1, 7, 8, 4, 4, 7]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x6, ltm8_5x6, ltm8_6x5, ltm8_...",03022dc4-6086-481a-bb42-c935846ab94c,"[0.7538994800693242, 0.48526863084922006, 0.45...","[1, 6, 6, 5, 8, 4, 5]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x3, ltm8_4x5, ltm8_5x1, ltm8_...",1a7bac06-9b7b-4cb5-88f2-bdfc19772c4e,"[0.5326492537313433, 0.396455223880597, 0.3973...","[1, 3, 5, 1, 6, 7, 1]"
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x3, ltm8_4x4, ltm8_5x4, ltm8_...",2c9c7e7b-51b6-468b-a3cf-75e48085c59f,"[0.5729459162495422, 0.5515810035404712, 0.553...","[1, 3, 4, 4, 2, 4, 3]"


In [45]:
squiggles = empirical_train_df_subset_['squiggle'].to_numpy()
print(len(squiggles[3]))

8400


In [30]:
# load and check the data
empirical_train_df_subset_loaded = pd.read_csv("empirical_train_dataset_v3.csv")


empirical_train_df_subset_loaded.head()

Unnamed: 0.1,Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_2x1', 'ltm8_3x3', 'ltm8_3x5', 'ltm8_4x1...",0804c886-cd0a-4ece-87ee-adb529974699,[0.725 0.735 0.7525 ... 0.2275 0.2225 0.3225],"[1, 3, 5, 1, 1, 4, 5]"
1,1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_3x5', 'ltm8_4x1', 'ltm8_4x1', 'ltm8_6x7...",0f041c54-7071-49d3-8ae2-7a1bf25525ab,[0.525 0.48452381 0.50357143 ... 0.286904...,"[5, 1, 1, 7, 7, 7, 8]"
2,2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_4x2', 'ltm8_5x7...",1361a5db-d135-4e98-bb49-7a53c8d72991,[0.66757991 0.50958904 0.51506849 ... 0.558904...,"[1, 1, 2, 7, 4, 5, 6, 5]"
3,3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_2x1', 'ltm8_3x4...",1b4284d2-bee2-4a15-abab-aad861447308,[0.68461538 0.54615385 0.54923077 ... 0.490769...,"[1, 1, 1, 4, 4, 6, 6]"
4,4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_3x2', 'ltm8_4x2', 'ltm8_5x1...",1cc7b7bb-cd90-485f-b744-cf846d566675,[0.7113164 0.66281755 0.66281755 ... 0.501154...,"[1, 2, 2, 1, 4, 8, 3, 3, 5]"


In [33]:
len(empirical_train_df_subset_loaded['squiggle'].to_numpy()[0])

47

In [61]:
squiggles = empirical_train_df_subset_loaded['squiggle'].to_numpy()

In [116]:
empirical_train_df_subset.to_pickle("empirical_train_dataset_v2.pkl")

In [28]:
len(empirical_train_df_subset_loaded['squiggle'].to_numpy()[3])

NameError: name 'empirical_train_df_subset_loaded' is not defined

In [None]:
from sklearn.preprocessing import MinMaxScaler

t = [3.4, 2.3, 4.6]

MinMaxScaler().fit_transform(x.reshape(-1, 1)).reshape(-1)

In [2]:

import pandas as pd

empirical_test_df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\empirical_test_dataset_v3.pkl")

In [3]:
empirical_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.5791722622390428, 0.5589061164692956, 0.557...","[1, 8, 4, 1, 8, 8, 4]"
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.694006309148265, 0.7634069400630915, 0.7555...","[1, 1, 7, 8, 4, 4, 7]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x6, ltm8_5x6, ltm8_6x5, ltm8_...",03022dc4-6086-481a-bb42-c935846ab94c,"[0.7538994800693242, 0.48526863084922006, 0.45...","[1, 6, 6, 5, 8, 4, 5]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x3, ltm8_4x5, ltm8_5x1, ltm8_...",1a7bac06-9b7b-4cb5-88f2-bdfc19772c4e,"[0.5326492537313433, 0.396455223880597, 0.3973...","[1, 3, 5, 1, 6, 7, 1]"
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x3, ltm8_4x4, ltm8_5x4, ltm8_...",2c9c7e7b-51b6-468b-a3cf-75e48085c59f,"[0.5729459162495422, 0.5515810035404712, 0.553...","[1, 3, 4, 4, 2, 4, 3]"


### Adding spacers

In [48]:

empirical_dataset_path = r"datasets\empirical\empirical_train_dataset_v3.pkl"

empirical_train_df_spacers = pd.read_pickle(empirical_dataset_path)

In [49]:
empirical_train_df_spacers.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_2x1', 'ltm8_3x3', 'ltm8_3x5', 'ltm8_4x1...",0804c886-cd0a-4ece-87ee-adb529974699,"[0.7250000000000001, 0.7350000000000001, 0.752...","[1, 3, 5, 1, 1, 4, 5]"
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_3x5', 'ltm8_4x1', 'ltm8_4x1', 'ltm8_6x7...",0f041c54-7071-49d3-8ae2-7a1bf25525ab,"[0.525, 0.4845238095238096, 0.5035714285714287...","[5, 1, 1, 7, 7, 7, 8]"
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_4x2', 'ltm8_5x7...",1361a5db-d135-4e98-bb49-7a53c8d72991,"[0.6675799086757991, 0.5095890410958903, 0.515...","[1, 1, 2, 7, 4, 5, 6, 5]"
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_2x1', 'ltm8_3x4...",1b4284d2-bee2-4a15-abab-aad861447308,"[0.6846153846153846, 0.5461538461538461, 0.549...","[1, 1, 1, 4, 4, 6, 6]"
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_3x2', 'ltm8_4x2', 'ltm8_5x1...",1cc7b7bb-cd90-485f-b744-cf846d566675,"[0.7113163972286374, 0.6628175519630484, 0.662...","[1, 2, 2, 1, 4, 8, 3, 3, 5]"


In [81]:

def extract_cycle_motif(library_motif):
    
        if library_motif[6] == 'x':
            cycle_number = int(library_motif[5])
            motif_number = int(library_motif[7])
        else:
            cycle_number = int(library_motif[5:7])
            motif_number = int(library_motif[8])
        

        return cycle_number, motif_number

def create_spacer_sequence(library_motifs):

    cycle_number = 0
    motif_number = 0

    spacer_sequence = []

    for i in library_motifs:

        cycle_number, motif_number = extract_cycle_motif(i)

        # So we add a prior spacer and then a post spacer, cause that's what a motif is
        spacer_sequence.append(cycle_number + 8)
        spacer_sequence.append(motif_number)
        spacer_sequence.append(cycle_number + 8)

        if i == 1:
            motif_number+=1
        else:
            cycle_number+=1

    return spacer_sequence

spacer_sequences = []
for i in sampled_test_df['Library_Motifs']:
    spacer_sequences.append(create_spacer_sequence(i))

In [56]:

def create_payload_spacer_sequence(payload):

    payload_sequence = []

    cycle_number = 9

    for i in payload:
        for j in i:
            payload_sequence.append(cycle_number)
            payload_sequence.append(j)
            payload_sequence.append(cycle_number)
        cycle_number+=1

    return payload_sequence



create_payload_spacer_sequence(sampled_test_df['Payload'].to_numpy()[0])

2


In [50]:
spacer_sequences

[[10, 8, 10, 14, 1, 14, 16, 3, 16, 18, 7, 18],
 [11, 3, 11, 13, 5, 13, 14, 5, 14, 18, 6, 18],
 [14, 6, 14, 15, 2, 15, 16, 4, 16],
 [13, 1, 13, 14, 6, 14],
 [9, 5, 9],
 [11, 2, 11, 12, 6, 12, 14, 1, 14, 15, 8, 15, 16, 3, 16, 18, 8, 18],
 [11, 8, 11, 14, 3, 14],
 [11, 3, 11, 13, 7, 13, 14, 3, 14, 17, 3, 17],
 [10, 2, 10, 11, 4, 11, 12, 4, 12],
 [9,
  4,
  9,
  10,
  5,
  10,
  12,
  2,
  12,
  13,
  3,
  13,
  13,
  6,
  13,
  14,
  8,
  14,
  17,
  2,
  17,
  18,
  7,
  18],
 [9, 1, 9, 10, 1, 10, 14, 3, 14],
 [17, 3, 17, 18, 7, 18],
 [9, 3, 9, 10, 6, 10, 11, 1, 11],
 [9, 1, 9, 12, 6, 12, 18, 5, 18],
 [16, 3, 16, 18, 1, 18],
 [10, 6, 10, 11, 1, 11, 13, 2, 13, 14, 2, 14, 15, 4, 15, 16, 2, 16],
 [11, 1, 11, 12, 6, 12, 13, 5, 13],
 [9, 7, 9, 17, 1, 17, 18, 4, 18],
 [11, 2, 11, 14, 1, 14, 16, 8, 16],
 [9, 5, 9, 10, 5, 10, 11, 5, 11, 12, 2, 12, 13, 7, 13, 18, 3, 18],
 [10, 2, 10, 11, 1, 11, 11, 6, 11, 12, 7, 12],
 [11, 1, 11, 13, 1, 13, 14, 3, 14, 15, 8, 15],
 [12, 4, 12],
 [9, 1, 9, 12, 2, 1

Unique spacers for each cycle - assigning both types of spacers to the same type - it should have enough data to classify it anyway. However, there is a lot of space for ambiguity so the ground truth might be useful. Anyway, let's put this on and see what happens.

In [82]:
sampled_test_df['Spacer_Sequence'] = spacer_sequences

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_test_df['Spacer_Sequence'] = spacer_sequences


In [83]:
sampled_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence
0,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_2x1, ltm8_3x8, ltm8_4x4, ltm8_5x1, ltm8_...",0038e7e2-ab7a-4e8e-a9b5-d39eefa8b0f2,"[0.5791722622390428, 0.5589061164692956, 0.557...","[1, 8, 4, 1, 8, 8, 4]","[10, 1, 10, 11, 8, 11, 12, 4, 12, 13, 1, 13, 1..."
1,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_3x6, ltm8_5x4, ltm8_6x1]",0073a9a2-8ee0-4332-9722-72837b77b29c,"[0.5748992796972286, 0.5625686729337077, 0.565...","[1, 6, 4, 1]","[9, 1, 9, 11, 6, 11, 13, 4, 13, 14, 1, 14]"
2,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_4x8, ltm8_5x4]",00e4308c-baf1-49b1-848a-9c7ff35971e5,"[0.9999999999999999, 0.6249999999999999, 0.613...","[8, 4]","[12, 8, 12, 13, 4, 13]"
3,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_1x1, ltm8_2x1, ltm8_3x7, ltm8_4x8, ltm8_...",017a284c-952d-422e-ba1a-c4d88e06a3ac,"[0.694006309148265, 0.7634069400630915, 0.7555...","[1, 1, 7, 8, 4, 4, 7]","[9, 1, 9, 10, 1, 10, 11, 7, 11, 12, 8, 12, 13,..."
4,5,barcode_external01_internal01,"[[3, 6, 7, 8], [3, 4, 5, 8], [1, 2, 4, 6], [1,...","[ltm8_6x8, ltm8_7x1]",0182d415-b83b-4eeb-9bcd-4f217aba5e8c,"[0.5516483516483517, 0.6747252747252748, 0.698...","[8, 1]","[14, 8, 14, 15, 1, 15]"


In [84]:
len(sampled_test_df)

69048

In [87]:
sampled_test_df.to_pickle("full_test_dataset_v4_spacers.pkl")

In [11]:
empirical_test_df.to_pickle("datasets\empirical\empirical_test_dataset_v4_spacers.pkl")

### Load and verify integrity of data

In [12]:
empirical_test_df_spacers = pd.read_pickle("datasets\empirical\empirical_test_dataset_v4_spacers.pkl")

In [None]:
sampled_test_df = pd.read_pickle("datasets\empirical\sampled_test_dataset_v4_spacers.pkl")

In [54]:
sampled_test_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence
17549,5,barcode_external01_internal08,"[[2, 4, 6, 8], [2, 3, 4, 8], [2, 3, 7, 8], [1,...","[ltm8_2x8, ltm8_6x1, ltm8_8x3, ltm8_10x7]",2163d2a1-608c-4493-918a-55a719f244fc,"[0.5377697841726619, 0.6133093525179857, 0.622...","[8, 1, 3, 7]","[10, 8, 10, 14, 1, 14, 16, 3, 16, 18, 7, 18]"
62207,53,barcode_external07_internal06,"[[2, 3, 5, 7], [2, 4, 6, 7], [2, 5, 6, 8], [1,...","[ltm8_3x3, ltm8_5x5, ltm8_6x5, ltm8_10x6]",631b887a-5241-4c6e-ac70-6e2beaa8acde,"[0.9029611130931146, 0.8811987156617909, 0.863...","[3, 5, 5, 6]","[11, 3, 11, 13, 5, 13, 14, 5, 14, 18, 6, 18]"
68815,53,barcode_external04_internal08,"[[1, 2, 3, 8], [1, 3, 4, 7], [3, 5, 6, 8], [4,...","[ltm8_6x6, ltm8_7x2, ltm8_8x4]",fcdfb470-efa7-453f-b79e-ecc0dc6ba644,"[0.675564681724846, 0.6255989048596851, 0.6290...","[6, 2, 4]","[14, 6, 14, 15, 2, 15, 16, 4, 16]"
2132,5,barcode_external06_internal01,"[[3, 4, 6, 8], [4, 6, 7, 8], [2, 5, 6, 7], [1,...","[ltm8_5x1, ltm8_6x6]",ca1bbe02-7552-48a7-8df5-a07926990e12,"[0.5178268251273345, 0.565365025466893, 0.6298...","[1, 6]","[13, 1, 13, 14, 6, 14]"
69109,53,barcode_external05_internal08,"[[2, 6, 7, 8], [1, 3, 4, 7], [1, 3, 6, 8], [1,...",[ltm8_1x5],852cfd4d-e177-41aa-a2b5-70775484ef11,"[0.568672933707728, 0.5532901965571969, 0.5539...",[5],"[9, 5, 9]"


## Adding Spacer Sequence

In [3]:

import pandas as pd

empirical_train_df = pd.read_pickle(r"datasets\empirical\empirical_train_dataset_v4_spacers.pkl")

empirical_test_df = pd.read_pickle(r"datasets\empirical\full_test_dataset_v4_spacers.pkl")

In [4]:
empirical_train_df.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_2x1', 'ltm8_3x3', 'ltm8_3x5', 'ltm8_4x1...",0804c886-cd0a-4ece-87ee-adb529974699,"[0.7250000000000001, 0.7350000000000001, 0.752...","[1, 3, 5, 1, 1, 4, 5]","[10, 1, 10, 11, 3, 11, 11, 5, 11, 12, 1, 12, 1..."
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_3x5', 'ltm8_4x1', 'ltm8_4x1', 'ltm8_6x7...",0f041c54-7071-49d3-8ae2-7a1bf25525ab,"[0.525, 0.4845238095238096, 0.5035714285714287...","[5, 1, 1, 7, 7, 7, 8]","[11, 5, 11, 12, 1, 12, 12, 1, 12, 14, 7, 14, 1..."
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_4x2', 'ltm8_5x7...",1361a5db-d135-4e98-bb49-7a53c8d72991,"[0.6675799086757991, 0.5095890410958903, 0.515...","[1, 1, 2, 7, 4, 5, 6, 5]","[9, 1, 9, 10, 1, 10, 12, 2, 12, 13, 7, 13, 14,..."
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_2x1', 'ltm8_3x4...",1b4284d2-bee2-4a15-abab-aad861447308,"[0.6846153846153846, 0.5461538461538461, 0.549...","[1, 1, 1, 4, 4, 6, 6]","[9, 1, 9, 10, 1, 10, 10, 1, 10, 11, 4, 11, 13,..."
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_3x2', 'ltm8_4x2', 'ltm8_5x1...",1cc7b7bb-cd90-485f-b744-cf846d566675,"[0.7113163972286374, 0.6628175519630484, 0.662...","[1, 2, 2, 1, 4, 8, 3, 3, 5]","[9, 1, 9, 11, 2, 11, 12, 2, 12, 13, 1, 13, 14,..."


In [23]:

def create_payload_spacer_sequence(payload):

    payload_sequence = []

    cycle_number = 11

    if type(payload) == str:
        payload = eval(payload)

    for i in payload:
        for j in i:
            payload_sequence.append(cycle_number)
            payload_sequence.append(j)
            payload_sequence.append(cycle_number)
        cycle_number+=1

    return payload_sequence

In [26]:

empirical_train_df['Payload_Sequence'] = empirical_train_df['Payload'].apply(lambda x: create_payload_spacer_sequence(x))

empirical_test_df['Payload_Sequence'] = empirical_test_df['Payload'].apply(lambda x: create_payload_spacer_sequence(x))

empirical_train_df.to_pickle("datasets\empirical\empirical_train_dataset_v5_payload_seq.pkl")

empirical_test_df.to_pickle(r"datasets\empirical\full_empirical_test_dataset_v5_payload_seq.pkl")

In [12]:
def sort_transcript(transcript):

    cycles = [[] for i in range(8)]
    
    if type(transcript) == str:
        transcript = transcript.split()
    
    split_transcript = [int(i) for i in transcript if i != '']
    
    for i in range(len(split_transcript)):

        found_motif = split_transcript[i]

        # If we have a payload motif
        if found_motif < 9:

            # finding the spacers - only for payload cycles
            if i > 0:

                # Checking for Back Spacer
                if split_transcript[i-1] > 10:
                    cycle_number = split_transcript[i-1] - 11
                    cycles[cycle_number].append(split_transcript[i])

                # Checking for Forward Spacer
                elif i < len(split_transcript) - 1:
                    if split_transcript[i+1] > 10:
                        cycle_number = split_transcript[i+1] - 11
                        cycles[cycle_number].append(split_transcript[i])

            else:
                if i < len(split_transcript) - 1:
                    # Checking for Forward Spacer
                    if split_transcript[i+1] > 10:
                        cycle_number = split_transcript[i+1] - 11
                        cycles[cycle_number].append(split_transcript[i])   

    return cycles


def create_payload_spacer_sequence(payload):

    payload_sequence = []

    cycle_number = 11

    if type(payload) == str:
        payload = eval(payload)

    for i in payload:
        for j in i:
            payload_sequence.append(cycle_number)
            payload_sequence.append(j)
            payload_sequence.append(cycle_number)
        cycle_number+=1

    return payload_sequence

## Cleaning Dataset
Removing all motif errors from the label (and the address labels)


In [2]:
import pandas as pd

dataset = pd.read_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\empirical_train_dataset_v5_payload_seq.pkl')

In [3]:
dataset.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence,Payload_Sequence
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_2x1', 'ltm8_3x3', 'ltm8_3x5', 'ltm8_4x1...",0804c886-cd0a-4ece-87ee-adb529974699,"[0.7250000000000001, 0.7350000000000001, 0.752...","[1, 3, 5, 1, 1, 4, 5]","[10, 1, 10, 11, 3, 11, 11, 5, 11, 12, 1, 12, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_3x5', 'ltm8_4x1', 'ltm8_4x1', 'ltm8_6x7...",0f041c54-7071-49d3-8ae2-7a1bf25525ab,"[0.525, 0.4845238095238096, 0.5035714285714287...","[5, 1, 1, 7, 7, 7, 8]","[11, 5, 11, 12, 1, 12, 12, 1, 12, 14, 7, 14, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_4x2', 'ltm8_5x7...",1361a5db-d135-4e98-bb49-7a53c8d72991,"[0.6675799086757991, 0.5095890410958903, 0.515...","[1, 1, 2, 7, 4, 5, 6, 5]","[9, 1, 9, 10, 1, 10, 12, 2, 12, 13, 7, 13, 14,...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_2x1', 'ltm8_2x1', 'ltm8_3x4...",1b4284d2-bee2-4a15-abab-aad861447308,"[0.6846153846153846, 0.5461538461538461, 0.549...","[1, 1, 1, 4, 4, 6, 6]","[9, 1, 9, 10, 1, 10, 10, 1, 10, 11, 4, 11, 13,...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","['ltm8_1x1', 'ltm8_3x2', 'ltm8_4x2', 'ltm8_5x1...",1cc7b7bb-cd90-485f-b744-cf846d566675,"[0.7113163972286374, 0.6628175519630484, 0.662...","[1, 2, 2, 1, 4, 8, 3, 3, 5]","[9, 1, 9, 11, 2, 11, 12, 2, 12, 13, 1, 13, 14,...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."


In [10]:
library_motifs = dataset['Spacer_Sequence'].to_list()
payload_motifs = dataset['Payload'].to_list()

In [11]:
library_motifs = [sort_transcript(i) for i in library_motifs]

In [19]:
counter = 0

corrected_library_motifs = []
corrected_spacer_sequences = []
for i, j in zip(library_motifs, payload_motifs):
    corrected_cycles = []
    j = eval(j)
    for library_cycle, payload_cycle in zip(i, j):
        corrected_cycle = list(set([motif for motif in library_cycle if motif in payload_cycle]))
        incorrect_cycles = [motif for motif in library_cycle if motif not in payload_cycle]
        counter += len(incorrect_cycles)
        corrected_cycles.append(corrected_cycle)

    corrected_library_motifs.append(corrected_cycles)
    corrected_spacer_sequences.append(create_payload_spacer_sequence(corrected_cycles))
    print(counter)


0
0
1
1
1
1
1
5
5
8
8
12
12
12
12
12
12
17
17
17
17
19
19
23
23
23
23
23
23
27
27
27
27
27
27
27
27
27
27
27
27
27
31
31
31
31
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
33
36
36
40
40
40
40
40
40
40
41
41
41
41
41
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
43
47
47
47
47
47
47
47
47
47
47
47
47
47
47
47
47
49
49
49
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
54
59
59
59
59
59
59
59
59
59
59
59
59
59
59
59
63
63
63
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
68
71
71
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
72
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75


In [20]:
corrected_library_motifs

[[[3, 5], [1], [1], [4], [5], [], [], []],
 [[5], [1], [], [7], [7], [8], [], []],
 [[], [2], [], [4], [5], [6], [5], []],
 [[4], [], [4], [6], [], [6], [], []],
 [[2], [2], [1], [4], [8], [3], [3], [5]],
 [[], [], [5], [6], [2], [3], [3], [7]],
 [[4], [], [], [8], [2], [5], [1, 5], []],
 [[3], [2], [], [4], [], [], [], []],
 [[], [8], [6], [4], [2], [], [5], []],
 [[], [8], [1], [7], [7], [], [], []],
 [[2], [7], [], [7], [5], [6], [5], [6]],
 [[], [], [], [8], [7], [], [], []],
 [[3], [], [], [8], [7], [5], [], [1]],
 [[2], [1], [6], [], [], [3], [1], []],
 [[5], [1], [6], [7], [], [], [1], [6]],
 [[4], [1], [1], [], [], [], [5], [5]],
 [[], [1], [1], [], [5], [3], [5], []],
 [[], [], [4, 6], [], [], [], [], []],
 [[], [1], [6], [4], [], [], [7], [5]],
 [[], [8], [5, 6], [], [], [8], [3], []],
 [[5], [1], [6], [4], [7], [6], [], [6]],
 [[], [], [], [], [7], [6], [7], [6]],
 [[], [], [1], [8, 7], [8], [3, 6], [], []],
 [[], [7], [], [], [5], [], [], []],
 [[], [1], [1, 5], [4], [7], [

In [23]:
dataset['Spacer_Sequence']

0        [10, 1, 10, 11, 3, 11, 11, 5, 11, 12, 1, 12, 1...
1        [11, 5, 11, 12, 1, 12, 12, 1, 12, 14, 7, 14, 1...
2        [9, 1, 9, 10, 1, 10, 12, 2, 12, 13, 7, 13, 14,...
3        [9, 1, 9, 10, 1, 10, 10, 1, 10, 11, 4, 11, 13,...
4        [9, 1, 9, 11, 2, 11, 12, 2, 12, 13, 1, 13, 14,...
                               ...                        
23934    [9, 8, 9, 12, 4, 12, 13, 1, 13, 14, 8, 14, 15,...
23935    [9, 8, 9, 10, 8, 10, 12, 4, 12, 13, 2, 13, 14,...
23936    [9, 8, 9, 11, 3, 11, 12, 2, 12, 13, 4, 13, 14,...
23937    [10, 8, 10, 11, 4, 11, 12, 2, 12, 13, 4, 13, 1...
23938    [9, 8, 9, 10, 8, 10, 11, 5, 11, 12, 5, 12, 13,...
Name: Spacer_Sequence, Length: 23391, dtype: object

In [24]:
corrected_spacer_sequences

[[11, 3, 11, 11, 5, 11, 12, 1, 12, 13, 1, 13, 14, 4, 14, 15, 5, 15],
 [11, 5, 11, 12, 1, 12, 14, 7, 14, 15, 7, 15, 16, 8, 16],
 [12, 2, 12, 14, 4, 14, 15, 5, 15, 16, 6, 16, 17, 5, 17],
 [11, 4, 11, 13, 4, 13, 14, 6, 14, 16, 6, 16],
 [11,
  2,
  11,
  12,
  2,
  12,
  13,
  1,
  13,
  14,
  4,
  14,
  15,
  8,
  15,
  16,
  3,
  16,
  17,
  3,
  17,
  18,
  5,
  18],
 [13, 5, 13, 14, 6, 14, 15, 2, 15, 16, 3, 16, 17, 3, 17, 18, 7, 18],
 [11, 4, 11, 14, 8, 14, 15, 2, 15, 16, 5, 16, 17, 1, 17, 17, 5, 17],
 [11, 3, 11, 12, 2, 12, 14, 4, 14],
 [12, 8, 12, 13, 6, 13, 14, 4, 14, 15, 2, 15, 17, 5, 17],
 [12, 8, 12, 13, 1, 13, 14, 7, 14, 15, 7, 15],
 [11, 2, 11, 12, 7, 12, 14, 7, 14, 15, 5, 15, 16, 6, 16, 17, 5, 17, 18, 6, 18],
 [14, 8, 14, 15, 7, 15],
 [11, 3, 11, 14, 8, 14, 15, 7, 15, 16, 5, 16, 18, 1, 18],
 [11, 2, 11, 12, 1, 12, 13, 6, 13, 16, 3, 16, 17, 1, 17],
 [11, 5, 11, 12, 1, 12, 13, 6, 13, 14, 7, 14, 17, 1, 17, 18, 6, 18],
 [11, 4, 11, 12, 1, 12, 13, 1, 13, 17, 5, 17, 18, 5, 18],
 [12

In [25]:
dataset['Spacer_Sequence'] = corrected_spacer_sequences
dataset['Library_Motifs'] = corrected_library_motifs

In [26]:
dataset.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence,Payload_Sequence
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[3, 5], [1], [1], [4], [5], [], [], []]",0804c886-cd0a-4ece-87ee-adb529974699,"[0.7250000000000001, 0.7350000000000001, 0.752...","[1, 3, 5, 1, 1, 4, 5]","[11, 3, 11, 11, 5, 11, 12, 1, 12, 13, 1, 13, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[5], [1], [], [7], [7], [8], [], []]",0f041c54-7071-49d3-8ae2-7a1bf25525ab,"[0.525, 0.4845238095238096, 0.5035714285714287...","[5, 1, 1, 7, 7, 7, 8]","[11, 5, 11, 12, 1, 12, 14, 7, 14, 15, 7, 15, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[], [2], [], [4], [5], [6], [5], []]",1361a5db-d135-4e98-bb49-7a53c8d72991,"[0.6675799086757991, 0.5095890410958903, 0.515...","[1, 1, 2, 7, 4, 5, 6, 5]","[12, 2, 12, 14, 4, 14, 15, 5, 15, 16, 6, 16, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[4], [], [4], [6], [], [6], [], []]",1b4284d2-bee2-4a15-abab-aad861447308,"[0.6846153846153846, 0.5461538461538461, 0.549...","[1, 1, 1, 4, 4, 6, 6]","[11, 4, 11, 13, 4, 13, 14, 6, 14, 16, 6, 16]","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[2], [2], [1], [4], [8], [3], [3], [5]]",1cc7b7bb-cd90-485f-b744-cf846d566675,"[0.7113163972286374, 0.6628175519630484, 0.662...","[1, 2, 2, 1, 4, 8, 3, 3, 5]","[11, 2, 11, 12, 2, 12, 13, 1, 13, 14, 4, 14, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."


In [28]:
dataset.to_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\empirical_train_dataset_v6.pkl')

In [29]:
dd = pd.read_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\empirical_train_dataset_v6.pkl')

In [30]:
dd.head()

Unnamed: 0,ONT_Barcode,HW_Address,Payload,Library_Motifs,read_id,squiggle,Motifs,Spacer_Sequence,Payload_Sequence
0,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[3, 5], [1], [1], [4], [5], [], [], []]",0804c886-cd0a-4ece-87ee-adb529974699,"[0.7250000000000001, 0.7350000000000001, 0.752...","[1, 3, 5, 1, 1, 4, 5]","[11, 3, 11, 11, 5, 11, 12, 1, 12, 13, 1, 13, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
1,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[5], [1], [], [7], [7], [8], [], []]",0f041c54-7071-49d3-8ae2-7a1bf25525ab,"[0.525, 0.4845238095238096, 0.5035714285714287...","[5, 1, 1, 7, 7, 7, 8]","[11, 5, 11, 12, 1, 12, 14, 7, 14, 15, 7, 15, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
2,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[], [2], [], [4], [5], [6], [5], []]",1361a5db-d135-4e98-bb49-7a53c8d72991,"[0.6675799086757991, 0.5095890410958903, 0.515...","[1, 1, 2, 7, 4, 5, 6, 5]","[12, 2, 12, 14, 4, 14, 15, 5, 15, 16, 6, 16, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
3,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[4], [], [4], [6], [], [6], [], []]",1b4284d2-bee2-4a15-abab-aad861447308,"[0.6846153846153846, 0.5461538461538461, 0.549...","[1, 1, 1, 4, 4, 6, 6]","[11, 4, 11, 13, 4, 13, 14, 6, 14, 16, 6, 16]","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
4,1,barcode_external01_internal01,"[[2, 3, 4, 5], [1, 2, 7, 8], [1, 4, 5, 6], [4,...","[[2], [2], [1], [4], [8], [3], [3], [5]]",1cc7b7bb-cd90-485f-b744-cf846d566675,"[0.7113163972286374, 0.6628175519630484, 0.662...","[1, 2, 2, 1, 4, 8, 3, 3, 5]","[11, 2, 11, 12, 2, 12, 13, 1, 13, 14, 4, 14, 1...","[11, 2, 11, 11, 3, 11, 11, 4, 11, 11, 5, 11, 1..."
