## Empirical dataset creation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from typing import List
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from utils import sort_transcript
from evaluation import evaluate_cycle_prediction
from transcript_sorting import sort_transcript, create_reduced_spacer_transcript, sort_transcript_reduced_spacers

In [None]:
train_dataset_path = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\empirical_train_dataset_v6.pkl"
test_dataset_path = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\datasets\empirical\full_empirical_test_dataset_v5_payload_seq.pkl"
motif_search_barcoded = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\motif_search_barcoded.csv"

In [None]:
train_df = pd.read_pickle(train_dataset_path)

In [None]:
orientation_df = pd.read_csv(motif_search_barcoded)

In [None]:
cols_to_add = [col for col in orientation_df.columns if col not in train_df.columns or col == 'read_id']

In [None]:
merged_df = pd.merge(train_df, orientation_df[cols_to_add], on='read_id')

In [None]:
train_df

In [None]:
def get_df_for_barcode_address(df: pd.DataFrame, barcode: int, address: str) -> pd.DataFrame:
    """Select the '+'-oriented reads of a single barcode/address pair.

    Args:
        df: Frame providing the 'ONT_Barcode', 'HW_Address' and
            'orientation' columns.
        barcode: Value compared for equality against 'ONT_Barcode'.
        address: Value compared for equality against 'HW_Address'.

    Returns:
        The rows of ``df`` that match both keys and whose 'orientation'
        string starts with '+'.
    """
    barcode_matches = df['ONT_Barcode'] == barcode
    address_matches = df['HW_Address'] == address
    is_forward = df['orientation'].str.startswith('+')
    return df.loc[barcode_matches & address_matches & is_forward]

def sort_sequences_by_length(spacer_sequences: List[str], read_ids: List[str]):
    """Order sequences from longest to shortest, keeping read ids aligned.

    Sequences of equal length keep their original relative order.

    Returns:
        A ``(sequences, read_ids)`` tuple of two new lists in which position
        ``k`` of both lists refers to the same original entry.
    """
    order = list(range(len(spacer_sequences)))
    # An ascending stable sort on the negated length is a stable descending
    # sort on the length itself.
    order.sort(key=lambda position: -len(spacer_sequences[position]))

    ordered_sequences = [spacer_sequences[position] for position in order]
    ordered_read_ids = [read_ids[position] for position in order]
    return ordered_sequences, ordered_read_ids
        

def get_longest_sequence(spacer_sequences: List[str], read_ids: List[str]):
    """Return the longest sequence together with the read id it belongs to.

    Sequences and read ids are paired positionally. On ties the first
    sequence of maximal length wins.

    Returns:
        ``(sequence, read_id)``. When there is no non-empty sequence (empty
        input, or only empty sequences) the result is ``("", "")``.
    """
    # The "nothing found" sentinel used to be the integer 0, which leaked an
    # int to callers expecting a sequence; an empty string keeps the type
    # consistent and is still falsy.
    max_seq = ""
    read_id = ""
    for seq, r_id in zip(spacer_sequences, read_ids):
        # Strict comparison keeps the first of several equally long sequences.
        if len(seq) > len(max_seq):
            max_seq = seq
            read_id = r_id

    return max_seq, read_id

In [None]:

seqs_arr = []
read_ids_arr = []

for barcode in tqdm(merged_df['ONT_Barcode'].unique()):
    for address in merged_df['HW_Address'].unique():
        selected_df = get_df_for_barcode_address(
            df=merged_df, barcode=barcode, address=address)
        
        #seq, read_id = get_longest_sequence(
        #    selected_df['Spacer_Sequence'], selected_df['read_id'])
        
        seqs, read_ids = sort_sequences_by_length(
            selected_df['Spacer_Sequence'].tolist(), selected_df['read_id'].tolist())

        
        seqs_arr.extend(seqs[:10])
        read_ids_arr.extend(read_ids[:10])


In [None]:
def create_reduced_spacer_transcript(motif_seq: List[int]) -> List[int]:
    """Rebuild a transcript with one opening and one closing spacer per cycle.

    The tokens of ``motif_seq`` are joined with single spaces and passed to
    ``sort_transcript``. For every non-empty cycle it returns (0-based index
    ``k``), the output receives the spacer token ``k + 11``, the distinct
    entries of that cycle (in ``set`` iteration order), and the spacer token
    again. Empty cycles contribute nothing.

    Example given by the original author (not verified here):
    12 4 12 12 3 12 -> 12 4 2 3 4 12 13 2 4 5 3 13

    Note: this definition shadows the function of the same name imported
    from ``transcript_sorting`` at the top of the notebook.
    """
    joined_tokens = " ".join(str(token) for token in motif_seq)

    reduced: List[int] = []
    for cycle_index, cycle_motifs in enumerate(sort_transcript(joined_tokens)):
        if len(cycle_motifs) == 0:
            continue

        spacer = cycle_index + 11
        reduced += [spacer, *set(cycle_motifs), spacer]

    return reduced
    

In [None]:
for i in seqs_arr:
    sorted_transcript = sort_transcript(i)
    reduced_spacers_str = create_reduced_spacer_transcript(i)
    reduced_spacers_transcript = sort_transcript_reduced_spacers(reduced_spacers_str)

    print(sorted_transcript)
    print(reduced_spacers_transcript)
    print()
    

In [None]:
plt.hist([len(i)/3 for i in seqs_arr])

In [None]:
filtered_df = merged_df[merged_df['read_id'].isin(read_ids_arr)]

In [None]:
filtered_df = filtered_df.rename(columns={'Spacer_Sequence': 'motif_seq'})

In [None]:
filtered_df['motif_seq'] = filtered_df['motif_seq'].apply(create_reduced_spacer_transcript)

In [None]:
filtered_df.to_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\pickled_datasets\cleaned_+_reduced_spacers_5_per_read.pkl")

In [None]:
filtered_df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\pickled_datasets\cleaned_+_reduced_spacers_5_per_cycle.pkl")


filtered_df

In [None]:
for i in filtered_df['motif_seq']:
    print(i)
    t = sort_transcript_reduced_spacers(" ".join([str(k) for k in i]))
    print(t)
    t = [list(set(f)) for f in t]
    print(t)
    print()

In [None]:
import pandas as pd

In [None]:
train_filt = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\master_train.pkl")

In [None]:
train_filt

In [None]:
train_filt_ = train_filt.loc[(train_filt['orientation'].str.startswith('-')) & (train_filt['payload_motifs_found'] > 3)]

In [None]:
len(train_filt_)

In [None]:
train_filt_

In [None]:
train_filt_.to_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\reverse_oriented.pkl")

In [None]:
train_filt['payload_motifs_found'] > 4

In [None]:
# `read_pickle` is a pandas module-level function, not a DataFrame method:
# reload the frame written above and rebind it.
train_filt_ = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\reverse_oriented.pkl")

### Fixing reverse oriented labels

In [None]:
import pandas as pd

In [None]:
df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\edit_master_train.pkl")

In [None]:
rev_df = df.loc[df['strand'].str.startswith('-')]

In [None]:
rev_df = rev_df.loc[rev_df['edit_motifs_found'] > 7]

In [None]:
# `rev_df` is a filtered slice of `df`; build a new frame with `.assign`
# rather than writing a column into the slice (avoids SettingWithCopyWarning).
# Note: re-running this cell reverses the sequences back again.
rev_df = rev_df.assign(edit_spacer_seq=rev_df['edit_spacer_seq'].apply(lambda x: x[::-1]))

In [None]:
rev_df.to_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\edit_train_rev.pkl')

### Testing labels for classifier

In [None]:
df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\edit_distance_motif_search\edit_train_filtered_reverse.pkl")

In [None]:
df['edit_spacer_seq'] = df['edit_spacer_seq'].apply(lambda x: x[::-1])

### Testing motif search labels

In [None]:
import pandas as pd

In [None]:
master_db = pd.read_csv(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\master_db.csv")

In [None]:
filtered_df = master_db.loc[~master_db['HW_Address'].str.startswith('unclassified')]

In [None]:
from data_functions import get_cleaned_encoded_file
from utils import evaluate_prediction, create_spacer_sequence_with_address
import matplotlib.pyplot as plt
from tqdm import tqdm
from transcript_sorting import sort_transcript_with_address

In [None]:
encoded_df = pd.read_csv(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\HELIX01-04-encoded.csv")

In [None]:
encoded_cleaned = get_cleaned_encoded_file(encoded_df, address=True)

In [None]:
merged_df = pd.merge(filtered_df, encoded_cleaned, on='HW_Address')

In [None]:
merged_df

In [None]:
def sort_library_motif_transcript(library_prediction, encoded, library_typos=True):
    """Bucket the motifs of a '|'-separated library prediction by cycle.

    Each '|'-separated entry of at least 8 characters is decoded from its
    tail: the last character is the motif (parsed as an int) and the
    characters at positions -4 and -3 select the cycle:

    * '1' followed by '0' -> cycle 10;
    * '1' followed by anything else -> entry ignored;
    * '9' at position -4 -> entry ignored;
    * otherwise the digit at position -3 is the 1-based cycle.

    Args:
        library_prediction: '|'-separated prediction string.
        encoded: Reference passed through to ``evaluate_prediction``.
        library_typos: Accepted but not used by this implementation.

    Returns:
        ``(sorted_prediction, mf, me)`` where ``sorted_prediction`` is a list
        of 10 per-cycle motif lists and ``mf, me`` are the two values that
        ``evaluate_prediction`` returns for cycles 3-10
        (``sorted_prediction[2:]``) against ``encoded``.
    """
    sorted_prediction = [[] for _ in range(10)]

    for entry in library_prediction.split('|'):
        if len(entry) < 8:
            continue

        motif_found = int(entry[-1])
        tens_char, units_char = entry[-4], entry[-3]

        if tens_char == '9':
            continue
        if tens_char == '1':
            # Only "10" is a valid two-digit cycle; other "1x" are dropped.
            if units_char == '0':
                sorted_prediction[10 - 1].append(motif_found)
            continue

        # Cycles are 1-based in the prediction and 0-based in the list.
        # NOTE(review): a cycle digit of '0' indexes -1 (the last bucket).
        sorted_prediction[int(units_char) - 1].append(motif_found)

    mf, me = evaluate_prediction(sorted_prediction[2:], encoded)
    return sorted_prediction, mf, me
    

In [None]:
# Evaluate every read's library-motif prediction against its payload and
# collect the per-read results in parallel lists (same order as merged_df).
motifs_found = []
motif_errs = []
sorted_predictions = []
spacer_seqs = []
read_ids = []
skipped_rows = 0  # rows whose prediction could not be sorted/evaluated

for ind, row in tqdm(merged_df.iterrows(), total=len(merged_df)):

    library_prediction = row['library_motif']
    payload = row['payload']
    read_id = row['read_id']

    try:
        sorted_pred, mf, me = sort_library_motif_transcript(library_prediction, payload)
    except Exception:
        # Best-effort: keep going, but count the failure so it is visible.
        skipped_rows += 1
        continue

    motifs_found.append(mf)
    motif_errs.append(me)
    sorted_predictions.append(sorted_pred)
    spacer_seqs.append(create_spacer_sequence_with_address(sorted_pred))
    read_ids.append(read_id)

print(f"Skipped {skipped_rows} of {len(merged_df)} rows that could not be evaluated")


In [None]:
filtered_df = merged_df.loc[merged_df['read_id'].isin(read_ids)]

In [None]:
# `filtered_df` is a `.loc` slice of merged_df; attach the per-read results
# through `.assign` instead of writing columns into the slice.
# The lists are matched to rows by position, so they must have one entry per
# row of `filtered_df`, in the same order.
filtered_df = filtered_df.assign(
    sorted_predictions=sorted_predictions,
    motif_seq=spacer_seqs,
)

In [None]:
edit_seqs = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\edit_medium.pkl")

In [None]:
edit_seqs.rename(columns={"motif_seq": "edit_pred"}, inplace=True)

In [None]:
merged_df = pd.merge(filtered_df, edit_seqs, on='read_id')

### Testing edit search performance

In [None]:
rev_df.columns

In [None]:
edit_found_arr = []
edit_errs_arr = []

for ind, row in merged_df.iterrows():

    #search_pred = row['sorted_predictions']
    edit_pred = row['edit_pred']
    payload = row['payload']

    edit_found, edit_errs = evaluate_prediction(edit_pred[2:], payload)
    
    edit_found_arr.append(edit_found)
    edit_errs_arr.append(edit_errs)

In [None]:
merged_df['edit_found'] = edit_found_arr
merged_df['edit_err'] = edit_errs_arr

In [None]:
forward = merged_df.loc[merged_df['orientation'].str.startswith('+')]

In [None]:
filtered_df_ = merged_df[merged_df['edit_found'] > 6]

### Creating test and training sets

In [None]:
test_barcodes = ['barcode_external04_internal01', 'barcode_external01_internal03', 'barcode_external06_internal01', 'barcode_external01_internal02', 'barcode_external08_internal01']

In [None]:
filtered_df_ = filtered_df_.loc[~filtered_df_['HW_Address'].isin(test_barcodes)]

In [None]:
squiggle_df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\master.pkl")

In [None]:
merged_df = pd.merge(filtered_df_, squiggle_df[['squiggle', 'read_id']])

In [None]:
merged_df['edit_spacer_seq'] = merged_df['edit_pred'].apply(lambda x: create_spacer_sequence_with_address(x))

In [None]:
merged_df.to_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\finetuning_datasets\edit_train.pkl')
# Switch orientation by filtering out please

In [None]:
merged_df.to_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\finetuning_datasets\1_04_mixed.pkl')

In [None]:
test_df = master_db.loc[master_db['HW_Address'].isin(test_barcodes)]

In [None]:
merged_df = pd.merge(test_df, encoded_cleaned, on='HW_Address')

In [None]:
import pandas as pd

In [None]:
# NOTE(review): this writes to the same edit_train.pkl that an earlier cell
# wrote from `merged_df`. The most recent assignment to `df` above reads
# edit_train_filtered_reverse.pkl -- confirm that `df` is really the frame
# meant to overwrite this file before running.
df.to_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\finetuning_datasets\edit_train.pkl")

In [None]:
df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\finetuning_datasets\edit_train.pkl")

In [None]:
df.loc[df['orientation'].str.startswith('-'), 'edit_spacer_seq'] = df.loc[df['orientation'].str.startswith('-'), 'edit_spacer_seq'].str[::-1]

In [None]:
df.to_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\finetuning_datasets\edit_train.pkl")

### Training set 01-04

In [None]:
import pandas as pd

In [None]:
search_df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\01-04run\misc_datasets\zero_error_search.pkl")

In [None]:
unique_barcodes = search_df['HW_Address'].unique()

In [None]:
search_df.columns

In [None]:

# Sample up to READS_PER_STRAND forward ('+') and reverse ('-') reads for
# every HW address and collect their read ids.
READS_PER_STRAND = 5000
SAMPLING_SEED = 42  # fixed so the sampled subset is reproducible

master_read_ids = []

for barcode in unique_barcodes:
    # Exact match: `barcode` comes from this column's own unique values, so a
    # prefix match could only pull in reads of other addresses.
    filtered_df = search_df.loc[search_df['HW_Address'] == barcode]

    for strand_prefix in ('+', '-'):
        strand_df = filtered_df.loc[filtered_df['strand'].str.startswith(strand_prefix)]
        # `sample` raises when asked for more rows than exist; take what is
        # available when a strand has fewer reads than the target.
        n_reads = min(READS_PER_STRAND, len(strand_df))
        reads_sampled = strand_df.sample(n=n_reads, random_state=SAMPLING_SEED)['read_id'].tolist()
        master_read_ids.extend(reads_sampled)

In [None]:
len(set(master_read_ids))

In [None]:
len(master_read_ids)

In [None]:
sorted_df = search_df.loc[search_df['read_id'].isin(master_read_ids)]

In [None]:
# Encode the strand as 1 (starts with '+') or 0 (anything else). `sorted_df`
# is a slice of search_df, so create the column with `.assign` rather than
# assigning into the slice.
sorted_df = sorted_df.assign(
    orientation=sorted_df['strand'].apply(lambda x: 1 if x.startswith('+') else 0)
)

### Making master fastq

Go through all the FASTQ files, extract every read that is in the master db, and write those reads to a new FASTQ file.

In [None]:
import os

In [None]:

from Bio import SeqIO
from typing import List

def parse_biopython(input_fastq):
    """Lazily yield the records that ``SeqIO.parse`` reads from a FASTQ input."""
    yield from SeqIO.parse(input_fastq, 'fastq')

def get_fastq_records(fastq_filepath):
    """Return every record yielded by ``parse_biopython`` for the file, as a list."""
    return list(parse_biopython(fastq_filepath))

def create_fasta_file(ids: List[str], strands: List[str], output_filepath: str):
    """Write strands to ``output_filepath`` in FASTA form and report the path.

    Each strand becomes a ``>id`` header line, the strand itself, and one
    blank line. Ids are looked up by the strand's position, so ``ids`` needs
    at least as many entries as ``strands``.
    """
    with open(output_filepath, 'w') as fasta:
        for position, sequence in enumerate(strands):
            header = f">{ids[position]}\n"
            fasta.write(header)
            fasta.write(sequence + '\n\n')

    print(f"File saved as {output_filepath}")

In [None]:
fastq_basepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\fastq\01-04\FASTQ\pass"

In [None]:
# Assume master_read_ids is a list or something iterable
master_read_ids_set = set(master_read_ids)  # O(1) lookup

def filter_records(records):
    """Keep the records whose ``id`` (as a string) is in ``master_read_ids_set``."""
    kept = []
    for record in records:
        if str(record.id) in master_read_ids_set:
            kept.append(record)
    return kept


In [None]:
master_records = []
for file in tqdm(os.listdir(fastq_basepath)):
    records = get_fastq_records(os.path.join(fastq_basepath, file))
    master_records.extend(filter_records(records))

In [None]:
with open("output.fastq", "w") as handle:
    SeqIO.write(master_records, handle, "fastq")

### Random runs dataset creation

In [None]:
import pandas as pd
import os

In [None]:
"""
Steps
1. Load master db files into a dataframe
2. Merge encoded into the master db (will have to fit to a single run)
3. Repeat / Generalise
"""

In [None]:
basepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\sequencing_runs"

In [None]:
run_filepath = os.path.join(basepath, "HELIX-01-07-DNA-DECAY")

In [None]:
master_db_filepath = os.path.join(run_filepath, 'master_db.txt')

In [None]:

def get_master_db_df(master_db_filepath):
    """Parse a whitespace-delimited master db file into a DataFrame.

    Every non-blank line must hold at least five whitespace-separated fields,
    read in this order: read_id, filename, barcode_1, barcode_2, orientation.
    Only the first line seen for a given read_id is kept; rows appear in file
    order.

    Args:
        master_db_filepath: Path of the text file to read.

    Returns:
        DataFrame with the columns 'read_id', 'filename', 'barcode_1',
        'barcode_2' and 'orientation' (all values are strings).

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    columns = ["read_id", "filename", "barcode_1", "barcode_2", "orientation"]

    # The read ids used to be accumulated in a set and converted to a list at
    # the end; set iteration order is arbitrary, so the 'read_id' column was
    # not aligned with the other columns. The set is now used only for
    # duplicate detection and the rows themselves are kept in order.
    seen_read_ids = set()
    rows = []

    with open(master_db_filepath, 'r') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                continue  # blank line (e.g. trailing newline at end of file)

            row = (split_line[0], split_line[1], split_line[2], split_line[3], split_line[4])

            read_id = row[0]
            if read_id in seen_read_ids:
                continue
            seen_read_ids.add(read_id)
            rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    return df

In [None]:
df = get_master_db_df(master_db_filepath)

In [None]:
encoded_df = pd.read_csv(os.path.join(run_filepath, "encoded.tsv"), sep='\t')

In [None]:
filtered_df = df.loc[~df['barcode_2'].str.startswith('unclassified')]

In [None]:
duble_filter = filtered_df.loc[filtered_df['filename'].str.startswith('FAV33791_0ddbb')]

In [None]:
duble_filter['filename'].value_counts()[:10]

In [None]:
filenames = [
    'FAV33791_0ddbb029_cd7ebfbf_12.fast5',
    'FAV33791_0ddbb029_cd7ebfbf_57.fast5',
    'FAV33791_0ddbb029_cd7ebfbf_75.fast5',
    'FAV33791_0ddbb029_cd7ebfbf_77.fast5',
    'FAV33791_0ddbb029_cd7ebfbf_78.fast5'
    ]

In [None]:
duble_filter = duble_filter.loc[duble_filter['filename'].isin(filenames)]

In [None]:
enc = encoded_df[['HW_Address', 'payload', 'ONT_Barcode']]

In [None]:
from data_functions import get_cleaned_encoded_file

In [None]:
t = get_cleaned_encoded_file(encoded_df, address=True)

In [None]:
enc

In [None]:
# `duble_filter` is a filtered slice; rebind the renamed copy instead of
# renaming in place on the slice.
duble_filter = duble_filter.rename(columns={"barcode_1": "ONT_Barcode", "barcode_2": "HW_Address"})

In [None]:
duble_filter = duble_filter.loc[~duble_filter['ONT_Barcode'].str.startswith('unclassified')]

In [None]:
# Convert the barcode labels to ints so they can be merged with the encoded
# file's 'ONT_Barcode'. Built with `.assign` because `duble_filter` is a
# filtered slice.
duble_filter = duble_filter.assign(ONT_Barcode=duble_filter['ONT_Barcode'].apply(int))

In [None]:
duble_filter

In [None]:
enc['ONT_Barcode']

In [None]:
merged_df = pd.merge(duble_filter, enc, on=['HW_Address', 'ONT_Barcode'])

In [None]:
merged_df

In [None]:
# Now load fast5s

from ont_fast5_api.fast5_interface import get_fast5_file

def get_data_from_fast5(fast5_filepath, selected_read_ids=None):
    """Read raw signals and read ids from a fast5 file.

    Args:
        fast5_filepath: Path handed to ``get_fast5_file`` (opened read-only).
        selected_read_ids: Optional iterable of read ids. When given, only
            reads whose ``read_id`` is in it are returned (an empty iterable
            therefore selects nothing). ``None`` (default) returns every read.

    Returns:
        ``(raw_data_arr, read_ids)``: two parallel lists holding, for each
        returned read, the result of ``read.get_raw_data()`` and its
        ``read_id``, in the order the file yields them.
    """
    # This parameter used to be accepted but ignored; a set gives O(1) lookups.
    wanted = None if selected_read_ids is None else set(selected_read_ids)

    raw_data_arr = []
    read_ids = []
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            if wanted is not None and read.read_id not in wanted:
                continue  # skip before loading the raw signal
            raw_data_arr.append(read.get_raw_data())
            read_ids.append(read.read_id)
    return raw_data_arr, read_ids

In [None]:
selected_read_ids = merged_df['read_id'].to_list()

# One slot per selected read, filled in once the fast5 files are loaded.
# The keys previously came from `read_ids`, a list left over from the 01-04
# section (and undefined on a fresh kernel), not from this run's reads.
squiggle_df = {i: None for i in selected_read_ids}

In [None]:
print(selected_read_ids)

In [None]:
# Load each fast5 file, report how many of its reads are among the selected
# ones, and store every read's raw signal in `squiggle_df` keyed by read id.
selected_read_id_set = set(selected_read_ids)  # built once, not per file

for file in filenames:
    raw_data_arr, read_ids_loc = get_data_from_fast5(os.path.join(run_filepath, file))
    print(len(set(read_ids_loc).intersection(selected_read_id_set)))
    print(read_ids_loc)

    # The loop variable was named `id`, which rebinds the builtin `id` for the
    # rest of the notebook session.
    for fast5_read_id, raw_data in zip(read_ids_loc, raw_data_arr):
        squiggle_df[fast5_read_id] = raw_data

In [None]:
raw_data_arr

In [None]:
squiggle_df

### 01-13-EXP2-REP3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
basepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\sequencing_runs\01-13"

In [None]:
motif_search_df = pd.read_csv(os.path.join(basepath, "full_motif_search.csv"))

In [None]:
hw_df = pd.read_csv(os.path.join(basepath, "HW_demultiplexing_summary.csv"))
ont_df = pd.read_csv(os.path.join(basepath, "ONT_demultiplexing_summary.csv"))

In [None]:
hw_df = hw_df[['barcode_arrangement', 'read_id']]

In [None]:
hw_df.rename(columns={'barcode_arrangement': "HW_Address"}, inplace=True)

In [None]:
ont_df = ont_df[['barcode_arrangement', 'read_id']]

In [None]:
ont_df.rename(columns={'barcode_arrangement': 'ONT_Barcode'}, inplace=True)

Merging demultiplexed into motif search

In [None]:
merged_df = pd.merge(ont_df, hw_df, on='read_id')

In [None]:
merged_df = pd.merge(merged_df, motif_search_df, on='read_id')

Getting rid of unclassified reads

In [None]:
merged_df = merged_df.loc[~merged_df['ONT_Barcode'].str.startswith('unclassified')]

In [None]:
merged_df = merged_df.loc[~merged_df['HW_Address'].str.startswith('unclassified')]

In [None]:
merged_df['ONT_Barcode'].value_counts()

In [None]:
encoded_csv = pd.read_csv(os.path.join(basepath, 'HELIX01-13-W1_encoded.tsv'), sep='\t')

In [None]:
from data_functions import get_cleaned_encoded_file

In [None]:
encoded_csv_ = get_cleaned_encoded_file(encoded_csv, address=True)

In [None]:
merged_df['ONT_Barcode'].unique()

In [None]:
# Turn the numeric barcode into the 'barcodeNN' label used in
# merged_df['ONT_Barcode'] so the two frames can be merged on it.
# Zero-pad to two digits: the previous `x < 9` test left barcode 9 as
# 'barcode9' instead of 'barcode09'.
encoded_csv_['ONT_Barcode'] = encoded_csv_['ONT_Barcode'].apply(lambda x: f"barcode{int(x):02d}")

In [None]:
combined_df = pd.merge(merged_df, encoded_csv_, on=['HW_Address', 'ONT_Barcode'])

In [None]:
combined_df.to_pickle(os.path.join(basepath, "address_encoded.pkl"))

In [None]:
t = combined_df.loc[combined_df['ONT_Barcode'] == 'barcode58']

In [None]:
t['HW_Address'].value_counts()

In [None]:
hw_addresses = [
    'barcode_external08_internal04',
    'barcode_external06_internal07',
    'barcode_external06_internal06',
    'barcode_external05_internal06',
    'barcode_external07_internal07'
]

In [None]:
t = t.loc[t['HW_Address'].isin(hw_addresses)]

In [None]:
t.to_csv(os.path.join(basepath, "5add_encoded.csv"))

### 01-04 splitting

In [None]:
import pandas as pd
import os

In [None]:
basepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\sequencing_runs\01-04run\finetuning_datasets"

In [None]:
df = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\sequencing_runs\01-04run\finetuning_datasets\edit_train.pkl")

In [None]:
forward_df = df.loc[df['strand'].str.startswith('+')]

In [None]:
reverse_df = df.loc[df['strand'].str.startswith('-')]

In [None]:
forward_df.to_pickle(os.path.join(basepath, 'edit_forward.pkl'))

In [None]:
reverse_df.to_pickle(os.path.join(basepath, 'edit_reverse.pkl'))

### Lower concentration EIC04

In [None]:
import pandas as pd
import numpy as np

In [None]:
ms = pd.read_csv(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\motif_search_barcoded.csv")

In [None]:
ms['ONT_Barcode'] = ms['ONT_Barcode'].apply(lambda x: int(x[-2:]))

In [None]:
ms['ONT_Barcode'].value_counts()

In [None]:
t1_barcodes = np.arange(1, 80, 4)
t2_barcodes = np.arange(2, 80, 4)
t3_barcodes = np.arange(3, 80, 4)
t4_barcodes = np.arange(4, 81, 4)

In [None]:
t2_df = ms.loc[ms['ONT_Barcode'].isin(t2_barcodes)]
t3_df = ms.loc[ms['ONT_Barcode'].isin(t3_barcodes)]
t4_df = ms.loc[ms['ONT_Barcode'].isin(t4_barcodes)]

In [None]:
selected_barcodes = [80, 76, 48, 68, 79, 75, 47, 67, 78, 74, 46, 66]

In [None]:
t2_encoded = pd.read_csv(r"C:\Users\Parv\Downloads\EIC01-01-1280-T2_encoded.tsv", sep='\t')
t3_encoded = pd.read_csv(r"C:\Users\Parv\Downloads\EIC01-01-1280-T3_encoded.tsv", sep='\t')
t4_encoded = pd.read_csv(r"C:\Users\Parv\Downloads\EIC01-01-1280-T4_encoded.tsv", sep='\t')

In [None]:
from data_functions import get_cleaned_encoded_file

In [None]:
t2_encoded = get_cleaned_encoded_file(t2_encoded, address=False)
t3_encoded = get_cleaned_encoded_file(t3_encoded, address=False)
t4_encoded = get_cleaned_encoded_file(t4_encoded, address=False)

Selected barcodes - 80, 76, 48, 68 (4)
                    79, 75, 47, 67 (3)
                    78, 74, 46, 66 (2)

Get the encoded file for each and filter out those barcodes; that's the final file. It's not a lot of reads, to be fair, maybe 10k or so.

Extract the same reads from the fastq, split them into files, and get the edit-search outputs.

Then use the df to get the squiggles, and do the same from the cluster (but use one bigger df rather than 4 of them).

In [None]:
ms = ms.loc[ms['ONT_Barcode'].isin(selected_barcodes)]

In [None]:
merged_1 = pd.merge(ms, t2_encoded, on=['HW_Address', 'ONT_Barcode'])

In [None]:
merged_2 = pd.merge(ms, t3_encoded, on=['HW_Address', 'ONT_Barcode'])

In [None]:
merged_3 = pd.merge(ms, t4_encoded, on=['HW_Address', 'ONT_Barcode'])

In [None]:
t = pd.concat([merged_1, merged_2, merged_3])

In [None]:
t.to_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\diluted_EIC04.pkl")

In [None]:
diluted = pd.read_pickle(r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\empirical\full_datasets\diluted_EIC04.pkl")