In [41]:
import random
import numpy as np
import pandas as pd
from itertools import combinations
from tqdm.notebook import tqdm
import uuid
import matplotlib.pyplot as plt
import json

In [46]:
def create_random_strand(len_strand):
    choices = ['A', 'C', 'T', 'G']
    return "".join([random.choice(choices) for i in range(len_strand)])

def create_spacer_sequence(cycles):
    """ Create motif level label for the model"""

    spacer_sequence = []

    cycle_number = 9
        
    for i in cycles:
        for j in i:
            spacer_sequence.append(cycle_number)
            spacer_sequence.append(j)
            spacer_sequence.append(cycle_number)
        cycle_number += 1

    return spacer_sequence

def reverse_complement(dna: str) -> str:
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(dna))

In [3]:

# First let us see if squigulator lets us seperate out into cycle reads, cause that will make life much simpler

n_motifs_per_read = 32
n_unique_motifs = 8
n_spacer_motifs = 8
motifs_per_payload = 4
len_motif = 20
len_link = 10
n_cycles = 8

motif_choices = [create_random_strand(len_motif) for i in range(n_unique_motifs)]
link_choices = [create_random_strand(len_link) for i in range(n_spacer_motifs)]
motif_indices = [1, 2, 3, 4, 5, 6, 7, 8]

In [4]:
n_reads = 20000
reads_base_level = []
reads_motif_level = []
motif_labels = []
motif_picks = list(combinations(motif_indices, 4)) # Storing all possible combinations to draw from

for read in tqdm(range(n_reads)):
    read_base_level = ""
    read_motif_level = []
    for cycle_position in range(n_cycles):
        payload_motifs = random.choice(motif_picks)
        payload_read_base_level = "".join([f"{link_choices[cycle_position]}{motif_choices[i-1]}{link_choices[cycle_position]}" for i in payload_motifs])
        read_base_level += payload_read_base_level
        read_motif_level.append(payload_motifs)
    reads_base_level.append(read_base_level)
    reads_motif_level.append(read_motif_level)
    motif_label = create_spacer_sequence(read_motif_level)
    motif_labels.append(motif_label)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [8]:
ids = [str(uuid.uuid4()) for i in range(n_reads)]

In [37]:
experiment_dict = {
    "motif_choices": motif_choices, 
    "link_choices": link_choices,
    "motif_labels": motif_labels,
    "read_ids": ids
}

# write it as a json file to the data path to make sure the run information is validated

In [9]:
# writing to fasta file

base_filepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\squigulator\split.fa"

with open(base_filepath, "w") as f:
    for ind in range(len(ids)):
        f.write(f">>{ids[ind]}\n")
        f.write(reads_base_level[ind] + "\n\n")

In [None]:
base_filepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\squigulator\split.fa.txt"
with open(base_filepath, 'r') as f:
    fasta_seq = f.readlines()[1]


### Loading fast5 data after generating via Squigulator

In [11]:

from ont_fast5_api.fast5_interface import get_fast5_file

def get_data_from_fast5(fast5_filepath):
    raw_data_arr = []
    read_ids = []
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            raw_data = read.get_raw_data()
            raw_data_arr.append(raw_data)
            read_ids.append(read.read_id)
    return raw_data_arr, read_ids


In [12]:
fast5_filepath = r"C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\synthetic\fast5_post_squigulator\output.fast5"
raw_data_arr, read_ids = get_data_from_fast5(fast5_filepath)

In [14]:
print(ids)

['b6b3dd56-85b9-45d9-aeae-1b8c4bd025be', 'ba8a7f5f-6e36-475d-a49f-47f2d0fa8df4', 'e988831d-1f03-49c9-b487-d624a2eb56c7', '014c8614-e885-44ae-aba1-fdbe54ff8433', '9930329e-78d1-491d-806b-e547d709618b', 'c1bccc77-be81-4ddc-848f-9891a0e78b17', '095d59b7-732c-40f2-9334-f9d6a9f89ba3', 'd1a8b4bb-4835-4e84-896d-e2808f76d8d6', 'cc306bc6-7a2d-4f5e-8b08-9f2d1c1e4684', '083a4382-d95c-453f-ac51-40f11bf13dff', '548151ef-3e81-45f4-9781-025acec3299d', '89bb8d0d-46d7-49b5-8f8c-0fb8e01114a2', 'e439e3cc-5e3b-42cd-b262-a4753b630ff0', 'cbd5065d-5c74-4684-a05d-8c03572a66a6', '9c719cc9-9664-47f1-b168-82aadca51630', '037e84a0-4bdf-45e9-8f58-16ff2aabb829', '3566efe8-3e9d-472c-892e-7ccd646e0d5a', 'f3bee769-cad2-4b9f-af6c-3c0e517cb2e3', 'f553bdc7-ab89-4da9-a1ad-ee0b8e11b3d0', '4628708a-c972-43e3-b3fc-1f6711211ad1', '12ccba3a-a30c-4f71-90c1-eb209e8674fa', 'dc2c902f-ed3a-4153-9d0b-ed123a490342', 'dabfe339-1be8-4bf0-b794-906840f4df0e', '5c85a387-710b-4817-98d3-2d833b08cb03', 'a0235cf5-60f2-4719-a0e3-5c51fbdb88e1',

In [73]:

def add_remainder_motif(ptr):

    # Adding the motif if majority of it is present
    # Hardcoded, spacer is 10 and payload is 20 S - P - S
    if ptr < 21 and ptr > 5:
        return 1
    elif ptr < 36  and ptr >= 21:
        return 2
    elif ptr < 40 and ptr >= 36:
        return 3
    return 0

def get_motif_sequence(starting_pos, ending_pos, motif_read):

    # Mod by 40 to get paylaod number
    starting_index = int(starting_pos / 40)
    ptr = starting_pos
    starting_index *= 3

    ptr %= 40

    starting_index += add_remainder_motif(ptr)

    # Mod by 40 to get paylaod number
    ending_index = int(ending_pos / 40)
    ptr = ending_pos
    ending_index *= 3

    ptr %= 40

    ending_index += add_remainder_motif(ptr)

    print(starting_index, starting_pos)
    print(ending_index, ending_pos)
    print()

    return motif_read[starting_index: ending_index]


In [82]:
from sklearn.preprocessing import normalize

In [94]:

base_sequences_dataset = []
motif_sequences_dataset = []
raw_data_arrs = []

for ind, read_id in tqdm(enumerate(read_ids)):
    split_id = read_id.split('!') # To get the starting and ending character positions for the specific read id
    uid = split_id[1][1:]
    reverse_complemented = split_id[4] == '-'
    starting_pos, ending_pos = int(split_id[2]), int(split_id[3])
    index = ids.index(uid)
    base_sequence_dataset = reads_base_level[index][starting_pos: ending_pos]
    motif_sequence_dataset = get_motif_sequence(starting_pos, ending_pos, motif_labels[index])
    
    
    if reverse_complemented:
        continue
    
    base_sequences_dataset.append(base_sequence_dataset)
    motif_sequences_dataset.append(motif_sequence_dataset)
    raw_data_arrs.append(raw_data_arr[ind])


0it [00:00, ?it/s]

28 378
96 1280

68 911
96 1280

17 231
96 1280

36 477
96 1280

35 475
96 1280

0 5
96 1280

65 866
96 1280

70 932
96 1280

22 289
96 1280

29 382
96 1280

36 480
96 1280

52 687
96 1280

67 894
96 1280

31 414
96 1280

43 579
96 1280

20 268
96 1280

77 1027
96 1280

54 717
96 1280

48 636
96 1280

28 376
96 1280

38 514
96 1280

65 869
96 1280

49 657
96 1280

3 38
96 1280

71 941
96 1280

21 277
96 1280

24 319
96 1280

80 1068
96 1280

62 834
96 1280

42 561
96 1280

74 995
96 1280

13 172
96 1280

55 732
96 1280

57 757
96 1280

14 182
96 1280

78 1039
96 1280

15 198
96 1280

19 251
96 1280

4 48
96 1280

51 676
96 1280

28 369
96 1280

31 416
96 1280

20 265
96 1280

68 911
96 1280

67 900
96 1280

45 602
96 1280

76 1011
96 1280

1 19
42 557

66 877
96 1280

43 572
96 1280

17 221
96 1280

19 251
96 1280

5 71
96 1280

4 57
96 1280

81 1079
96 1280

25 339
96 1280

78 1036
96 1280

70 931
96 1280

69 916
96 1280

5 69
96 1280

31 412
96 1280

47 633
96 1280

76 1008
96 1280

2

In [95]:
df = pd.DataFrame()

In [96]:
df['motif_seq'] = motif_sequences_dataset
df['base_seq'] = base_sequences_dataset
df['squiggle'] = raw_data_arrs

In [98]:
df.to_pickle(r'C:\Users\Parv\Doc\HelixWorks\Basecalling\code\motifcaller\data\synthetic\pickled_datasets\25_2_25.pkl')