### Cyclic Redundancy Check Code (CRC32)
Let's add and check for simple strands and then run an experiment through Badread

In [36]:
import zlib
import random
from typing import List
import uuid

In [37]:
base_mapping = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}
int_mapping = {'00': 'A', '01': 'C', "10": 'G', "11": 'T'}

In [38]:

def convert_dna_to_byte_string(strand):
    binary_str = "".join([base_mapping[i] for i in strand])

    padding = (8 - len(binary_str) % 8) % 8
    binary_str = binary_str + "0" * padding  # pad with zeros at the end
    
    byte_string = int(binary_str, 2).to_bytes(
        len(binary_str) // 8, byteorder="big")

    return byte_string

In [39]:
def generate_random_strand(strand_length: int):
    return "".join([random.choice(
        ['A', 'C', 'T', 'G']) for i in range(strand_length)])

In [40]:
byte_string = convert_dna_to_byte_string(generate_random_strand(300))

In [41]:
crc_code = zlib.crc32(byte_string)

In [42]:
def convert_integer_to_dna(num):
    binary_str = bin(num)[2:].zfill(32)
    if len(binary_str) % 2 != 0:
        binary_str = "0" + binary_str
    return "".join(int_mapping[binary_str[i:i+2]] for i in range(0, len(binary_str), 2))


In [43]:
# Generate random strand
# Convert to byte string
# Get CRC32
# Convert CRC32 to DNA
# Append to end of DNA

strand_length = 300


strand = generate_random_strand(strand_length)

def get_crc_strand(strand):
    crc = zlib.crc32(convert_dna_to_byte_string(strand))
    dna_crc = convert_integer_to_dna(crc)
    final_strand = strand + dna_crc
    return final_strand

In [44]:
def create_fasta_file(ids: List[str], strands: List[str], output_filepath: str):
    with open(output_filepath, 'w') as f:
        for i, strand in enumerate(strands):
            f.write(f">{ids[i]}\n")
            f.write(strand + '\n\n')

    print(f"File saved as {output_filepath}")

In [45]:
strands = [get_crc_strand(generate_random_strand(strand_length=200)) for i in range(100)]

In [46]:
ids = [str(uuid.uuid4()) for i in range(len(strands))]

In [47]:
create_fasta_file(ids, strands, 'data/crc_strands.fasta')

File saved as data/crc_strands.fasta


### Clustering

In [14]:
from utils import read_synthesized_strands_from_file

In [50]:
original_strands = read_synthesized_strands_from_file('data/crc_strands.fasta')[0]

In [51]:
len(original_strands)

100

In [54]:
import Levenshtein

In [55]:
from clustering import Clustering

In [56]:
from utils import get_fastq_records

In [77]:
records = get_fastq_records('data\crc_small.fastq')

2269it [00:00, 43025.99it/s]


In [78]:
strands = [str(i.seq) for i in records]
ids = [i.id for i in records]

In [117]:
clustering_obj = Clustering(strand_pool=strands, reference_length=216, n_reference_strands=100, distance_threshold=20)

In [118]:
clustering_obj.run_pipeline(fix_orientation=True)

Clustering strands
Total strands 2269


100%|██████████| 2269/2269 [00:53<00:00, 42.76it/s]


Number of clusters = 230
Clusters are sorted
Orientation fixed in the strand pool
Generating 200 candidates


 23%|██▎       | 46/200 [00:07<00:25,  6.03it/s]


KeyboardInterrupt: 

In [110]:
from strand_reconstruction import make_prediction
from tqdm import tqdm
from utils import reverse_complement

In [109]:
def validate_crc(strand, info_length=200):
    return get_crc_strand(strand[:info_length]) == strand

In [120]:

def generate_candidates_crc_validated(
        clustered_seqs, n_clusters=200, n_attempts=3,
        strand_length=200, ma_sample_size=10):
    
    validated_strands = []
    for ind, i in enumerate(clustered_seqs[:n_clusters]):  # Iterate through clusters
        # Can be RC remember!
        for k in range(3): # Repeat n_attempts time
            candidate = make_prediction(i, sample_size=ma_sample_size)  # Make candidate prediction
            rev = reverse_complement(candidate)  # Obtain RC vector
            if validate_crc(candidate):  # Validate CRC code for forward and rc prediction
                validated_strands.append(candidate)
                break
            elif validate_crc(rev):
                validated_strands.append(rev)
                break
            else:
                continue
    
    validated_strands = list(set(validated_strands))
    print(f"{len(validated_strands)} valid strands found")

    return validated_strands

In [121]:
validated_strands = generate_candidates_crc_validated(clustering_obj.clustered_seqs)

100 valid strands found


In [122]:
len(set(validated_strands))

100

In [123]:
len(set(validated_strands).intersection(original_strands))

100

In [84]:
original_strands_no_crc = [i[:200] for i in original_strands]

In [85]:
candidates = clustering_obj.candidates

In [86]:
validated_strands = []

In [92]:
for i in candidates:
    if get_crc_strand(i[:200]) == i:
        validated_strands.append(i[:200])

In [90]:
len(set(validated_strands))

44

In [None]:
valid

In [94]:
len(set(original_strands_no_crc).intersection(validated_strands))

44

In [70]:
clustering_obj.clustered_seqs[0]

['TCAGTTACGTATTGCTAAAGATCGCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTCATAATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAAATTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTACCACATTCTTCAGACATCGCCTTTATTTACCAA',
 'TACTTTGTTCAGTTACGTATTGCTAAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTACATATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAATTTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTGCCGCATTCTTCAGGACATCGCCTTTATTTACCAAGC',
 'AAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTACATATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAAATTTTAAGCCGTAACCGCCAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTATCACATTCTTTAGGACATCGCCTTTATTTACCAA',
 'AAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTTGCCCAAGGTGCGGTTCTCGCGTACCATATCACGTGCCAATTTCCTCCGGACATTGAAAATTTAAATTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTACCATATTCTTCAGGACATCGCCTTTATTTACCAAAGCAATGC',
 'TATTGCTAAAGATCGTCCAGCCCATTTGGAGTCGGGTTGCTACTCCAACGGGATGGTATGGCCC

In [69]:
clustering_obj.generate_candidates(n_candidates=10, fix_orientation=True)

100%|██████████| 10/10 [00:01<00:00,  5.01it/s]

Fixing candidate orientations
0.0 candidates are reversed





['TTACGTATTGCTAAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTACATATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAAATTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTACCACATTCTTCAGGACATCGCCTTTATTTACCAA',
 'AGTTACGTATTGCTTTGCTTATGGATGTTGAGGTCCTTGCGAATATCCACTCTAAAGTTGTTCTCCCCCAATCGCGGAATCTCAAAAACTAGTTGGCCATGCTGAGCCATACCTCACCTACATAATAGGTGACACGTTATCGAACCAGACCCACACTTCACTCATCATTCGTATTAGTGGCACCTTTTTAAGGCGGAGAGACGTAACCCAGCGTTGCCTACGTCACTAAGAGCAATAA',
 'TCTTCAGTTACGTATTGCTATACTTTCATCTCATTAACCGATGTCGCATAGATAGATGAGCTCTGAGAGTGATCGTCAGGTCAATATCGCGGTGGGATGGAGGTTGCATATATATTTAAGGCAAAGCAGATGCACCCCCTGACAAGACTTCAATATAAGGCGATTCTGGCTTTACGAACGCACCCTTTGTTAACACTCAAGTCGGTACATCTAGAGCGGTCCATCCACGATTAAAGCAA',
 'TTCGTTCAGTTACGTATTGCTGGAGCTGTGCTTCTGCTGGCTTCTCATTCTTAGGCCGCTGTCTTGCGCGGAAATGCTGCCATTGAACGGCGAACTCTAGGGTAACAATACTTCGCTGAAGGCCGTGGGGCGCGATTCGGGACTGAAAAGTGCCTACGTCTTCAGAGTCCATCGTTTAATCGATGAAGGCAAGAGAGGTCAACCGGCATTAAAACCGAAGTTCATGCGATGATCTCGGCAATACGTG',
 'GTTCAGTTACGTATTGCTGGACTTCC

In [66]:
clustering_obj.reversed_markers

array([False, False, False, ...,  True, False, False], shape=(2179,))

In [63]:
clustering_obj.clustered_seqs[0]

['TCAGTTACGTATTGCTAAAGATCGCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTCATAATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAAATTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTACCACATTCTTCAGACATCGCCTTTATTTACCAA',
 'TACTTTGTTCAGTTACGTATTGCTAAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTACATATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAATTTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTGCCGCATTCTTCAGGACATCGCCTTTATTTACCAAGC',
 'AAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTGCCCAAGGTGCGGTTCTCGCGTACATATCACGTGCCAATTTCCTCCGGACATCCAAAATTTAAATTTTAAGCCGTAACCGCCAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTATCACATTCTTTAGGACATCGCCTTTATTTACCAA',
 'AAAGATCGTCAGCCCATTTGGAGTCGGGTTGTACTCCAACGGGATGGTACTTGCCCAAGGTGCGGTTCTCGCGTACCATATCACGTGCCAATTTCCTCCGGACATTGAAAATTTAAATTTTAAGCCGTAACTGCGAGAGCTTTGACGTCGCGCCAGGCCAGCAACGTCATCCTAATCTTACCTACCATATTCTTCAGGACATCGCCTTTATTTACCAAAGCAATGC',
 'TATTGCTAAAGATCGTCCAGCCCATTTGGAGTCGGGTTGCTACTCCAACGGGATGGTATGGCCC

In [62]:
len(clustering_obj.candidates[0])

243

In [None]:
clustering_obj.clustered_seqs

[['ATGTACTTCGTTCAGTTACGTATTGCTAGGGCAGTTGCACTACATCACAGGTCCGCGCATCGCAATAATTGTACAAGAGCTGGTTCTCGTCGCGACATCC',
  'ATGTACTTCGTTTCAGTTACGTATTGCTAGGAATGTTCTTTCGTCTCTAATGCAGGTCCGTTCGAGCCCGACAAGTTAGCATCAGAGCAATTTAATCACT',
  'TGTACTTCGTTCAGTTACGTATTGCTGGGGAAGCCAGCACGACATTTTGCATTTGTCACGAATTGTGGTATGGGTATATTGCTTGTCCAGCTCGACCCGC',
  'GTACTTCGTTCAGTTACGTATTGCTAGTGGTCTGATAATGCGCTGCCTCATCTGCGAATAAAGAAGTAGGGAACAATGTATCAGGTTCAAACACACATAC',
  'ATGTTACTTCGTTCAGTTACGTATTGCTAAAGGTCTTCGCCTCAAGGGTAGGTTCAGGGCGCTTCACGCAAGTCACTGGCGTTTGTCGGGTGTACATTGA',
  'TGTACTTCGTTCATTTACGTATTGCTGTCATATTGAATACTAAGTTTACGGGTTATCTCATTCATGACAGCGGCACACATAGTTATTTATCGACCGTTGC',
  'TACTTCGTTCAGATACGTATTGCTGAGGTAGTAATGCCACTCCCTGGTAACGCAACGTACAGCAGAGTGTAATAGAGTTAAGGTTCTGGCGACACGAGTG',
  'ATGTACTTCGTTCAGTTACGTATTGCTGCGTTGACTAGCTAACAGTTAGGTAGAATGCGTACAACGATTAGAAAACCATTTCCAGGCCAGACCACGAGTA',
  'ATGTTACTTCGTTCAGTTACGTATTGCTCGGGCAATTCGAGTGGAATGCCAAGTCTGTATAATGGATGAATAAGCACTTGTAGGCCCCGAAATCTCTGAA',
  'TACTTCGTTCAGTTACGTATTGCTGGCAGATGCAGGCAGACCC

In [None]:
clust

In [33]:
clustering_obj.clusters

AttributeError: 'Clustering' object has no attribute 'clusters'