## Sequencing Experiment Design
Compares strand recovery with capping versus without capping at a coupling rate of 0.99.

In [8]:
from Bio import SeqIO
from seq_stat import align
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from aligned_clustering import conduct_align_clustering
from utils import get_original_strands, read_synthesized_strands_from_file
import random
import uuid

In [5]:
# Load the inputs of this run: the original strands and the strands
# synthesized from them.

original_strands_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-10 09.39.03.129913\original_strands.txt"
synthesized_strands_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-10 09.39.03.129913\synthesized.fasta"

# Original strands, together with their ids, coupling rates and capping flags
(
    original_strand_ids,
    coupling_rates,
    capping_flags,
    original_strands,
) = get_original_strands(original_strand_filepath=original_strands_filepath)

# Synthesized strands and their ids (360,000 strands in this file)
(
    synthesized_strands,
    synthesized_strand_ids,
) = read_synthesized_strands_from_file(synthesized_strands_filepath)


In [24]:

# Creating padded file for Badread

def generate_random_bases(n_bases):
    """Return a random DNA string of length ``n_bases``.

    Each base is drawn independently and uniformly from A, C, T and G
    using the module-level ``random`` generator.
    """
    alphabet = "ACTG"
    return "".join(random.choice(alphabet) for _ in range(n_bases))
    

def create_badread_data(synthesized_strands, synthesized_strand_ids, write_filename='padded_synth.fasta', n_padding_bases=200):
    """Write the synthesized strands to a padded FASTA file for Badread.

    Every strand is prefixed with ``n_padding_bases`` random bases and
    written as one FASTA record whose header is ``>{base_id}, {unique_id}``,
    where ``base_id`` is the strand's id from ``synthesized_strand_ids`` and
    ``unique_id`` is a freshly generated UUID4. Records are written in
    random order and each record is followed by a blank line.

    Args:
        synthesized_strands: Strand sequences (strings).
        synthesized_strand_ids: Id of each strand, in the same order as
            ``synthesized_strands``.
        write_filename: Path of the FASTA file to create (overwritten if it
            already exists).
        n_padding_bases: Number of random bases prepended to every strand.
    """
    # Shuffle strands and ids *together* so every strand keeps its own id.
    # Shuffling only the strand list would attach each strand to an
    # arbitrary id. Working on a copy also leaves the caller's lists intact.
    records = list(zip(synthesized_strands, synthesized_strand_ids))
    random.shuffle(records)

    with open(write_filename, 'w') as f:
        for strand, base_id in records:
            padded_strand = generate_random_bases(n_padding_bases) + strand
            unique_id = str(uuid.uuid4())
            # The strand id must come first in the header: it is what the
            # post-processing step later extracts from the read description.
            f.write(f">{base_id}, {unique_id}\n")
            f.write(padded_strand + '\n\n')

In [25]:
# Write the padded FASTA file (default filename) used as Badread input
create_badread_data(
    synthesized_strands=synthesized_strands,
    synthesized_strand_ids=synthesized_strand_ids,
)

In [26]:

# Post Badread data processing

def parse_biopython(input_fastq):
    """Lazily yield the records of a FASTQ file, parsed with Biopython's SeqIO."""
    yield from SeqIO.parse(input_fastq, 'fastq')
        
def postprocess_sequencing_data(fastq_filepath, original_strand_ids, original_strands):
    """Collect the sequenced reads that belong to a known original strand.

    The strand id of a read is taken from its record description: the
    second whitespace-separated field, up to its first comma. (The record
    description also contains the strand start, end and orientation.)
    Reads whose strand id is not in ``original_strand_ids`` are dropped.

    Args:
        fastq_filepath: Path of the FASTQ file holding the sequenced reads.
        original_strand_ids: Ids of the original strands.
        original_strands: Original strand sequences. Currently unused; kept
            so existing callers keep working.

    Returns:
        List of the retained read sequences as strings, in file order.
    """
    # Build the lookup once: a set gives O(1) membership tests instead of
    # scanning the id list for every read.
    known_ids = set(original_strand_ids)

    sequenced_strands = []
    for record in tqdm(parse_biopython(fastq_filepath)):
        strand_id = record.description.split()[1].split(',')[0]
        if strand_id not in known_ids:
            continue
        # NOTE: an alignment-identity filter against the original strand
        # (keep reads with identity > 0.7) is currently disabled; every
        # read with a known strand id is kept unmodified.
        sequenced_strands.append(str(record.seq))

    return sequenced_strands


In [28]:
# Post-process the sequenced reads, keeping those that map to a known strand
fastq_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\reads.fastq\reads.fastq"
sequenced_strands = postprocess_sequencing_data(
    fastq_filepath=fastq_filepath,
    original_strand_ids=original_strand_ids,
    original_strands=original_strands,
)

0it [00:00, ?it/s]

In [39]:

# Clustering 

def cluster_data(original_strands, sequenced_strands):
    """Cluster the sequenced reads against the original strands.

    Runs ``conduct_align_clustering`` in multi-strand mode with display
    turned off and returns the values of its ``'recoveries'`` mapping as a
    list.
    """
    clustering_result = conduct_align_clustering(
        original_strand=original_strands,
        trimmed_seqs=sequenced_strands,
        display=False,
        multiple=True,
    )
    recovery_by_strand = clustering_result['recoveries']
    return list(recovery_by_strand.values())

# Post Clustering Analysis

def post_process_results(recoveries_strands, capping_flags, coupling_rates):
    """Split per-strand recoveries into capping and no-capping tables.

    Args:
        recoveries_strands: Recovery value of each strand.
        capping_flags: Capping flag of each strand, either booleans or the
            strings ``'True'`` / ``'False'``.
        coupling_rates: Coupling rate of each strand.

    Returns:
        Tuple ``(df_capping, df_no_capping)`` of DataFrames with the columns
        ``coupling_rate`` and ``pool_recovery``. Rows keep their position in
        the input as index. Strands whose flag is neither true nor false
        appear in neither table.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # Normalise flags to text so booleans and 'True'/'False' strings are
    # handled the same way.
    flags = [str(flag) for flag in capping_flags]
    coupling = list(coupling_rates)
    recoveries = list(recoveries_strands)

    if not (len(flags) == len(coupling) == len(recoveries)):
        raise ValueError(
            "recoveries_strands, capping_flags and coupling_rates must have "
            f"the same length (got {len(recoveries)}, {len(flags)}, {len(coupling)})"
        )

    # Build the frame column by column so each column keeps its own dtype.
    # Stacking everything into a single array would coerce the numeric
    # columns to strings whenever the flags are strings.
    df = pd.DataFrame({
        'coupling_rate': coupling,
        'pool_recovery': recoveries,
    })

    is_capped = np.array([flag == 'True' for flag in flags], dtype=bool)
    is_uncapped = np.array([flag == 'False' for flag in flags], dtype=bool)

    df_capping = df.loc[is_capped].copy()
    df_no_capping = df.loc[is_uncapped].copy()

    return df_capping, df_no_capping

In [36]:
# Cluster a random subset of at most 1000 reads and compare the recoveries
# with and without capping. The sample size is capped at the number of
# available reads because random.sample raises ValueError when asked for
# more items than the population holds.
sampled_sequenced_strands = random.sample(sequenced_strands, min(1000, len(sequenced_strands)))
recoveries_strands = cluster_data(original_strands=original_strands, sequenced_strands=sampled_sequenced_strands)
df_capping, df_no_capping = post_process_results(recoveries_strands=recoveries_strands, capping_flags=capping_flags, coupling_rates=coupling_rates)
print(df_capping)
print(df_no_capping)

NameError: name 'post_process_results' is not defined