## Sequencing Experiment Design
Comparing capping and no capping for a coupling rate of 0.99.

In [1]:
from Bio import SeqIO
from seq_stat import align
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from aligned_clustering import conduct_align_clustering
from utils import get_original_strands, read_synthesized_strands_from_file
import random
import uuid

%load_ext autoreload
%autoreload 2


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [2]:
# Loading original strands and synthesized strands

original_strands_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\data\multiple_cr_post_seq_reads_badread\original_strands.txt"
synthesized_strands_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\data\multiple_cr_post_seq_reads_badread\synthesized.fasta"

# Read original strands from the file
original_strand_ids, coupling_rates, capping_flags, original_strands = get_original_strands(original_strand_filepath=original_strands_filepath)

# Read synthesised strands from file - 360,000 of these
synthesized_strands, synthesized_strand_ids = read_synthesized_strands_from_file(synthesized_strands_filepath)


In [3]:

# Creating padded file for Badread

def generate_random_bases(n_bases):

    bases = ['A', 'C', 'T', 'G']
    return "".join([random.choice(bases) for i in range(n_bases)])
    

def create_badread_data(synthesized_strands, synthesized_strand_ids, write_filename='padded_synth.fasta'):

    random.shuffle(synthesized_strands)
    synthesized_padded_dict = {}
    with open(write_filename, 'w') as f:
        for strand, base_id in zip(synthesized_strands, synthesized_strand_ids):
            
            #strand = generate_random_bases(200) + strand
            unique_id = str(uuid.uuid4())
            f.write(f">{unique_id}\n")
            f.write(strand + '\n\n')
            synthesized_padded_dict[unique_id] = strand

    return synthesized_padded_dict

In [4]:
synthesized_strands_sampled = random.sample(synthesized_strands, 1000)
synthesized_padded_dict = create_badread_data(synthesized_strands=synthesized_strands, synthesized_strand_ids=synthesized_strand_ids, write_filename='synth_unpadded.fasta')

# Need to be saving this dict - fuck me

In [5]:

# Post Badread data processing

def parse_biopython(input_fastq):
    for record in SeqIO.parse(input_fastq, 'fastq'):
        yield record
        
def postprocess_sequencing_data(fastq_filepath, original_strand_ids, original_strands, synthesized_padded_dict=None, reverse_oriented=False, filter=False):
    """
    The record description contains the strand starting, ending and orientation
    """
    sequenced_strands = []
    for i, record in tqdm(enumerate(parse_biopython(fastq_filepath))):

        strand_id = record.description.split()[1].split(',')[0]
        strand = str(record.seq)

        if reverse_oriented:
            ## Correcting orientation if it is wrong
            try:
                orientation = record.description.split()[1].split(',')[2]
                if orientation == '-strand':
                    strand = strand[::-1]
            except:
                continue

        # Aligning to the target strand if we are filtering        
        if filter:
            if strand_id in synthesized_padded_dict.keys():
                    target_strand = synthesized_padded_dict[strand_id]
            else:
                continue

            aligned, identity, indices = align(target_strand, strand)

            if identity > 0.7:
                sequenced_strands.append(strand)
        else:
            sequenced_strands.append(strand)

    return sequenced_strands


In [6]:

fastq_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\data\multiple_cr_post_seq_reads_badread\reads.fastq"
sequenced_strands = postprocess_sequencing_data(fastq_filepath=fastq_filepath, original_strand_ids=original_strand_ids, original_strands=original_strands, filter=False, reverse_oriented=True)
print(f"Number of sequenced strands = {len(sequenced_strands)}")

0it [00:00, ?it/s]

Number of sequenced strands = 152831


In [10]:

# Clustering 

def cluster_data(original_strands, sequenced_strands):
    recoveries = conduct_align_clustering(
            original_strand=original_strands,
            trimmed_seqs=sequenced_strands,
            multiple=True
        )
    return list(recoveries['recoveries'].values())

# Post Clustering Analysis

def post_process_results(recoveries_strands, capping_flags, coupling_rates):

    columns = [
    'capping',
    'coupling_rate',
    'pool_recovery'
    ]

    df = pd.DataFrame(np.array([capping_flags, coupling_rates, recoveries_strands]).T, columns=columns)
    df_capping = df.loc[df['capping'] == 'True']
    df_no_capping = df.loc[df['capping'] ==  'False']

    df_capping = df_capping.drop(['capping'], axis=1)
    df_no_capping = df_no_capping.drop(['capping'], axis=1)

    return df_capping, df_no_capping

In [8]:
sequenced_strands_sampled = random.sample(sequenced_strands, 15000)
len(sequenced_strands_sampled)

15000

In [11]:

recoveries_strands = cluster_data(original_strands=original_strands, sequenced_strands=sequenced_strands_sampled)
df_capping, df_no_capping = post_process_results(recoveries_strands=recoveries_strands, capping_flags=capping_flags, coupling_rates=coupling_rates)
print(df_capping)
print(df_no_capping)

   coupling_rate pool_recovery
0            0.9         0.345
2            0.9          0.34
4            0.9         0.335
6          0.925         0.335
8          0.925         0.335
10         0.925         0.335
12          0.95         0.335
14          0.95          0.34
16          0.95          0.33
18         0.975         0.355
20         0.975          0.35
22         0.975         0.455
24          0.99         0.595
26          0.99         0.405
28          0.99         0.365
   coupling_rate pool_recovery
1            0.9         0.335
3            0.9         0.355
5            0.9          0.35
7          0.925         0.415
9          0.925         0.335
11         0.925         0.325
13          0.95          0.33
15          0.95          0.34
17          0.95         0.415
19         0.975         0.995
21         0.975          0.92
23         0.975          0.93
25          0.99           1.0
27          0.99         0.995
29          0.99         0.675


In [None]:
"""
1. Generate new file with new ids
2. Collect Badread sequencing run
3. Sampling statistics to understand how reads are being generated
4. Clean orientation and make a proper strand pool to cluster
5. Complete Experiment with growing repeats
"""