## Aligning experimental reads from the sequencing runs

Goal: check how well my alignment pipeline recovers the original strand from the sequencing reads, so that I can understand which types of errors are present.

In [16]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project's .py files are
# picked up without restarting the kernel.
# NOTE(review): this cell's execution count (16) is out of sequence with the
# cells below — re-run the notebook top to bottom before trusting the outputs.
%load_ext autoreload
%autoreload 2

In [1]:

import os

# Root directory that holds one sub-folder per sequencing run.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\sequencingData"

# Sequencing runs available under `data_path`. Only one run is analysed at a
# time; the selection is made explicitly below rather than by repeatedly
# overwriting `folder_path`.
run_folder_names = [
    '300824_ran0_NC_PhuRV140824',
    r"IDT_source_150524_4thTry_75ntV2_sample5x_reag2x",
    r"v2_041024_negCtrl_ss75v2_200fmol",
]

# Run analysed in this notebook (the last entry of the list above).
selected_run_folder = run_folder_names[-1]

folder_path = os.path.join(data_path, selected_run_folder)

In [2]:

# Split the contents of the run folder into the FASTQ read files and the
# README that describes the run.
fastq_file_paths = []
readme_file_path = ""

# os.listdir returns entries in arbitrary order; iterating in sorted order
# makes the FASTQ list (and which non-FASTQ entry is kept as the README)
# identical on every run and every platform.
for file_name in sorted(os.listdir(folder_path)):

    if file_name.endswith('fastq'):
        fastq_file_paths.append(os.path.join(folder_path, file_name))
    else:
        # Every entry that is not a FASTQ file is treated as the README; if
        # the folder holds several, the last one in sorted order is kept.
        readme_file_path = os.path.join(folder_path, file_name)

In [3]:
fastq_file_paths

['C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_0.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_1.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_2.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_3.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_4.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass_11481c67_b3ddb0cc_5.fastq',
 'C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\sequencingData\\v2_041024_negCtrl_ss75v2_200fmol\\AWB850_pass

In [4]:

def extract_reference_from_readme(readme_file_path, sequence_line_index=4, verbose=True):
    """Read the reference (original) strand from a run's README file.

    Parameters
    ----------
    readme_file_path : str
        Path of the README text file.
    sequence_line_index : int, optional
        Zero-based index of the line that holds the sequence. Defaults to 4,
        i.e. the fifth line of the file.
    verbose : bool, optional
        When True (the default), print the list of all README lines and then
        the selected line, which is useful for eyeballing the file layout.

    Returns
    -------
    str
        The selected line with surrounding whitespace (including the trailing
        newline) stripped.

    Raises
    ------
    IndexError
        If the file has no line at ``sequence_line_index``.
    """
    with open(readme_file_path, 'r') as f:
        readme_lines = f.readlines()

    if verbose:
        print(readme_lines)

    try:
        original_strand = readme_lines[sequence_line_index]
    except IndexError:
        # Same exception type as plain list indexing, but say which file is
        # too short instead of the bare "list index out of range".
        raise IndexError(
            f"README '{readme_file_path}' has {len(readme_lines)} line(s); "
            f"no line at index {sequence_line_index}"
        ) from None

    if verbose:
        print(original_strand)

    return original_strand.strip()

In [5]:
from utils import read_strands_from_file_alt

def get_sequenced_strands_from_fastqs(fastq_file_paths):
    """Pool the reads of several FASTQ files into one list.

    Each path is handed to ``read_strands_from_file_alt`` and whatever it
    returns is appended, in file order, to the result.

    Reading is best-effort: if a file raises, the failure is printed together
    with the path of the offending file and that file is skipped, so one bad
    file does not discard the reads of the others.

    Parameters
    ----------
    fastq_file_paths : iterable of str
        Paths of the FASTQ files to read.

    Returns
    -------
    list
        The reads of every file that could be read; empty if none could.
    """
    sequenced_strands = []

    for fastq_file_path in fastq_file_paths:

        try:
            sequenced_strands_local = read_strands_from_file_alt(fastq_file_path)
            sequenced_strands.extend(sequenced_strands_local)
        except Exception as e:
            # Name the file and the exception type: the bare error text alone
            # does not say which of the many FASTQ files was skipped.
            print(f"Skipping {fastq_file_path}: {type(e).__name__}: {e}")

    return sequenced_strands
    

In [6]:
# Reference strand, taken from the README found in the run folder.
original_strand = extract_reference_from_readme(readme_file_path=readme_file_path)

# Reads from all FASTQ files of the run, pooled into a single list.
sequenced_strands = get_sequenced_strands_from_fastqs(fastq_file_paths=fastq_file_paths)

['041024_negCtrl_ss75v2_200fmol\n', 'file:041024_negCtrl_ss75v2_200fmol\n', 'Underlying sequence:\n', '>75ntV2\n', 'AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG\n', 'Aligning sequence: CACTTTGGCGTGACGG (reverse complementary)\n', 'Source of the DNA:\n', 'Kilobaser, This sample comes from Kilobaser_sample of 75nt\n', 'Quantity of DNA: 200 fmol single-stranded\n', '\n', 'This corresponds to the folder named 041024_negCtrl_ss75v2_200fmol except the source DNA is from Kilobaser\n', 'This files containing the reads that come from the nanopore output using 200 fmol of DNA\n']
AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG



In [7]:

from sequencing import NaiveSequencingModel

def get_consensus_strand(original_strand, sequenced_strands, alignment=False):
    """Run ``NaiveSequencingModel.consensus_decoding`` on a set of reads.

    A model is created with ``strand_length`` set to the length of
    ``original_strand``. The reads, the reference strand and the ``alignment``
    flag are forwarded unchanged to ``consensus_decoding`` and its result is
    returned as-is.
    """
    model = NaiveSequencingModel(strand_length=len(original_strand))
    return model.consensus_decoding(
        sequenced_strands=sequenced_strands,
        original_strand=original_strand,
        alignment=alignment,
    )


# Consensus over every read of the run, using the default alignment=False.
consensus_strand = get_consensus_strand(original_strand=original_strand, sequenced_strands=sequenced_strands)

In [21]:
original_strand

'AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG'

In [None]:
from decoding import weighted_aligned_cluster
from utils import get_recovery_percentage

# Decode from the first 100 reads only, passing the reference length and
# asking for 3 clusters.
# NOTE(review): 100 and 3 look like exploratory values — confirm before
# relying on the result.
decoded_strand = weighted_aligned_cluster(sequenced_strands[:100], len(original_strand), n_clusters=3)
# Last expression of the cell, so its value is what the cell displays.
get_recovery_percentage(decoded_strand, original_strand)

[0 0 0 0 0 2 0 0 0 2 2 0 2 2 0 2 0 2 2 2 0 2 0 2 2 0 0 2 0 0 2 0 0 0 0 0 0
 0 2 2 2 0 0 0 0 0 0 2 2 0 0 2 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 2 2 0 0 0
 0 2 0 0 0 2 1 0 2 0 2 0 2 0 2 0 1 0 2 0 2 0 0 0 0 0]
What is going on?
[['TGTGTAGGTCTACTTGGTTCAGTTCGTATTTGCTACCATGACCACCGTGATGGTGGCGAGGCTTTCTTATGGTAGCGCAGGCGGTGGCGGCGGTCCGTGTAGGTGCCAGCGATTGCCTCGCCATTTGGTCGTCTTCCACCAGCCGTGGTCGCGCCAGTGACCTCCATCCCAGTAATGGCCACAGTATCGGAATCGCCTCTGTAATTTTACTGACGGGACTAACCTAATTTCCGCAGCCTCTGCTGCCATGGGGACGAGTTAGGAAGAGCGGGGAAAGTGCGAGTACGATAAGTACATCTTTGGCGGGACCTCCTTTTCTACCTCATTACCATTATCGGTGGATCTCCGGAGATTCTCGATGAGAACAGCAAG', 'TGTCCATCTATTTCGTTCAGTTACGTATTGTTCTGAGAGCCCGGGGCAATTGTCGCTTGGCGGAGATACCGTCATAAGTGCCGGGTTCGTTGGCGATCAGATGCAGGCGAGTCTCATACCGGCCATGGCATAATCTGGCTACCCAGACGGAATGAAGGAGTTCATCGCGGAGTTGTGGAGGTCACTTTGAAGTACACCGGAGTGTTCGCCGGGAAAGCGATTTCATTGGTAGCAATGCCCTGTTCCGGTAAGTGAAGAACCATTTCCAGTCCATGGAAACCACTTCGATGGTAATGGGCTTCTCGTCGTGTGCGAGCAGCTTGCTAGGCTCAAGAGCGTAGAGTGGTTTTAGGTACGTCTTCTGCAAGGAAGATGATGATTAAGGATGGGTACCA', '

In [None]:
from decoding import aligned_consensus

# Decode from the first 10 reads only, passing the reference length as strand_length.
decoded_strand = aligned_consensus(sequenced_strands[:10], strand_length=len(original_strand))
# get_recovery_percentage is not imported in this cell; it relies on the
# import performed in another cell having been run first.
get_recovery_percentage(decoded_strand, original_strand)

In [8]:
consensus_strand

0.2

In [9]:
original_strand

'AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG'

In [10]:

from utils import get_recovery_percentage

def evaluate_consensus_strand(consensus_strand, original_strand):
    """Score a consensus strand against the reference strand.

    Thin wrapper: both strands are forwarded by keyword to
    ``get_recovery_percentage`` and its result is returned unchanged.
    """
    recovery = get_recovery_percentage(
        consensus_strand=consensus_strand,
        original_strand=original_strand,
    )
    return recovery

In [12]:

from random import sample

# Target sizes: 10 %, 20 %, ..., 100 % of the reads. The sizes are kept as
# floats here and truncated to whole reads only when the sample is drawn.
sample_sizes = []
for i in range(1, 11):
    sample_sizes.append(i * 0.1 * len(sequenced_strands))

# One random subset (drawn without replacement) per target size, in
# increasing order of size.
sampled_sequenced_strands = []
for sample_size in sample_sizes:
    sampled_sequenced_strands.append(sample(sequenced_strands, int(sample_size)))


In [15]:

def get_lowest_recovery_strands(sequenced_strands, step_size=200):
    """Find the largest tested sample size at which recovery is not perfect.

    Starting from all reads and shrinking by ``step_size`` reads at a time, a
    random subset of the reads is drawn, decoded with
    ``get_consensus_strand`` and scored with ``evaluate_consensus_strand``.
    The search stops at the first size whose score is not exactly 1.0. Sizes
    that score 1.0 are printed as progress.

    The reference strand is read from the notebook-level ``original_strand``
    variable, which must be defined before calling.

    Parameters
    ----------
    sequenced_strands : list
        Reads to sample from.
    step_size : int, optional
        Number of reads removed between consecutive trials (default 200).
        Must be positive.

    Returns
    -------
    tuple
        ``(n_strands, recovery_percentage)`` for the first size that does not
        score 1.0, or ``(0, None)`` when every tested size scored 1.0 (or
        there were no reads to test). A pair is returned in both cases so the
        result can always be unpacked into two names.

    Raises
    ------
    ValueError
        If ``step_size`` is not positive (the search would never terminate).
    """
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size}")

    n_strands = len(sequenced_strands)

    while n_strands > 0:
        sampled_sequenced_strand = sample(sequenced_strands, n_strands)

        consensus = get_consensus_strand(
            original_strand=original_strand,
            sequenced_strands=sampled_sequenced_strand,
        )
        recovery_percentage = evaluate_consensus_strand(consensus, original_strand=original_strand)

        if recovery_percentage != 1.0:
            return n_strands, recovery_percentage

        # Perfect recovery at this size: report it and try a smaller sample.
        print(n_strands)
        n_strands -= step_size

    # No tested size failed to recover the strand.
    return 0, None


# Run the search over every read of the run; the result is unpacked as a
# (sample size, recovery) pair.
lowest_recovery_strands, recovery_percentage = get_lowest_recovery_strands(sequenced_strands=sequenced_strands)
        

print(lowest_recovery_strands, recovery_percentage)

6016 0.2


In [145]:


import numpy as np

# Recovery as a function of the number of reads: 0, 2, 4, ..., 28 reads.
sample_ranges = np.arange(0, 30, 2)

# Draw every random subset up front, before any decoding takes place.
sampled_sequenced_strands = []
for sample_size in sample_ranges:
    sampled_sequenced_strands.append(sample(sequenced_strands, sample_size))

percentage_recoveries = []

for sampled_sequenced_strand in sampled_sequenced_strands:

    # Decode the subset, then score the result against the reference strand.
    consensus = get_consensus_strand(
        original_strand=original_strand,
        sequenced_strands=sampled_sequenced_strand,
    )
    recovery_percentage = evaluate_consensus_strand(consensus, original_strand=original_strand)
    print(recovery_percentage)
    percentage_recoveries.append(recovery_percentage)



0.21333333333333335
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
