## Testing clustering on the best sequenced data

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import os
from Bio import SeqIO
from seq_stat import align
import pandas as pd
import numpy as np
from aligned_clustering import conduct_align_clustering
import matplotlib.pyplot as plt
import random
from utils import get_recovery_percentage

In [2]:
# Root folder holding one sub-folder per sequencing experiment.
folder_path = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\sequencingData"
# Name of the experiment sub-folder analysed first.
# NOTE(review): `sekected_file` is a typo for "selected"; the name is left
# unchanged here because other cells may reference it.
sekected_file = "290824_75ntv2_NC_PhuRV140824"
selected_filepath = os.path.join(folder_path, sekected_file)
# Path to one specific FASTQ file inside the experiment folder.
fastq_filepath = os.path.join(selected_filepath, 'fastq_runid_f3e5a38afcf7007e36b0348f7058ffe3a2b51fdd_0_0.fastq')
# The readme is read later by get_original_strand() to obtain the original strand.
readme_filepath = os.path.join(selected_filepath, "readme.txt")

In [27]:

def parse_biopython(input_fastq):
    """Lazily iterate over the records of a FASTQ file.

    Args:
        input_fastq: Path (or handle) handed unchanged to ``SeqIO.parse``.

    Yields:
        Every record produced by ``SeqIO.parse(input_fastq, 'fastq')``, in order.
    """
    yield from SeqIO.parse(input_fastq, 'fastq')

def process_fastq(fastq_filepath):
    """Read every sequence of a FASTQ file as a plain string.

    Args:
        fastq_filepath: Path to the FASTQ file; handed to ``parse_biopython``.

    Returns:
        list[str]: ``str(record.seq)`` for each record, in file order. Records
        whose sequence cannot be converted to a string are skipped.
    """
    sequenced_strands = []
    for record in parse_biopython(fastq_filepath):
        try:
            sequenced_strands.append(str(record.seq))
        except Exception:
            # Best effort: skip a record whose sequence cannot be read.
            # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
            # and SystemExit propagate, so a long read can still be interrupted.
            continue

    return sequenced_strands

def collect_strands_from_fastq_folder(folder_path):
    """Collect the sequences of every FASTQ file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: Sequences from all entries whose name ends with ``fastq``,
        concatenated in sorted file-name order.
    """
    sequenced_strands = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the strand
    # order (and therefore any seeded sampling done on it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('fastq'):
            sequenced_strands.extend(process_fastq(os.path.join(folder_path, file_name)))
    return sequenced_strands

def get_original_strand(readme_filepath, line_location=1):
    """Read the original strand from an experiment's readme file.

    Args:
        readme_filepath: Path to the readme text file.
        line_location: Zero-based index of the line that holds the strand
            (default 1, i.e. the second line of the file).

    Returns:
        str: The requested line with surrounding whitespace (including the
        line terminator) removed.

    Raises:
        IndexError: If the file has no line at ``line_location``.
    """
    with open(readme_filepath, 'r') as f:
        lines = f.readlines()
    try:
        line = lines[line_location]
    except IndexError:
        raise IndexError(
            "{} has {} line(s); no line at index {}".format(
                readme_filepath, len(lines), line_location)
        ) from None
    # strip() rather than replace('\n', ''): stray spaces, tabs or '\r' around
    # the strand would otherwise end up inside the reference sequence.
    return str(line).strip()

def cluster(sequenced_strands, original_strand):
    """Run aligned clustering and return its best-recovery result.

    Thin wrapper around ``conduct_align_clustering`` that always passes
    ``best_recovery=True``.

    Args:
        sequenced_strands: Reads to cluster (passed as ``trimmed_seqs``).
        original_strand: Reference strand the clustering is scored against.

    Returns:
        Whatever ``conduct_align_clustering`` returns for these arguments.
    """
    clustering_kwargs = {
        "trimmed_seqs": sequenced_strands,
        "original_strand": original_strand,
        "best_recovery": True,
    }
    return conduct_align_clustering(**clustering_kwargs)

def filter_by_length(sequenced_strands, lb, ub):
    """Keep the strands whose length lies strictly between ``lb`` and ``ub``.

    Args:
        sequenced_strands: Iterable of strands (anything supporting ``len``).
        lb: Exclusive lower bound on the length.
        ub: Exclusive upper bound on the length.

    Returns:
        list: The strands with ``lb < len(strand) < ub``, in input order.
    """
    kept = []
    for strand in sequenced_strands:
        if lb < len(strand) < ub:
            kept.append(strand)
    return kept

def evaluate_candidates(original_strand, candidates, analysis_flag=False):
    """Score candidates after fixing their starting position.

    Each candidate is aligned against ``original_strand``. Everything before
    the first candidate base that takes part in the alignment is dropped, and
    the remaining suffix is scored with ``get_recovery_percentage``.

    Args:
        original_strand: Reference strand.
        candidates: Iterable of candidate strands.
        analysis_flag: When True, return the per-candidate details instead of
            the single best score.

    Returns:
        float: The highest recovery over all candidates (0.0 when there are
        none), or, when ``analysis_flag`` is True, a tuple
        ``(post_alignment_recoveries, aligned_candidates)`` in candidate order.
    """
    post_alignment_recoveries = []
    aligned_candidates = []

    for candidate in candidates:
        _, _, indices = align(seqA=original_strand, seqB=candidate)
        # indices[1] holds the candidate positions of the alignment columns; in
        # the alignments printed in this notebook -1 marks a gap. Take the first
        # real position: slicing with -1 would keep only the last base.
        # Fall back to 0 (keep the whole candidate) if nothing aligned.
        start = next((int(pos) for pos in indices[1] if pos >= 0), 0)
        aligned_candidate = candidate[start:]
        recovery = get_recovery_percentage(
            consensus_strand=aligned_candidate, original_strand=original_strand)

        aligned_candidates.append(aligned_candidate)
        post_alignment_recoveries.append(recovery)

    if analysis_flag:
        return post_alignment_recoveries, aligned_candidates

    # Seeding with 0.0 keeps the original floor and the empty-input result.
    return max([0.0] + post_alignment_recoveries)


In [5]:
sequenced_strands = collect_strands_from_fastq_folder(folder_path=selected_filepath)
original_strand = get_original_strand(readme_filepath=readme_filepath)

In [6]:
# Baseline: best recovery of aligned clustering on the raw, unfiltered reads.
# cluster() forwards to conduct_align_clustering(..., best_recovery=True).
cluster(sequenced_strands=sequenced_strands, original_strand=original_strand)

0.44

Aligned clustering on the raw reads recovers only 44 % of the original strand, so the sequenced data clearly needs post-processing before clustering.

### Postprocessing

In [7]:
# What if we select only those strands whose length is strictly between
# 100 and 150 bases?
sequenced_strands_length_filtered = filter_by_length(
    sequenced_strands=sequenced_strands, lb=100, ub=150)

# Fraction of the pool that survives the filter, then the absolute count.
print(len(sequenced_strands_length_filtered)/len(sequenced_strands))
print(len(sequenced_strands_length_filtered))

0.7094501455046714
4632


In [8]:
print(cluster(sequenced_strands=sequenced_strands_length_filtered, original_strand=original_strand))

0.4266666666666667


Restricting the pool to the ~71 % of reads that pass the length filter gives essentially the same result (0.43 vs. 0.44). Let's try filtering further using alignment.

In [9]:

# Align every length-filtered read against the original strand and print the
# identity and index mapping of the reads that match it closely (> 0.8).
for strand in sequenced_strands_length_filtered:
    aligned, identity, indices = align(seqA=original_strand, seqB=strand)
    if identity > 0.8:
        print(identity)
        # In the printed output, row 0 appears to hold positions in the original
        # strand and row 1 positions in the read, with -1 marking a gap
        # (NOTE(review): confirm against seq_stat.align).
        print(indices)

1.0
[[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
   72  73  74]
 [ 32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49
   50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
   68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
   86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103
  104 105 106]]
0.8571428571428571
[[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  -1  69  -1
   -1  -1  -1  70  71  72  -1  -1  -1  -1  73  74]
 [ 31  

In [10]:
# Drop the first 30 bases of every read: the alignments printed above start at
# read positions 31-32, i.e. roughly 30 leading bases precede the original strand.
# NOTE(review): a fixed offset assumes every read carries the same prefix length.
seq_strands_filtered_2 = [i[30:] for i in sequenced_strands_length_filtered]
print(cluster(sequenced_strands=seq_strands_filtered_2, original_strand=original_strand))

1.0


If we already know what we are looking for, can't we just do alignment? Why cluster at all? It seems that matching the start and end of the reads to the original strand improves clustering a lot, hmm. Could I have done this without any a priori knowledge? How else could I clean the data so that it naturally gives me what I want?
Idea: protect a small piece of information, hmm.

In [11]:
recoveries = conduct_align_clustering(
    trimmed_seqs = sequenced_strands,
    original_strand=original_strand
)

In [14]:
# Candidate strands produced by the clustering run in the previous cell.
candidates = recoveries['candidates']

# Align each candidate against the original strand.
# NOTE(review): the results are overwritten on every iteration and never
# printed or stored, so only the last candidate's values remain afterwards.
for candidate in candidates:
    aligned, identity, indices = align(seqA=original_strand, seqB=candidate)

Okay, so now we know that the length changes don't really affect our ability to get the correct strand: we just need to align the final candidate, since clustering already gets rid of all the IDS (insertion/deletion/substitution) errors in between, so the candidate is just shifted. However, if we don't know what we are looking for, guessing the best candidate's start and end is impossible. But that's why we have ECC! Okay, so now we are getting somewhere. Let us try a harder dataset.

## Dataset 2

In [40]:
original_strand = 'TCGAAAGTGGAGCCGCGGCGACACTCATCTGCTATACAGTAGCTATACGACGATATGACGTGAGCGCTGACGGACCGGCGCTCAACTGGC'
sequenced_strands = collect_strands_from_fastq_folder(folder_path=r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\sequencingData\300824_ran0_NC_PhuRV140824")

In [41]:
# Keep only reads whose length is strictly between 80 and 150 bases.
seq_strands_len_filtered = filter_by_length(
    sequenced_strands=sequenced_strands, lb=80, ub=150)
print(len(seq_strands_len_filtered))

63196


In [46]:

# Cap the number of reads sent to clustering. The threshold and the sample size
# are the same constant: the previous guard (> 5000) combined with a sample of
# 10000 made random.sample raise ValueError for pools of 5001-9999 reads.
MAX_CLUSTERING_STRANDS = 10000

strands_for_clustering = seq_strands_len_filtered
if len(strands_for_clustering) > MAX_CLUSTERING_STRANDS:
    strands_for_clustering = random.sample(strands_for_clustering, MAX_CLUSTERING_STRANDS)

recoveries = conduct_align_clustering(
    trimmed_seqs=strands_for_clustering,
    original_strand=original_strand
)

# Best recovery among the raw (not yet re-aligned) candidates.
print(max(recoveries['recoveries']))

0.35555555555555557


In [47]:
evaluate_candidates(
    original_strand=original_strand,
    candidates=recoveries['candidates']
)

0.9888888888888889

### Testing protected information on part of strand

In [40]:
protected_strand = original_strand[11:41]

In [41]:
for candidate in recoveries['candidates']:
    aligned, identity, indices = align(seqA=protected_strand, seqB=candidate)
    if identity > 0.8:
        print(aligned)
        print(identity)
        print(indices)
        print()

GCCGCGGCGACACTCATCTGCTAGACAGTA
0.9666666666666667
[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24 25 26 27 28 29]
 [46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
  70 71 72 73 74 75]]

GCCGCGGCGACACTCATCTGCTATACAGTA
0.9090909090909091
[[ 0  1  2  3 -1 -1 -1  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
  21 22 23 24 25 26 27 28 29]
 [46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
  70 71 72 73 74 75 76 77 78]]

GCCGCGGCGACACTCATCTGCTATACAGTA
1.0
[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24 25 26 27 28 29]
 [48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
  72 73 74 75 76 77]]

GCCGCGCCGACACTCATCTGCTATACAGTA
0.8787878787878788
[[ 0  1  2  3  4  5  6  7  8  9 10 11 -1 -1 -1 12 13 14 15 16 17 18 19 20
  21 22 23 24 25 26 27 28 29]
 [43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
  67 68 69 70 71 72 73 74 75]]

GCCGCGGCGAC

How big the protected strand needs to be in order to cope with random errors still has to be figured out, but yeah - this works really well. I don't even need that many strands, only enough to figure out the alignment error. The question is: does it work with random strands, and can I distinguish them well? Okay, we have moved forward; the next steps are to be thought about later. I wonder how well it works if we make a massive strand pool with all the strands of appropriate length.

## Capping sequencing experiment

In [33]:
# Folder of the capping experiment; its readme sits directly inside it.
selected_filepath = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\sequencingData\310524_75V2_OK_RVcompl"
# Derive the readme path from the folder instead of repeating the literal, so
# the two cannot drift apart (same pattern as the first experiment's cell).
readme_filepath = os.path.join(selected_filepath, "readme.txt")


sequenced_strands = collect_strands_from_fastq_folder(folder_path=selected_filepath)

In [13]:
original_strand = get_original_strand(readme_filepath=readme_filepath, line_location=1).strip()

In [37]:
sequenced_strands = filter_by_length(sequenced_strands=sequenced_strands, lb=100, ub=150)
# random.sample raises ValueError when asked for more items than the population
# holds, and some experiments have fewer than 5000 reads (e.g. 3840), so cap the
# sample size at the number of reads that survived the filter.
sequenced_strands_sampled = random.sample(
    sequenced_strands, min(5000, len(sequenced_strands)))

In [38]:
# Cluster the random sample drawn in the previous cell. The full filtered pool
# was passed here before, which left `sequenced_strands_sampled` unused.
recoveries = conduct_align_clustering(
    trimmed_seqs=sequenced_strands_sampled,
    original_strand=original_strand
)

In [39]:
evaluate_candidates(
    original_strand=original_strand,
    candidates=recoveries['candidates'],
    analysis_flag=False
)

1.0

## Creating strand pool

In [48]:
original_strands = []  # Ordered
sequenced_strands = []
sequenced_strands_per_experiment = []  # Ordered as per original_strands

In [78]:
# Only two distinct original strands are used across the experiments; name them
# once so the mapping below stays readable.
_STRAND_75NT_V2 = "AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG"
_STRAND_RAN0 = "TCGAAAGTGGAGCCGCGGCGACACTCATCTGCTATACAGTAGCTATACGACGATATGACGTGAGCGCTGACGGACCGGCGCTCAACTGGC"

# Experiment folder name -> original strand synthesised in that experiment.
original_strands_by_experiment = {
    "150524_4thTry_75ntV2_sample5x_reag2x": _STRAND_75NT_V2,
    "290824_75ntv2_NC_PhuRV140824": _STRAND_75NT_V2,
    "300824_ran0_NC_PhuRV140824": _STRAND_RAN0,
    "310524_75V2_OK_RVcompl": _STRAND_75NT_V2,
    # NOTE(review): the folder name mentions 75ntV2 but it is mapped to the
    # 90-nt ran0 strand - confirm this is intended.
    "IDT_source_150524_4thTry_75ntV2_sample5x_reag2x": _STRAND_RAN0,
    "v2_031024_negControl_ssran0-IDT_200fmol": _STRAND_RAN0,
    "v2_041024_negCtrl_ss75v2_200fmol": _STRAND_75NT_V2,
    "v2_160523_75ntV2_2ndTry_200fmolT_300fmolN9": _STRAND_75NT_V2,
    "v2_170524_ran0IDT_200fmol_300fmol_polI_N9": _STRAND_RAN0,
}

# One strand per experiment, in the insertion order of the mapping above.
original_strands = [
    original_strands_by_experiment[name] for name in original_strands_by_experiment
]

In [76]:

# Build one pooled list of reads across all experiment folders, and remember
# which reads came from which experiment.
path_to_experiment_data = r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\sequencingData"
strand_pool = []
sequenced_strands_by_experiment = {}

for file in os.listdir(path_to_experiment_data):
    if file != '.git':
        # Every other entry is treated as an experiment folder of FASTQ files.
        sequenced_strands = collect_strands_from_fastq_folder(
            os.path.join(path_to_experiment_data, file))
        sequenced_strands_by_experiment[file] = sequenced_strands
        strand_pool += sequenced_strands
    else:
        # The repository metadata folder holds no reads.
        print("skipped file")

skipped file


In [118]:
strand_pool_filtered = filter_by_length(sequenced_strands=strand_pool, lb=100, ub=180)
strand_pool_sampled = random.sample(strand_pool_filtered, 2000)

In [119]:
# Cluster the sampled multi-experiment pool.
# NOTE(review): every other call in this notebook passes a single strand string
# as `original_strand`; here the whole list `original_strands` is passed.
# Confirm conduct_align_clustering supports a list, otherwise the 'recoveries'
# it returns for this run may not be meaningful.
recoveries = conduct_align_clustering(
    trimmed_seqs=strand_pool_sampled,
    original_strand=original_strands
)

In [120]:
for original_strand in original_strands:
    print(evaluate_candidates(original_strand=original_strand, candidates=recoveries['candidates']))

0.3466666666666667
0.3466666666666667
0.4
0.3466666666666667
0.4
0.4
0.3466666666666667
0.3466666666666667
0.4


In [93]:
print(list(original_strands_by_experiment.keys()))

['150524_4thTry_75ntV2_sample5x_reag2x', '290824_75ntv2_NC_PhuRV140824', '300824_ran0_NC_PhuRV140824', '310524_75V2_OK_RVcompl', 'IDT_source_150524_4thTry_75ntV2_sample5x_reag2x', 'v2_031024_negControl_ssran0-IDT_200fmol', 'v2_041024_negCtrl_ss75v2_200fmol', 'v2_160523_75ntV2_2ndTry_200fmolT_300fmolN9', 'v2_170524_ran0IDT_200fmol_300fmol_polI_N9']


In [96]:
strands_per_experiment = [len(i) for i in sequenced_strands_by_experiment.values()]

In [97]:
strands_per_experiment

[26475, 6529, 66461, 38869, 3840, 22914, 6016, 20712, 3840]