In [1]:
%load_ext autoreload
%autoreload 2

In [65]:
import utils
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
import heirarchal_clustering
from Levenshtein import ratio, distance
import difflib

In [4]:
# Loading the original strands
original_strands, ids = utils.read_synthesized_strands_from_file(file_path=r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\raptor_data\cat.jpg_RU10 2.fasta")

In [5]:
# Loading the original strands
reads = utils.postprocess_badread_sequencing_data(fastq_filepath=r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\raptor_data\reads.fastq")

1954567it [02:06, 15461.81it/s]


In [12]:
sampled = random.sample(reads, 10000)

In [56]:
sample_stats = utils.get_sample_statistics(records=sampled, original_strands=original_strands, original_strand_ids=ids, distance_threshold=40, reference=True)

100%|██████████| 10000/10000 [00:01<00:00, 8244.53it/s]


In [57]:
sample_stats

{'distance_threshold': 40,
 'strands_by_index': array([ 8., 14.,  8., ..., 12.,  8.,  9.]),
 'mean_strands_per_index': 9.27788844621514,
 'std_strands_per_index': 2.9829310439656096,
 'unique_matches': 1003,
 'n_straight': 4547,
 'n_reverse': 4768,
 'unmatched': 471}

In [20]:
seqs_sampled = [str(i.seq) for i in sampled]


In [22]:
cluster_inds, cluster_strands, centroids = heirarchal_clustering.cluster_trivial(seqs_sampled, use_centroids=False, distance_threshold=40)

Total strands 10000


100%|██████████| 10000/10000 [11:27<00:00, 14.54it/s]

Number of clusters = 2181





In [25]:
indices = utils.get_sort_by_sublists_length(cluster_inds)

sorted_clusters = [cluster_inds[i] for i in indices]
sorted_centroids = [centroids[i] for i in indices]
sorted_clustered_seqs = [cluster_strands[i] for i in indices]

centroids = sorted_centroids
clusters = sorted_clusters
clustered_seqs = sorted_clustered_seqs

In [31]:


def get_best_candidates_and_recoveries(original_strands, candidates):
    """
    For a given set of strands, finds the best candidates and returns a dictionary with
    the recoveries, the number of fully recovered strands, the best set of candidates and the
    original strands
    """
    
    fully_recovered_strands = 0
    recoveries = []
    partially_recovered_recoveries = []
    best_candidates = []

    for strand in tqdm(original_strands):
        if strand in candidates:
            fully_recovered_strands += 1
            recoveries.append(1.0)
            best_candidates.append(strand)
        else:
            best_recovery_within_candidates = 0.0
            best_candidate = ""
            for candidate in candidates:
                rev_candidate = utils.reverse_complement(candidate)
                strand_recovery = ratio(candidate, strand)

                if strand_recovery > best_recovery_within_candidates:
                    best_recovery_within_candidates = strand_recovery
                    best_candidate = candidate

                strand_recovery = ratio(rev_candidate, strand)

                if strand_recovery > best_recovery_within_candidates:
                    best_recovery_within_candidates = strand_recovery
                    best_candidate = candidate

            recoveries.append(best_recovery_within_candidates)
            partially_recovered_recoveries.append(best_recovery_within_candidates)
            best_candidates.append(best_candidate)

    return {
        "recoveries": recoveries,
        "fully_recovered_strands": fully_recovered_strands,
        "partially_recovered_recoveries": partially_recovered_recoveries,
        "best_candidates": best_candidates,
        "original_strands": original_strands
    }


def check_clusters(original_strands, cluster_inds):

    for i in cluster_inds:
        record = reads[i[0]]
        try:
            print(ratio(record, original_strands[ids.index(utils.get_badread_strand_id(record))]))
            print(len(i))
            print()
        except:
            continue


In [None]:
np.mean(recoveries['recoveries'][:1004])

np.float64(0.888769849476563)

In [49]:
len(original_strands)

1004

In [60]:
centroid_indices = [i[0] for i in cluster_inds]
centroid_records = [sampled[j] for j in centroid_indices][:1004]

centroid_stats = utils.get_sample_statistics(records = centroid_records, original_strands=original_strands, original_strand_ids=ids, reference=True, distance_threshold=40)

100%|██████████| 1004/1004 [00:00<00:00, 24248.58it/s]


In [61]:
centroid_stats

{'distance_threshold': 40,
 'strands_by_index': array([1., 1., 1., ..., 1., 1., 1.]),
 'mean_strands_per_index': 0.9800796812749004,
 'std_strands_per_index': 0.3860123979098825,
 'unique_matches': 919,
 'n_straight': 444,
 'n_reverse': 540,
 'unmatched': 20}

In [63]:
guesses = []
for i in tqdm(range(1000)):
    guesses.append(heirarchal_clustering.make_prediction(clustered_seqs[i], 5))

  0%|          | 0/1000 [00:00<?, ?it/s]

ValueError: Sample larger than population or is negative

In [None]:

distance_threshold = 40

strands_by_index = np.zeros(len(original_strands))
straight_strands = 0
reverse_strands = 0

for ind, i in enumerate(guesses):

    revseq = utils.reverse_complement(i)

    record = sampled[cluster_inds[ind][0]]

    strand_id = utils.get_badread_strand_id(record)
    #synthesized_id = strand_ids_synthesized[strand_id]
    #index = original_strand_ids.index(synthesized_id)
    index = ids.index(strand_id)
    strand = original_strands[index]

    if distance(i, strand) <= distance_threshold:
        strands_by_index[index] += 1
        straight_strands += 1
        found_flag = True
        print(f"{index} found by {ind}")

    elif distance(revseq, strand) <= distance_threshold:
        strands_by_index[index] += 1
        reverse_strands += 1
        found_flag = True
        print(f"{index} found by r{ind}")

833 found by 0
392 found by r1
197 found by r2
147 found by r4
372 found by r7
933 found by 8
945 found by r9
896 found by 10
876 found by r11
322 found by 12
468 found by 13
705 found by r14
777 found by 15
492 found by 16
669 found by r17
901 found by 18
388 found by r19
217 found by 20
476 found by 36
446 found by 45
304 found by 46
544 found by 47
46 found by 48
187 found by r49
971 found by 50
639 found by r51
814 found by 52
161 found by r53
265 found by r54
843 found by r55
327 found by r56
564 found by 57
838 found by r58
683 found by 59
520 found by r60
359 found by r61
723 found by 62
882 found by r63
425 found by 155
754 found by r156
34 found by r157
956 found by 158
365 found by r159
608 found by r160
986 found by r161
212 found by r162
10 found by r163
94 found by 164
988 found by r165
373 found by 371
306 found by r618
936 found by 619
310 found by r620
214 found by r621
743 found by 622
70 found by r623
637 found by r624
847 found by r625
281 found by 626
676 found by 6

In [77]:
sum(strands_by_index)

np.float64(75.0)

In [78]:
reverse_strands

43