In [7]:
%load_ext autoreload
%autoreload 2

In [30]:
import utils
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
import heirarchal_clustering
from Levenshtein import ratio

In [8]:
# Loading the original strands
original_strands = utils.read_fasta_data(fasta_filepath=r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\raptor_data\cat.jpg_RU10.fasta")[0]
reads = utils.get_fastq_records(fastq_filepath=r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\raptor_data\reads.fastq")
original_strand_length = len(original_strands[0])

In [14]:
len(reads)

1954567

In [25]:
seqs = [str(i.seq) for i in reads]
sampled = random.sample(seqs, 10000)

In [27]:
clusters, centroids = heirarchal_clustering.cluster_trivial(sampled, use_centroids=False)

Total strands 10000


10000it [03:44, 44.63it/s]


In [43]:
clusters2, centroids2 = heirarchal_clustering.cluster_trivial(sampled, use_centroids=False)

Total strands 10000


3047it [00:31, 97.72it/s] 


KeyboardInterrupt: 

In [35]:


def get_best_candidates_and_recoveries(original_strands, candidates):
    """
    For a given set of strands, finds the best candidates and returns a dictionary with
    the recoveries, the number of fully recovered strands, the best set of candidates and the
    original strands
    """
    
    fully_recovered_strands = 0
    recoveries = []
    partially_recovered_recoveries = []
    best_candidates = []

    for strand in tqdm(original_strands):
        if strand in candidates:
            fully_recovered_strands += 1
            recoveries.append(1.0)
            best_candidates.append(strand)
        else:
            best_recovery_within_candidates = 0.0
            best_candidate = ""
            for candidate in candidates:
                strand_recovery = ratio(candidate, strand)

                if strand_recovery > best_recovery_within_candidates:
                    best_recovery_within_candidates = strand_recovery
                    best_candidate = candidate

            recoveries.append(best_recovery_within_candidates)
            partially_recovered_recoveries.append(best_recovery_within_candidates)
            best_candidates.append(best_candidate)

    return {
        "recoveries": recoveries,
        "fully_recovered_strands": fully_recovered_strands,
        "partially_recovered_recoveries": partially_recovered_recoveries,
        "best_candidates": best_candidates,
        "original_strands": original_strands
    }


In [36]:
recoveries = get_best_candidates_and_recoveries(original_strands, centroids)
print(recoveries['partially_recovered_recoveries'])

  0%|          | 0/770 [00:00<?, ?it/s]

[0.675, 0.6844660194174756, 0.6901763224181361, 0.6618357487922706, 0.6780487804878048, 0.6783919597989949, 0.6765432098765432, 0.6779661016949152, 0.681704260651629, 0.6802030456852792, 0.6828087167070218, 0.6813725490196079, 0.6733167082294265, 0.6921241050119331, 0.6940874035989717, 0.6834170854271358, 0.6836734693877551, 0.6783042394014962, 0.685, 0.689156626506024, 0.684596577017115, 0.6715686274509804, 0.6768447837150127, 0.6894865525672371, 0.6801007556675063, 0.6864197530864198, 0.6847290640394088, 0.6780487804878048, 0.6682808716707022, 0.6847290640394088, 0.6785714285714286, 0.69, 0.6829268292682926, 0.6733668341708543, 0.6938775510204082, 0.6798029556650247, 0.6831683168316831, 0.6862745098039216, 0.6778042959427208, 0.6763285024154589, 0.6732673267326732, 0.6826923076923077, 0.6812652068126521, 0.6814814814814815, 0.9618320610687023, 0.6865671641791045, 0.6751918158567776, 0.684596577017115, 0.6810551558752997, 0.9086538461538461, 0.6798029556650247, 0.6813725490196079, 0.6

In [44]:
clusters.sort(key=len, reverse=True)

In [46]:
clustered_seqs = [[seqs[i] for i in j] for j in clusters]

In [None]:
for ind, seqs in enumerate(clustered_seqs):
    try:
        guess = heirarchal_clustering.make_prediction(cluster=seqs, sample_size=len(seqs) - 1)
    except:
        continue
    
    best_rec = 0.0
    for k, i in enumerate(original_strands):
        rec = ratio(i, guess)
        if rec > best_rec:
            best_rec = rec
            ref = k
    
    print(f"{k} {best_rec} recovered by {ind}")

769 0.6855791962174941 recovered by 0
769 0.6855791962174941 recovered by 1
769 0.6855791962174941 recovered by 2
Command '['C:\\Users\\Parv\\Doc\\RA\\Projects\\incomplete_cycles\\muscle-windows-v5.2.exe', '-align', 'clm.fasta', '-output', 'clmout.fasta']' returned non-zero exit status 1.
769 0.6855791962174941 recovered by 4
769 0.6855791962174941 recovered by 5
769 0.689156626506024 recovered by 6
769 0.689156626506024 recovered by 7
769 0.689156626506024 recovered by 8
769 0.689156626506024 recovered by 9
769 0.689156626506024 recovered by 10
769 0.689156626506024 recovered by 11
769 0.689156626506024 recovered by 12
769 0.689156626506024 recovered by 13
769 0.689156626506024 recovered by 14
769 0.689156626506024 recovered by 15
769 0.689156626506024 recovered by 16
769 0.6909090909090909 recovered by 17
769 0.6909090909090909 recovered by 18
769 0.6909090909090909 recovered by 19
769 0.6909090909090909 recovered by 20
769 0.6909090909090909 recovered by 21
769 0.6909090909090909 re