### Clustering using Edit Distance
Trying to implement strand isolation using purely Edit Distance. Want to select the same strands together after signatures using some similarity metric

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np
import aligned_clustering
from utils import get_fastq_records, load_json_file, get_original_strands, get_badread_strand_id, get_recovery_percentage, create_random_strand, len_histogram, get_sort_by_sublists_length
import Levenshtein
import random
from Levenshtein import ratio, distance
from collections import Counter
from seq_stat import align
import matplotlib.pyplot as plt
import heirarchal_clustering
import strand_filtering
from tqdm.notebook import tqdm
import utils

In [7]:
#records_original = get_fastq_records(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_no_adapters.fastq")
records = utils.postprocess_badread_sequencing_data(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_adapters.fastq\reads_adapters.fastq", reverse_oriented=False)

296it [00:00, 12781.59it/s]


In [8]:
original_strand_ids, coupling_rates, capping_flags, original_strands = get_original_strands(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\original_strands.txt")

strand_ids_synthesized = load_json_file(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\synthesized_uid_reference.json")

In [9]:
records = [i for i in records if get_badread_strand_id(i) in strand_ids_synthesized]
#records = filter_junk_reads(records)
seqs = [str(i.seq) for i in records]

In [10]:

# ID reference functions

def get_badread_strand_id(record):
    return record.description.split()[1].split(',')[0]

def get_strand_reference(strand_record, strand_index=True):

    strand_id = get_badread_strand_id(strand_record)

    if strand_id in strand_ids_synthesized:
        if strand_index:
            return original_strand_ids.index(strand_ids_synthesized[strand_id])
        else: 
            return original_strands[original_strand_ids.index(strand_ids_synthesized[strand_id])]
    print("Invalid Strand ID!")
    return None

In [11]:
def get_edit_distance_matrix(strands):
    """
    Returns the edit distance matrix for the strands
    O(n^2)
    """

    n_strands = len(strands)
    edit_distance_matrix = np.zeros([n_strands, n_strands])
    for i in range(n_strands - 1):
        for j in range(i + 1, n_strands):
            edit_distance_matrix[i,j] = edit_distance_matrix[j, i] = ratio(strands[i], strands[j])

    return edit_distance_matrix

def calculate_centroid(strands):
    edit_distance_matrix = get_edit_distance_matrix(strands)

    distances = [sum(edit_distance_matrix[i, :]) for i in range(len(edit_distance_matrix))]
    return strands[distances.index(min(distances))]


def get_mean_edit_distance_cluster(edit_distance_matrix):
    distances = [sum(edit_distance_matrix[i, :]) for i in range(len(edit_distance_matrix))]
    return np.mean(distances)


In [12]:
sample_statistics = utils.get_sample_statistics(records, original_strands, distance_threshold=40)

100%|██████████| 287/287 [00:00<00:00, 13375.76it/s]


In [13]:
sample_statistics

{'distance_threshold': 40,
 'strands_by_index': array([46., 68., 50., 55., 57.]),
 'n_straight': 141,
 'n_reverse': 135,
 'unmatched': 11}

In [14]:

# To catch the adapters
clusters_index, clustered_seqs, centroids = heirarchal_clustering.cluster_trivial(seqs, use_centroids=False, analysis=False, distance_threshold=40)
#clustered_seqs = [[seqs[i] for i in j] for j in clusters]

Total strands 287


100%|██████████| 287/287 [00:00<00:00, 17900.39it/s]

Number of clusters = 12





In [15]:
indices = get_sort_by_sublists_length(clusters_index)
sorted_clusters = [clusters_index[i] for i in indices]
sorted_centroids = [centroids[i] for i in indices]
sorted_clustered_seqs = [clustered_seqs[i] for i in indices]

centroids = sorted_centroids
clusters = sorted_clusters
clustered_seqs = sorted_clustered_seqs

In [16]:
[len(i) for i in clustered_seqs]

[59, 54, 51, 47, 36, 16, 15, 5, 1, 1, 1, 1]

Levenshtien edit distance works pretty well. That's my metric. Kmeans works for now. I'll bring down junk reads etc, remove adapters, and test it again to see how well aggregation works.
Maybe try DBscan with Levenshtien distance

In [17]:
# And then merge and validate for top 5

original_strand_guessed_best = []

for ind, i in enumerate(centroids):
    
    best_guessed = 0
    best_rec = 0.0
    for k, original_strand in enumerate(original_strands):
        rec = ratio(i, original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec
        
        rec = ratio(utils.reverse_complement(i), original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec
        

    original_strand_guessed_best.append(best_guessed)
    



    print(f"{best_guessed} {best_rec} recovered by {ind}".format())
    #print(len(i))
    #guess = heirarchal_clustering.make_prediction(clustered_seqs[ind], sample_size=10)
    #print(ratio(guess, original_strands[best_guessed]))
    #print(get_recovery_percentage(guess, original_strands[best_guessed]))
    #print()


    print(f"{len(clusters[ind])} elements in the cluster")
    #print(f"{strand_filtering.sum_entropies(i)} long run")
    #print(f"{np.mean([len(i) for i in clustered_seqs[ind]])} mean length of strand in cluster")
    #print(f"{np.std([len(i) for i in clustered_seqs[ind]])} mean std of strand in cluster")
    #print(f"{get_mean_edit_distance_cluster(distance_matrices[ind])} mean edit distance within cluster")
    print()

4 0.9586374695863747 recovered by 0
59 elements in the cluster

1 0.9406175771971497 recovered by 1
54 elements in the cluster

3 0.9582309582309583 recovered by 2
51 elements in the cluster

0 0.9408983451536643 recovered by 3
47 elements in the cluster

2 0.9304556354916067 recovered by 4
36 elements in the cluster

1 0.9311163895486936 recovered by 5
16 elements in the cluster

2 0.9245283018867925 recovered by 6
15 elements in the cluster

3 0.9245283018867925 recovered by 7
5 elements in the cluster

1 0.6299212598425197 recovered by 8
1 elements in the cluster

0 0.8312236286919832 recovered by 9
1 elements in the cluster

0 0.718562874251497 recovered by 10
1 elements in the cluster

4 0.5201465201465201 recovered by 11
1 elements in the cluster



In [18]:
# And then merge and validate for top 5

original_strand_guessed_best = []
guesses = []

for i in range(len(clustered_seqs)):
    if len(clustered_seqs) > 15:
        guess = heirarchal_clustering.make_prediction(clustered_seqs[i], sample_size=15)
    else:
        guess = heirarchal_clustering.make_prediction(clustered_seqs[i], sample_size=len(clustered_seqs[i]))
    
    best_guessed = 0
    best_rec = 0.0
    for k, original_strand in enumerate(original_strands):
        #rec = get_recovery_percentage(guess, original_strand)
        rec = align(guess, original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec
            aligned = align(guess, original_strand, identity=False)

    guesses.append(guess)
    original_strand_guessed_best.append(best_guessed)
    print(f"{best_guessed} {best_rec} recovered by {i}".format())
    print(aligned.format())
    print(len(guess))
    print()

4 1.0 recovered by 0
target           11 TTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCCGCAGGTTCCTTAGTCAGATGTAAA
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 TTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCCGCAGGTTCCTTAGTCAGATGTAAA

target           71 TACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTT

target          131 GCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCTGGCACG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 GCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCTGGCACG

target          191 CTTGCCGTAATAAGCGAACA 211
                180 |||||||||||||||||||| 200
query           180 CTTGCCGTAATAAGCGAACA 200

226

3 0.5110294117647058 recovered by 1
target            1 TCGT--TCAGTTACG--TA-TTGC--TATATCAC-C-AT-----AGGTC-

### Adapter trimming

In [19]:
original_strand_guessed_best

[4, 3, 3, 0, 0, 1, 2, 4, 1, 0, 4, 1]

In [87]:

def get_kmers(seq, k):
    return [seq[i:i+k] for i in range(len(seq) - k + 1)]
    

def remove_adapter(subseq, adapter):
    """
    Given the strand, remove the largest direct match of the starting adapter.
    """
    
    kmer_length = len(subseq)

    while kmer_length >=3:
        kmers = get_kmers(subseq, kmer_length)

        if any([i for i in kmers if i in adapter]):
            for ind, i in enumerate(kmers):
                if i in adapter:    
                    print(f"Length {kmer_length} adapter found at position {ind}")
                    return ind+kmer_length

        kmer_length -= 1

    return -1


def undress_strand(seq, starting_adapter, ending_adapter, len_original, original_strand=None):
    """
    """

    overhang = len(seq) - len_original
    print(f"Overhang is {overhang}")
    starting_subseq = seq[:overhang]
    first_index = remove_adapter(starting_subseq, starting_adapter)
    seq_ = seq[first_index:]


    if original_strand:
        rec = get_recovery_percentage(seq_, original_strand)
        if rec == 1.0:
            print("Found and recovered")
            return True
        else:
            print(align(seq_, original_strand))
            return False
    
            
           


In [79]:
picked_strand = original_strands[4]
guess = guesses[0]

In [80]:
# Can use forward and reverse adapter to figure out orientation as well

start_adapter = "AATGTACTTCGTTCAGTTACGTATTGCT" 
end_adapter = "GCAATACGTAACTGAACGAAGT"

In [88]:

for i, j in zip(guesses[:5], original_strand_guessed_best):
    original_strand = original_strands[j]
    if undress_strand(i, start_adapter, end_adapter, 200, original_strand):
        print()
        continue
    undress_strand(utils.reverse_complement(i), start_adapter, end_adapter, 200, original_strand)
    print()

Overhang is 26
Length 11 adapter found at position 0
Found and recovered

Overhang is 21
Length 21 adapter found at position 0
0.49236641221374045
Overhang is 21
Length 3 adapter found at position 2
0.4854014598540146

Overhang is 20
Length 18 adapter found at position 0
Found and recovered

Overhang is 27
Length 14 adapter found at position 0
Found and recovered

Overhang is 21
Length 18 adapter found at position 0
0.4684014869888476
Overhang is 21
Length 3 adapter found at position 1
0.4855072463768116



In [37]:
remove_adapter(guess[-14:], end_adapter)

Length 13 adapter found at position 0


(0, 13)

In [48]:
guess = guess[11:-15]

In [49]:
guess

'TTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCCGCAGGTTCCTTAGTCAGATGTAAATACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTTGCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCTGGCACGCTTGCCGTAATAAGCGAACA'

In [50]:
picked_strand

'TTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCCGCAGGTTCCTTAGTCAGATGTAAATACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTTGCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCTGGCACGCTTGCCGTAATAAGCGAACA'

In [51]:
get_recovery_percentage(guess, picked_strand)

1.0

In [None]:
len(start_adapter)

28

In [None]:
len(end_adapter)

22

In [None]:
for i in seqs:
    print(align(i[:10], start_adapter))
    print(align(i[:10], end_adapter))
    print()

0.47619047619047616
0.3888888888888889

1.0
0.5

0.34615384615384615
0.391304347826087

0.625
0.4444444444444444

0.375
0.75

1.0
0.7272727272727273

1.0
0.42105263157894735

0.4090909090909091
0.391304347826087

0.5882352941176471
0.36363636363636365

1.0
0.4444444444444444

0.625
0.4444444444444444

0.6
0.3157894736842105

0.38095238095238093
0.34782608695652173

0.5882352941176471
0.38095238095238093

1.0
0.4090909090909091

0.5
0.5294117647058824

1.0
0.5

0.625
0.4444444444444444

0.4090909090909091
0.391304347826087

0.4090909090909091
0.391304347826087

0.375
0.75

1.0
0.5

0.375
0.75

0.4
0.4444444444444444

0.5
0.5294117647058824

0.375
0.75

1.0
0.5625

0.9
0.3888888888888889

0.375
0.75

1.0
0.4090909090909091

0.6428571428571429
0.5

0.45454545454545453
0.6363636363636364

0.34782608695652173
0.47058823529411764

0.375
0.75

0.4090909090909091
0.391304347826087

0.6
0.5

0.375
0.75

0.5555555555555556
0.5384615384615384

0.6
0.3684210526315789

1.0
0.5

0.5555555555555556
0