### Clustering using Edit Distance
Trying to implement strand isolation using purely Edit Distance. Want to select the same strands together after signatures using some similarity metric

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
from aligned_clustering import conduct_align_clustering
from utils import get_fastq_records, load_json_file, get_original_strands, get_badread_strand_id, get_recovery_percentage, create_random_strand
import Levenshtein
import random
from Levenshtein import ratio, distance
from collections import Counter
from heirarchal_clustering import filter_junk_reads
from seq_stat import align
import matplotlib.pyplot as plt
import heirarchal_clustering

In [5]:
#records_original = get_fastq_records(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_no_adapters.fastq")
records_original = get_fastq_records(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_adapters.fastq\reads_adapters.fastq")

In [6]:
original_strand_ids, coupling_rates, capping_flags, original_strands = get_original_strands(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\original_strands.txt")

strand_ids_synthesized = load_json_file(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\synthesized_uid_reference.json")

In [11]:
records = [i for i in records_original if get_badread_strand_id(i) in strand_ids_synthesized]
#records = filter_junk_reads(records)
seqs = [str(i.seq) for i in records]

In [30]:

# ID reference functions

def get_badread_strand_id(record):
    return record.description.split()[1].split(',')[0]

def get_strand_reference(strand_record, strand_index=True):

    strand_id = get_badread_strand_id(strand_record)

    if strand_id in strand_ids_synthesized:
        if strand_index:
            return original_strand_ids.index(strand_ids_synthesized[strand_id])
        else: 
            return original_strands[original_strand_ids.index(strand_ids_synthesized[strand_id])]
    print("Invalid Strand ID!")
    return None

In [125]:
def get_edit_distance_matrix(strands):
    """
    Returns the edit distance matrix for the strands
    O(n^2)
    """

    n_strands = len(strands)
    edit_distance_matrix = np.zeros([n_strands, n_strands])
    for i in range(n_strands - 1):
        for j in range(i + 1, n_strands):
            edit_distance_matrix[i,j] = edit_distance_matrix[j, i] = ratio(strands[i], strands[j])

    return edit_distance_matrix

def calculate_centroid(strands):
    edit_distance_matrix = get_edit_distance_matrix(strands)

    distances = [sum(edit_distance_matrix[i, :]) for i in range(len(edit_distance_matrix))]
    return strands[distances.index(min(distances))]


def get_mean_edit_distance_cluster(edit_distance_matrix):
    distances = [sum(edit_distance_matrix[i, :]) for i in range(len(edit_distance_matrix))]
    return np.mean(distances)


In [134]:

# To catch the adapters
clusters, centroids, distance_matrices = heirarchal_clustering.cluster_trivial(seqs, similarity_threshold=0.75, use_centroids=True, analysis=True)
clustered_seqs = [[str(records[i].seq) for i in j] for j in clusters]

287it [00:00, 392.78it/s]


Levenshtien edit distance works pretty well. That's my metric. Kmeans works for now. I'll bring down junk reads etc, remove adapters, and test it again to see how well aggregation works.
Maybe try DBscan with Levenshtien distance

In [146]:
# And then merge and validate for top 5

original_strand_guessed_best = []

for ind, i in enumerate(centroids):
    
    best_guessed = 0
    best_rec = 0.0
    for k, original_strand in enumerate(original_strands):
        rec = ratio(i, original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec

    original_strand_guessed_best.append(best_guessed)
    if best_rec > 0.9:
        print(align(i, original_strands[best_guessed], identity=False))
        print(i)
        print(original_strands[best_guessed])
    #print(f"{best_guessed} {best_rec} recovered by {ind}".format())
    #print(len(i))

    #print(f"{len(clusters[ind])} elements in the cluster")
    #print(f"{np.mean([len(i) for i in clustered_seqs[ind]])} mean length of strand in cluster")
    #print(f"{np.std([len(i) for i in clustered_seqs[ind]])} mean std of strand in cluster")
    #print(f"{get_mean_edit_distance_cluster(distance_matrices[ind])} mean edit distance within cluster")
    print()

target           25 TTTCAGTCTGTGCGAGTGACAGATCAATCCCA-CCCGCAGGTTCCTTAGTCAGATGTAAA
                  0 ||||||||||||||||||||||||||||||||-|||||||||||||||||||||||||||
query             0 TTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCCGCAGGTTCCTTAGTCAGATGTAAA

target           84 TACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCTCTGGAGAGGGAGCTT

target          144 GCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCTGGGCAC
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||-|||||
query           120 GCACATTGAAACAAATCATTCTTCACCCATTACGGAGTCCAGGCGCTCGTAGCT-GGCAC

target          204 GCTTGCCGTAATAAGCGAACA 225
                180 ||||||||||||||||||||| 201
query           179 GCTTGCCGTAATAAGCGAACA 200

ATGTACTTCGTTCAGTTACGTATTCTTTCAGTCTGTGCGAGTGACAGATCAATCCCACCCGCAGGTTCCTTAGTCAGATGTAAATACGACCCGGGTGTGAGTACTGTACTGAGTACTCAGTGGAACCCT

In [109]:
clusters_2 = heirarchal_clustering.cluster_trivial(centroids, similarity_threshold=0.6)

13it [00:00, 1182.65it/s]


In [136]:
clustered_seqs.sort(key=len, reverse=True)


In [137]:
# And then merge and validate for top 5

original_strand_guessed_best = []

for i in range(len(clustered_seqs)):
    if len(clustered_seqs) > 15:
        guess = heirarchal_clustering.make_prediction(clustered_seqs[i], sample_size=15)
    else:
        guess = heirarchal_clustering.make_prediction(clustered_seqs[i], sample_size=len(clustered_seqs[i]))
    
    best_guessed = 0
    best_rec = 0.0
    for k, original_strand in enumerate(original_strands):
        rec = get_recovery_percentage(guess, original_strand)
        rec = align(guess, original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec

    original_strand_guessed_best.append(best_guessed)
    print(f"{best_guessed} {best_rec} recovered by {i}".format())
    print()

3 0.5110294117647058 recovered by 0

3 1.0 recovered by 1

1 1.0 recovered by 2

0 0.5243445692883895 recovered by 3

4 1.0 recovered by 4

2 1.0 recovered by 5

0 0.5183823529411765 recovered by 6

0 0.4856115107913669 recovered by 7

0 1.0 recovered by 8

4 0.5054545454545455 recovered by 9

1 0.9900497512437811 recovered by 10

4 0.46875 recovered by 11

1 0.3701657458563536 recovered by 12



In [79]:
# Needs further testing