### Clustering using Hash functions
Trying to implement strand isolation using purely Hash functions. Want to select the same strands together after signatures using some similarity metric

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import numpy as np
from aligned_clustering import conduct_align_clustering
from utils import get_fastq_records, load_json_file, get_original_strands, get_badread_strand_id, get_recovery_percentage

In [16]:
records_original = get_fastq_records(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_no_adapters.fastq")
records_original = get_fastq_records(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\reads_0.98.fastq")

In [17]:
original_strand_ids, coupling_rates, capping_flags, original_strands = get_original_strands(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\original_strands.txt")

strand_ids_synthesized = load_json_file(r"C:\Users\Parv\Doc\RA\Projects\incomplete_cycles\v2\runs\2025-01-30 21.21.00.463318\synthesized_uid_reference.json")

In [18]:
records = [i for i in records_original if get_badread_strand_id(i) in strand_ids_synthesized]
sequenced_strands = [str(i.seq) for i in records]
strand_ids_sequenced = [get_badread_strand_id(i) for i in records]

So we now have a bunch of strands, and their corresponding original strands. Let's turn them all into kmers

In [19]:
def kmerDNA(seq, k=3):
    kmer = []
    kmer_representation = np.zeros(64)
    for ell in range(len(seq) - k + 1):
        nstr = seq[ell: ell+k]
        index = 0
        for j, c in enumerate(nstr):
            if c == 'A':
                i = 0
            elif c == 'C':
                i = 1
            elif c == 'G':
                i = 2
            elif c == 'T':
                i = 3
            else:
                index = -1
                break
            index += i*(4**j)
        kmer_representation[index] += 1

    return kmer_representation

In [20]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.feature_extraction.text import TfidfTransformer
from collections import Counter

data = [kmerDNA(i) for i in sequenced_strands]
data = normalize(data, norm='l1', axis=1)


tfidf_transformer = TfidfTransformer()
data = tfidf_transformer.fit_transform(data).toarray()
print(len(data))
#scaler = StandardScaler()
#z_score_norm = scaler.fit_transform(data)
#inertias = []

# elbow is where the inertia becomes linear
n_clusters = 10
kmeans = KMeans(n_clusters)
res = kmeans.fit(data)
clusters = [[] for i in range(n_clusters)]
strand_clusters = [[] for i in range(n_clusters)]

for ind, cluster in enumerate(res.labels_):
    clusters[cluster].append(strand_ids_synthesized[strand_ids_sequenced[ind]])
    strand_clusters[cluster].append(sequenced_strands[ind])

[Counter(i) for i in clusters]

324


[Counter({'8206e19d-8964-464e-b094-7e3925dcf643': 26}),
 Counter({'8206e19d-8964-464e-b094-7e3925dcf643': 36}),
 Counter({'8bcdba4b-8e73-4909-9d48-1ed937b81bde': 46,
          '3580071c-3bf0-4bb3-9e92-a981fa95db68': 1}),
 Counter({'5fa790b2-b889-4e80-9a10-630305f75b0e': 37}),
 Counter({'5fa790b2-b889-4e80-9a10-630305f75b0e': 29,
          '8206e19d-8964-464e-b094-7e3925dcf643': 1}),
 Counter({'d9567e5e-43d3-4869-946e-5832193a26ca': 31}),
 Counter({'3580071c-3bf0-4bb3-9e92-a981fa95db68': 36,
          '8206e19d-8964-464e-b094-7e3925dcf643': 1}),
 Counter({'8bcdba4b-8e73-4909-9d48-1ed937b81bde': 32}),
 Counter({'d9567e5e-43d3-4869-946e-5832193a26ca': 24}),
 Counter({'3580071c-3bf0-4bb3-9e92-a981fa95db68': 24})]

In [21]:
original_strand_ids

['d9567e5e-43d3-4869-946e-5832193a26ca',
 '5fa790b2-b889-4e80-9a10-630305f75b0e',
 '8bcdba4b-8e73-4909-9d48-1ed937b81bde',
 '3580071c-3bf0-4bb3-9e92-a981fa95db68',
 '8206e19d-8964-464e-b094-7e3925dcf643']

In [22]:
# Try on larger data pool - some metric to quantify uniqueness in each cluster
# And then we need to merge somehow
from aligned_clustering import multiple_alignment_muscle
from cluster_merging import majority_merge
from utils import get_recovery_percentage

original_strand = original_strands[2]
cluster_strands = strand_clusters[1]

In [23]:
# Let us try aggregation first. Select one cluster and use Muscle and voting
original_strand

'TGACAACTGAAATTCCAGCGAAGCGCGGGCATGACATAGCACCCGAACCCATAAATTCTGCTACCACAGTATCGTAGAACAACTAAATCGGTGACGTGCTTATAGCGACCGCGTACACGCCTCATGGCCACACGCGAGCCCAGAGCGAGTCACGTAATTTACGGTCCCTAAGTTTGTTGTTTTTACCAGCAGGTACAATC'

In [24]:
print(f"Baseline voting percentage {get_recovery_percentage(majority_merge(strand_clusters[0]), original_strand)}")

Baseline voting percentage 0.245


In [25]:
for i in cluster_strands:
    print(align(i, original_strand).counts[0])

NameError: name 'align' is not defined

In [26]:
import random

"""
for i in range(20):
    strand_picks = random.sample(cluster_strands, 15)
    muscled = multiple_alignment_muscle(strand_picks)
    aligned_consensus = majority_merge(muscled)
    print(get_recovery_percentage(aligned_consensus, original_strand))
"""

muscled = multiple_alignment_muscle(cluster_strands)
aligned_consensus = majority_merge(muscled)
print(get_recovery_percentage(aligned_consensus, original_strand))

0.23


In [27]:
import Levenshtein

def count_ids_errors(str1, str2):
    edit_operations = Levenshtein.editops(str1, str2)
    
    insertions = sum(1 for op in edit_operations if op[0] == 'insert')
    deletions = sum(1 for op in edit_operations if op[0] == 'delete')
    substitutions = sum(1 for op in edit_operations if op[0] == 'replace')

    return {'Insertions': insertions, 'Deletions': deletions, 'Substitutions': substitutions}

identities = []
mismatches = []

for i in cluster_strands:
    print(count_ids_errors(i, original_strand))

{'Insertions': 20, 'Deletions': 28, 'Substitutions': 63}
{'Insertions': 16, 'Deletions': 28, 'Substitutions': 62}
{'Insertions': 15, 'Deletions': 54, 'Substitutions': 50}
{'Insertions': 30, 'Deletions': 33, 'Substitutions': 47}
{'Insertions': 30, 'Deletions': 28, 'Substitutions': 51}
{'Insertions': 16, 'Deletions': 36, 'Substitutions': 59}
{'Insertions': 13, 'Deletions': 38, 'Substitutions': 65}
{'Insertions': 24, 'Deletions': 25, 'Substitutions': 58}
{'Insertions': 18, 'Deletions': 31, 'Substitutions': 61}
{'Insertions': 16, 'Deletions': 39, 'Substitutions': 56}
{'Insertions': 24, 'Deletions': 30, 'Substitutions': 53}
{'Insertions': 15, 'Deletions': 41, 'Substitutions': 59}
{'Insertions': 14, 'Deletions': 30, 'Substitutions': 69}
{'Insertions': 29, 'Deletions': 31, 'Substitutions': 49}
{'Insertions': 20, 'Deletions': 30, 'Substitutions': 60}
{'Insertions': 24, 'Deletions': 33, 'Substitutions': 54}
{'Insertions': 20, 'Deletions': 34, 'Substitutions': 57}
{'Insertions': 23, 'Deletions':

In [196]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.26.1-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Downloading levenshtein-0.26.1-cp312-cp312-win_amd64.whl (98 kB)
   ---------------------------------------- 0.0/98.8 kB ? eta -:--:--
   ------------ --------------------------- 30.7/98.8 kB 640.0 kB/s eta 0:00:01
   ---------------------------------------- 98.8/98.8 kB 1.4 MB/s eta 0:00:00
Installing collected packages: Levenshtein
Successfully installed Levenshtein-0.26.1


DEPRECATION: Loading egg at c:\users\parv\anaconda3\envs\pytorch_gpu\lib\site-packages\ont_bonito-0.8.1-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [31]:

insertions = []
deletions = []
subs = []

for i, j in zip(sequenced_strands, strand_ids_sequenced):
    original_strand = original_strands[original_strand_ids.index(strand_ids_synthesized[j])]
    print(get_recovery_percentage(i, original_strand))

    errs = count_ids_errors(i, original_strand)
    print(errs)
    insertions.append(errs['Insertions'])
    deletions.append(errs['Deletions'])
    subs.append(errs['Substitutions'])
    print()

0.305
{'Insertions': 19, 'Deletions': 42, 'Substitutions': 44}

0.855
{'Insertions': 1, 'Deletions': 0, 'Substitutions': 3}

0.305
{'Insertions': 25, 'Deletions': 44, 'Substitutions': 48}

0.265
{'Insertions': 19, 'Deletions': 27, 'Substitutions': 2}

0.215
{'Insertions': 2, 'Deletions': 35, 'Substitutions': 2}

0.27
{'Insertions': 16, 'Deletions': 39, 'Substitutions': 61}

0.23
{'Insertions': 3, 'Deletions': 10, 'Substitutions': 4}

0.22
{'Insertions': 24, 'Deletions': 35, 'Substitutions': 44}

0.2
{'Insertions': 1, 'Deletions': 11, 'Substitutions': 1}

0.925
{'Insertions': 1, 'Deletions': 2, 'Substitutions': 1}

0.175
{'Insertions': 18, 'Deletions': 42, 'Substitutions': 55}

0.275
{'Insertions': 13, 'Deletions': 26, 'Substitutions': 68}

0.265
{'Insertions': 2, 'Deletions': 32, 'Substitutions': 5}

0.24
{'Insertions': 24, 'Deletions': 36, 'Substitutions': 45}

0.26
{'Insertions': 10, 'Deletions': 44, 'Substitutions': 67}

0.285
{'Insertions': 16, 'Deletions': 21, 'Substitutions': 70}

In [41]:
# Maybe if I turn adapters off?
print(np.mean(deletions)/len(original_strand) * 100)
print(np.mean(insertions)/len(original_strand) * 100)
print(np.mean(subs)/len(original_strand) * 100)

13.836419753086417
5.521604938271605
14.5679012345679


Levenshtien edit distance works pretty well. That's my metric. Kmeans works for now. I'll bring down junk reads etc, remove adapters, and test it again to see how well aggregation works.