## Testing recovery using heirarchal clustering on barcoded sequencing data 
https://github.com/ImperialCollegeLondon/sequencingData/tree/main/200125%20-%20Full-Circle%20C000121

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import utils
import random
import heirarchal_clustering
import kmeans_clustering
import numpy as np
from collections import Counter
from clustering import kmerDNA
from Levenshtein import ratio
from sklearn.preprocessing import normalize
import aligned_clustering
import cluster_merging
import matplotlib.pyplot as plt

In [12]:
def get_edit_distance_matrix(strands):
    """
    Returns the edit distance matrix for the strands
    O(n^2)
    """
    n_strands = len(strands)
    edit_distance_matrix = np.zeros([n_strands, n_strands])
    for i in range(n_strands - 1):
        for j in range(i + 1, n_strands):
            edit_distance_matrix[i,j] = edit_distance_matrix[j, i] = ratio(strands[i], strands[j])

    return edit_distance_matrix

def calculate_centroid(strands: list[str]):
    edit_distance_matrix = get_edit_distance_matrix(strands)

    distances = [sum(edit_distance_matrix[i, :]) for i in range(len(edit_distance_matrix))]
    return strands[distances.index(min(distances))]

## Loading the fastq files

In [4]:
example_fastq_filepath = r"C:\Users\Parv\Doc\RA\Projects\sequencingData\200125 - Full-Circle C000121\Sample1\Sample1.fastq\Sample1.fastq"

sequenced_strands = utils.get_fastq_records(fastq_filepath=example_fastq_filepath)

In [5]:
fastq_filepaths_common_string = r"C:\Users\Parv\Doc\RA\Projects\sequencingData\200125 - Full-Circle C000121\Sample{a}\Sample{a}.fastq\Sample{a}.fastq"
fastq_filepaths = [fastq_filepaths_common_string.format(a=i) for i in range(1, 7)]
fastq_filepaths = [fastq_filepaths[0], fastq_filepaths[2], fastq_filepaths[5]]

In [6]:
sequenced_strands_arr = [utils.get_fastq_records(fastq_filepath=i) for i in fastq_filepaths]

In [7]:

# Loading original strands from the readme
# Selecting only the 1st, 3rd and 5th sample

original_strands = [
    "AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG",
    "TCGAAAGTGGAGCCGCGGCGACACTCATCTGCTATACAGTAGCTATACGACGATATGACGTGAGCGCTGACGGACCGGCGCTCAACTGGC",
    "AGTGCAACAAGTCAATCCGTTTCCCCAAGGAGGCCTCCTGGAACAATGAATTATGGCGCCAAGACATGGGGGATCCTAACTGGGGCGCCGACCTGGAGAAACGATCCGGAGGTGCCAGGATCGTCTCTGGAACGCTCCGAAAGTCTTGTT"
]

# Length filtering
sequenced_strands_arr = [[i for i in j if len(i) > len(original_strands[k]) - 10 and len(i) < len(original_strands[k]) + 10] for k, j in enumerate(sequenced_strands_arr)]

strand_pool = [x for j in sequenced_strands_arr for x in j]
random.shuffle(strand_pool)

In [8]:
sequenced_strands = [str(i.seq) for i in sequenced_strands]

In [9]:
import cProfile
import re

clusters, centroids = heirarchal_clustering.cluster_trivial(sequenced_strands, similarity_threshold=0.95, use_centroids=False)

Total strands 14051


14051it [00:02, 6181.39it/s]


Levenshtien distance just may not be fast enough to scale to all of these reads. I might need to figure out how to do automata. Hmm, or do kmeans?

In [10]:

clustered_seqs = [[str(strand_pool[ind].seq) for ind in j] for j in clusters]

In [15]:

original_strand_guessed_best = []
counter = 0

for ind, i in enumerate(centroids):
    
    best_guessed = 0
    best_rec = 0.0
    for k, original_strand in enumerate(original_strands):
        rec = ratio(i, original_strand)

        if rec > best_rec:
            best_guessed = k
            best_rec = rec

    original_strand_guessed_best.append(best_guessed)
    if len(clusters[ind]) > 30:
        print(f"{best_guessed} {best_rec} recovered by {ind}".format())

        centroid = calculate_centroid(clustered_seqs[ind])
        print(ratio(original_strands[best_guessed], centroid))
        #print(len(i))
        print(f"{len(clusters[ind])} elements in the cluster")
        print(f"{np.mean([len(i) for i in clustered_seqs[ind]])} mean length of strand in cluster")
        print(f"{np.std([len(i) for i in clustered_seqs[ind]])} mean std of strand in cluster")
        counter += 1
        print()

print(counter)

0 0.6622516556291391 recovered by 0
0.6164383561643836
3222 elements in the cluster
126.26070763500931 mean length of strand in cluster
31.161983463888493 mean std of strand in cluster

1 0.6403940886699507 recovered by 1
0.6134969325153374
111 elements in the cluster
126.21621621621621 mean length of strand in cluster
31.657060298834786 mean std of strand in cluster

0 0.9866666666666667 recovered by 3
0.6620689655172414
3347 elements in the cluster
126.90319689273977 mean length of strand in cluster
30.912990433313 mean std of strand in cluster

0 0.7246376811594203 recovered by 4
0.9536423841059603
32 elements in the cluster
127.9375 mean length of strand in cluster
30.168213300591734 mean std of strand in cluster

0 0.7978723404255319 recovered by 5
0.9801324503311258
123 elements in the cluster
123.32520325203252 mean length of strand in cluster
31.604130617898036 mean std of strand in cluster

0 0.7226890756302521 recovered by 6
0.6622516556291391
119 elements in the cluster
125.

In [432]:
for k, guess in enumerate(guesses):
    for i, original_strand in enumerate(original_strands):
        rec = utils.get_recovery_percentage(guess, original_strand)
        print("{} is a guess for {} with {}".format(k, i, rec))


0 is a guess for 0 with 0.21333333333333335
0 is a guess for 1 with 1.0
0 is a guess for 2 with 0.12
1 is a guess for 0 with 0.28
1 is a guess for 1 with 0.35555555555555557
1 is a guess for 2 with 0.12666666666666668
2 is a guess for 0 with 0.26666666666666666
2 is a guess for 1 with 0.18888888888888888
2 is a guess for 2 with 1.0
3 is a guess for 0 with 0.32
3 is a guess for 1 with 0.24444444444444444
3 is a guess for 2 with 0.22666666666666666
4 is a guess for 0 with 1.0
4 is a guess for 1 with 0.17777777777777778
4 is a guess for 2 with 0.13333333333333333


In [433]:
guess_clusters = heirarchal_clustering.cluster_trivial(guesses, similarity_threshold=0.9)

5it [00:00, ?it/s]


In [434]:
guess_clusters

{'TCGAAAGTGGAGCCGCGGCGACACTCATCTGCTATACAGTAGCTATACGACGATATGACGTGAGCGCTGACGGACCGGCGCTCAACTGGCA': [0],
 'GCCAGTTGAGCGCCGGTCCGTCAGCGCTCACGTCATATCGTCGTATAGCTACTGTATAGCAGATGAGTGTCGCCGCGGCTCCACTTTCGAA': [1],
 'AGTGCAACAAGTCAATCCGTTTCCCCAAGGAGGCCTCCTGGAACAATGAATTATGGCGCCAAGACATGGGGGATCCTAACTGGGGCGCCGACCTGGAGAAACGATCCGGAGGTGCCAGGATCGTCTCTGGAACGCTCCGAAAGTCTTGTTA': [2],
 'AACAAGACTTTCGGAGCGTTCCAGAGACGATCCTGGCACCTCCGGATCGTTTCTCCAGGTCGGCGCCCCAGTTAGGATCCCCCATGTCTTGGCGCCATAATTCATTGTTCCAGGAGGCCTCCTTGGGGAAACGGATTGACTTGTTGCACTA': [3],
 'AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTGA': [4]}