## Testing recovery using hierarchical clustering on barcoded sequencing data
https://github.com/ImperialCollegeLondon/sequencingData/tree/main/200125%20-%20Full-Circle%20C000121

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [257]:
import utils
import random
import heirarchal_clustering
import kmeans_clustering
import numpy as np
from collections import Counter
from clustering import kmerDNA
from Levenshtein import ratio
from sklearn.preprocessing import normalize
import aligned_clustering
import cluster_merging

## Loading the fastq files

In [3]:
# Load the reads of a single sample (Sample1) as a first example.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at a
# local copy of the dataset before re-running.
example_fastq_filepath = r"C:\Users\Parv\Doc\RA\Projects\sequencingData\200125 - Full-Circle C000121\Sample1\Sample1.fastq\Sample1.fastq"

# NOTE(review): `sequenced_strands` is not referenced by any later cell visible in this notebook.
sequenced_strands = utils.get_fastq_records(fastq_filepath=example_fastq_filepath)

In [141]:
# Path template shared by every per-sample FASTQ file; {a} is the sample number.
fastq_filepaths_common_string = r"C:\Users\Parv\Doc\RA\Projects\sequencingData\200125 - Full-Circle C000121\Sample{a}\Sample{a}.fastq\Sample{a}.fastq"

# Only samples 1, 3 and 6 are used in this analysis, in that order.
fastq_filepaths = [
    fastq_filepaths_common_string.format(a=sample_number)
    for sample_number in (1, 3, 6)
]

In [185]:
# Read every selected FASTQ file; the result holds one collection of reads per
# sample, in the same order as `fastq_filepaths`.
sequenced_strands_arr = [
    utils.get_fastq_records(fastq_filepath=sample_path)
    for sample_path in fastq_filepaths
]

In [219]:
# Down-sample the third selected sample to at most 15000 randomly chosen reads.
# The size is capped at the number of available reads because random.sample
# raises ValueError when asked for more items than the population contains.
# NOTE(review): no random seed is set, so the chosen subset differs between runs.
sequenced_strands_arr[2] = random.sample(
    sequenced_strands_arr[2],
    min(15000, len(sequenced_strands_arr[2])),
)

In [221]:

# Reference (original) strands, copied from the dataset README.
# One strand per selected sample: samples 1, 3 and 6, matching the indices
# (0, 2, 5) picked when building `fastq_filepaths`.
# NOTE(review): this comment previously said "5th" sample; confirm against the README.

original_strands = [
    "AGTGTCTGTGACCAGTACGACCCAGTACCGTCACGGTTAGGAAGCTCCTCGCTTCTTAGCCGTCACGCCAAAGTG",
    "TCGAAAGTGGAGCCGCGGCGACACTCATCTGCTATACAGTAGCTATACGACGATATGACGTGAGCGCTGACGGACCGGCGCTCAACTGGC",
    "AGTGCAACAAGTCAATCCGTTTCCCCAAGGAGGCCTCCTGGAACAATGAATTATGGCGCCAAGACATGGGGGATCCTAACTGGGGCGCCGACCTGGAGAAACGATCCGGAGGTGCCAGGATCGTCTCTGGAACGCTCCGAAAGTCTTGTT"
]

# Flatten the per-sample reads into one pool, keeping sample order.
strand_pool = [x for j in sequenced_strands_arr for x in j]
# Shuffling of the pool is currently disabled.
#random.shuffle(strand_pool)

Levenshtein distance may simply not be fast enough to scale to all of these reads. I might need to work out an automata-based approach — or use k-means instead?

In [222]:
# Compute k-mer frequency counts over the pooled reads using the project helper.
# The recorded run processed 47,419 items in about 42 seconds.
freq_counts = kmeans_clustering.get_kmer_frequency_counts(strand_pool)

47419it [00:42, 1121.20it/s]


In [223]:

# Normalise the frequency counts before clustering.
# NOTE(review): the normalisation scheme lives in kmeans_clustering and is not visible here.
normalised_freq_counts = kmeans_clustering.normalise_frequency_counts(freq_counts)

In [224]:
# Three clusters: one per sample loaded (and one per entry in `original_strands`).
n_clusters = 3
# NOTE(review): presumably returns a fitted clustering model; a later cell reads
# its `labels_` attribute to assign each read to a cluster.
clusters = kmeans_clustering.cluster(normalised_freq_counts, n_clusters)

In [227]:
# Separate the pooled reads into their assigned clusters.
# TODO: a metric is still needed to evaluate the clustering - the barcode tags
# collected here are one candidate.
labels = list(clusters.labels_)

# Positions (indices into strand_pool) of the reads belonging to each cluster.
positions_by_cluster = [
    [position for position, label in enumerate(labels) if label == cluster_id]
    for cluster_id in range(n_clusters)
]

# Ninth whitespace-separated token of each read's description, grouped per cluster.
clustered_seq_ids = [
    [strand_pool[position].description.split()[8] for position in positions]
    for positions in positions_by_cluster
]
# The read records themselves, grouped per cluster.
clustered_seqs = [
    [strand_pool[position] for position in positions]
    for positions in positions_by_cluster
]

In [226]:
# Tally the barcode tags found in each cluster.
# The tags are stored in `clustered_seq_ids`; `clustered_seqs` holds the read
# records themselves, so counting it would not give a per-barcode breakdown.
[Counter(barcode_tags) for barcode_tags in clustered_seq_ids]

[Counter({'barcode=barcode27': 18064,
          'barcode=barcode25': 10409,
          'barcode=barcode30': 845}),
 Counter({'barcode=barcode30': 14061,
          'barcode=barcode25': 15,
          'barcode=barcode27': 2}),
 Counter({'barcode=barcode25': 3627,
          'barcode=barcode27': 302,
          'barcode=barcode30': 94})]

In [246]:
# Spot check: align a single read from cluster 1 against the third reference strand.
# NOTE(review): with identity=True the helper returned a fraction (about 0.46 in the
# recorded run); confirm its exact meaning in utils.align.
utils.align(str(clustered_seqs[1][6].seq), original_strands[2], identity=True)

0.45933014354066987

In [269]:

# Run 10 independent trials on cluster 1: draw 5 random reads, align them with
# MUSCLE, merge the alignment into a majority-vote consensus, and score that
# consensus against the third reference strand.

cluster_reads = clustered_seqs[1]
reference_strand = original_strands[2]

for trial in range(10):
    sampled_seqs = random.sample(cluster_reads, 5)
    read_strings = [str(record.seq) for record in sampled_seqs]
    muscled = aligned_clustering.multiple_alignment_muscle(read_strings)
    consensus_strand = cluster_merging.majority_merge(muscled)
    # First value: alignment score with identity=True; second: recovery percentage.
    print(utils.align(consensus_strand, reference_strand, identity=True))
    print(utils.get_recovery_percentage(consensus_strand, reference_strand))
    print()

1.0
1.0

0.9933774834437086
0.9933333333333333

0.4897959183673469
0.22

0.4948453608247423
0.22666666666666666

0.5204081632653061
0.44666666666666666

0.6304347826086957
0.54

0.4948453608247423
0.22666666666666666

1.0
1.0

0.9867549668874173
0.8066666666666666

1.0
1.0



In [274]:
# Try the junk-read filter on the first 10,000 reads of cluster 0.
heirarchal_clustering.filter_junk_reads(clustered_seqs[0][:10000])

# The recorded run was interrupted by hand after 2 of 10,000 iterations
# (about 1.4 s each, roughly 4 hours estimated). The author attributes the cost
# to the O(n^2) behaviour of the filter.

  0%|          | 2/10000 [00:02<3:54:49,  1.41s/it]


KeyboardInterrupt: 