In [None]:
!pip install Bio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from Bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.5.9 biopython-1.81 bio

In [None]:
from Bio import SeqIO
from sklearn.cluster import KMeans
import numpy as np
import warnings


In [None]:
# Initial state before any FASTA record has been read:
# longest sequence length seen so far, and the list of raw sequence strings.
max_seq_length = 0
sequences = list()

In [None]:
# Location of the input FASTA file (Colab upload directory).
FASTA_PATH = '/content/out_1.fasta'

# Read every record of the FASTA file as a plain string.
# The list is rebuilt from scratch in this cell rather than appended to a
# list created in another cell, so re-running the cell does not add the
# same records a second time.
sequences = [str(record.seq) for record in SeqIO.parse(FASTA_PATH, 'fasta')]

# Length of the longest sequence; 0 when the file contains no records.
max_seq_length = max((len(sequence) for sequence in sequences), default=0)

In [None]:
# Integer-encode the sequences into a dense (n_sequences, max_seq_length)
# matrix. A, C, G, T map to 0..3; any other character is encoded as -1, and
# positions past the end of a shorter sequence keep the -1 fill value.
# NOTE(review): these codes are fed to the clustering step as numeric
# features, which presumably makes e.g. A/T look further apart than A/C --
# confirm this ordering is acceptable (one-hot encoding would avoid it).
n_sequences = len(sequences)
nucleotides = ['A', 'C', 'G', 'T']
nucleotide_map = {nucleotide: index for index, nucleotide in enumerate(nucleotides)}
encoded_sequences = np.full((n_sequences, max_seq_length), -1, dtype=int)
for i, sequence in enumerate(sequences):
    if sequence:
        # Fill the whole row prefix in one assignment instead of writing
        # the array one element at a time.
        encoded_sequences[i, :len(sequence)] = [
            nucleotide_map.get(base, -1) for base in sequence
        ]

In [None]:
# Cluster the encoded sequences with k-means.
# The seed is fixed so the labels are reproducible, and n_init=10 runs ten
# initialisations. FutureWarning is silenced only while the model is fitted.
n_clusters = 500
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(encoded_sequences)
    # One cluster index per row of encoded_sequences.
    cluster_labels = kmeans.labels_

In [None]:
# Group the original sequence strings by assigned cluster label.
# Every cluster id gets an entry up front, so clusters that received no
# sequence still appear with an empty list.
clusters = dict((cluster_index, []) for cluster_index in range(n_clusters))
for position in range(len(cluster_labels)):
    assigned = cluster_labels[position]
    clusters[assigned].append(sequences[position])


In [None]:
# Print each cluster (1-based id), its member sequences, and its size.
# The loop variable is deliberately not called `sequences`: reusing that name
# would overwrite the notebook-level list of all sequences with the members
# of the last cluster, so re-running the encoding or grouping cells
# afterwards would silently work on the wrong data.
for cluster_id, cluster_sequences in clusters.items():
    cluster_size = len(cluster_sequences)
    print(f'Cluster {cluster_id + 1}:')
    for sequence in cluster_sequences:
        print(sequence)
    print(f'Sequence Count: {cluster_size}\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
GCTGGTTCTCCTCGAAAGCTATTTAGGTAGCGCCTCATGTACCACTGTAGGGGGTAGAGCACT
GCTGGTTCTCCTCGAAAGCTATTTAGGTAGCGCCTCATGTATCACTGTAGGGGGTAGAGCACT
GCTGGTTCTCCTCGAAAGCTATTTAGGTAGCGCCTCATGTATCACTGTAGGGGGTAG
Sequence Count: 12

Cluster 137:
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGCAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGCAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGCAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGCAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGTAG
TTCTTGACGACCATAGAGCATTGGAACCACCTGATCCCATCCCGAACTCAGCAG
T