In [1]:
import os
import random

import numpy as np
import pandas as pd
from numba import njit, prange, jit

import evoVAE.utils.metrics as mt
import evoVAE.utils.seq_tools as st



# GB1 clustering 

In [None]:
# Load the pickled GB1 ancestor + extant alignment, keep one row per unique
# sequence, and discard the stored "encoding" and "weights" columns.
# The two drops are chained instead of using inplace=True, so no frame that was
# just derived from another one is mutated in place.
# NOTE(review): the absolute path is machine-specific; consider a configurable data dir.
aln: pd.DataFrame = (
    pd.read_pickle("/Users/sebs_mac/uni_OneDrive/honours/data/gb1/encoded_weighted/gb1_ancestors_extants_encoded_weighted_no_dupes.pkl")
    .drop_duplicates(subset=["sequence"])
    .drop(columns=["encoding", "weights"])
)
#aln = aln.sample(frac=0.2)
print(aln.shape)
aln.head()  # NOTE: not rendered - only the last expression of a cell is displayed

# Convert the alignment to a numpy array; the two extra return values are kept
# under the names this cell has always bound them to.
msa, seq_key, key_label = st.convert_msa_numpy_array(aln)
msa.shape

In [None]:
# Write the de-duplicated GB1 alignment to a FASTA file in the working directory.
# Presumably this is the input to the external clustering run whose TSV is read
# in the next cell - confirm.
st.write_fasta_file("gb1_ancestors_extants_no_dupes.fasta", aln)

In [None]:
# Cluster assignments: a headerless, tab-separated file with two columns,
# named here "cluster" and "sequence".
results = pd.read_csv("gb1_an_ex_cluster.tsv", sep="\t", header=None)
results.columns = ["cluster", "sequence"]


def mark_ancestors(x):
    """Return 1 when the id ``x`` contains the substring "tree", otherwise 0."""
    return int("tree" in x)


# Flag every member sequence whose id contains "tree" as an ancestor.
is_ancestor = results["sequence"].apply(mark_ancestors)
results["is_ancestor"] = is_ancestor

results

In [None]:
# Distinct values of the "cluster" column, in order of first appearance.
representative_ids = pd.unique(results["cluster"])
representative_ids.shape

In [None]:
# One DataFrame of member rows per cluster, in the order of representative_ids.
clusters = [
    results[results["cluster"].eq(rep_id)]
    for rep_id in representative_ids
]

In [None]:
# Draw replicate samples of the clustered GB1 sequences at several extant
# proportions and write each sample to its own FASTA file.
SAMPLE_SIZE = 7000
N_REPLICATES = 5  # replicates per extant proportion (file suffixes r1..r5)
extant_proportions = [0.22, 0.17, 0.12, 0.07, 0.02, 0.0]

# Every FASTA below is written into ./clusters; create it up front so a run on
# a fresh checkout does not fail on the first write.
os.makedirs("./clusters", exist_ok=True)

# Seeds Python's `random` module only.
# NOTE(review): this makes the sampling reproducible only if
# st.sample_extant_ancestors draws from `random` - confirm in seq_tools.
random.seed(42)

for p in extant_proportions:
    for r in range(1, N_REPLICATES + 1):
        sample_ids = st.sample_extant_ancestors(clusters, SAMPLE_SIZE, extant_proportion=p)
        # Keep only the alignment rows whose id was sampled.
        sample_seqs = aln.loc[aln["id"].isin(sample_ids)]
        st.write_fasta_file(f"./clusters/gb1_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta", sample_seqs)

In [None]:
# Re-read every sampled FASTA, compute sequence weights for it, and pickle the
# alignment together with the new "weights" column.
# The 0.2 passed to reweight_by_seq_similarity is presumably its similarity
# threshold - confirm in seq_tools.
for p in extant_proportions:
    for r in range(1, 6):
        fasta_path = f"./clusters/gb1_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta"
        pickle_path = f"./clusters/gb1_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}_encoded_weighted.pkl"

        samp_aln = st.read_aln_file(fasta_path)
        numpy_aln, _, _ = st.convert_msa_numpy_array(samp_aln)

        weights = st.reweight_by_seq_similarity(numpy_aln, 0.2)
        samp_aln["weights"] = weights

        samp_aln.to_pickle(pickle_path)


In [None]:
# Spot-check one pickled replicate: show only the rows whose id does not
# contain "tree" (the rows the cells above would not flag as ancestors).
test = pd.read_pickle("/Users/sebs_mac/uni_OneDrive/honours/data/gb1/mmseqs_clustering/replicate_encoded_weighted/gb1_ancestors_extants_no_dupes_clustered_r1_extant_0.12_encoded_weighted.pkl")
test[~test["id"].str.contains("tree")]

# A4 human clustering 

In [25]:
# Cluster assignments for A4: a headerless, tab-separated file with two
# columns, named here "cluster" and "sequence".
clustering_results = pd.read_csv("/Users/sebs_mac/uni_OneDrive/honours/data/a4_human/mmseqs_clustering/a4_an_ex_cluster.tsv", sep="\t", header=None)
clustering_results.columns = ["cluster", "sequence"]


def mark_ancestors(x):
    """Return 1 when the id ``x`` contains the substring "tree", otherwise 0."""
    return int("tree" in x)


# Flag every member sequence whose id contains "tree" as an ancestor.
is_ancestor = clustering_results["sequence"].apply(mark_ancestors)
clustering_results["is_ancestor"] = is_ancestor

# Distinct values of the "cluster" column, in order of first appearance.
representative_ids = clustering_results["cluster"].unique()

clustering_results


Unnamed: 0,cluster,sequence,is_ancestor
0,N1147_a4_tree_0,N1147_a4_tree_0,1
1,N1147_a4_tree_0,N1140_a4_tree_0,1
2,N1147_a4_tree_0,N1358_a4_tree_1,1
3,N1147_a4_tree_0,N1362_a4_tree_1,1
4,N1147_a4_tree_0,N1387_a4_tree_4,1
...,...,...,...
32120,N3313_a4_tree_0,N3907_a4_tree_30,1
32121,N3313_a4_tree_0,UniRef100_UPI001F080ADC/14-747,0
32122,N3313_a4_tree_0,UniRef100_A0A8C9ZFS8/9-558,0
32123,N3313_a4_tree_0,UniRef100_A0A8C9ZCJ7/7-555,0


In [4]:
# Read the A4 extant alignment without encoding it at load time, then compute
# encodings and weights in one call and attach both as new columns.
# The 0.2 argument is presumably the similarity threshold - confirm in seq_tools.
aln = st.read_aln_file("/Users/sebs_mac/uni_OneDrive/honours/data/a4_human/alns/a4_extants.fasta", encode=False)
encodings, weights = st.encode_and_weight_seqs(aln, 0.2)

aln = aln.assign(encoding=encodings, weights=weights)

Reading the alignment: /Users/sebs_mac/uni_OneDrive/honours/data/a4_human/alns/a4_extants.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Number of seqs: 5230


In [41]:
# Show only the cluster members that are not flagged as ancestors.
clustering_results.loc[clustering_results["is_ancestor"].eq(0)]

Unnamed: 0,cluster,sequence,is_ancestor
29,N1147_a4_tree_0,UniRef100_A0A7N8YDR4/6-565,0
30,N1147_a4_tree_0,UniRef100_A0A7N6BYD0/5-552,0
31,N1147_a4_tree_0,UniRef100_A0A3Q1JL59/5-546,0
32,N1147_a4_tree_0,UniRef100_A0A7N9ATZ4/6-548,0
1179,N1681_a4_tree_0,UniRef100_UPI000E1D8C12/26-745,0
...,...,...,...
32081,N3312_a4_tree_0,UniRef100_A0A672J9D9/8-550,0
32121,N3313_a4_tree_0,UniRef100_UPI001F080ADC/14-747,0
32122,N3313_a4_tree_0,UniRef100_A0A8C9ZFS8/9-558,0
32123,N3313_a4_tree_0,UniRef100_A0A8C9ZCJ7/7-555,0


In [37]:
# One DataFrame of member rows per cluster, in the order of representative_ids.
clusters = [
    clustering_results[clustering_results["cluster"].eq(rep_id)]
    for rep_id in representative_ids
]
len(clusters)

83

In [42]:
# Draw five replicate samples of the clustered A4 sequences per extant
# proportion and write each sample to its own FASTA file.
SAMPLE_SIZE = 10_000
extant_proportions = [0.2185, 0.15, 0.1, 0.05, 0.0]

# Full ancestor + extant alignment, loaded without encoding.
aln = st.read_aln_file("/Users/sebs_mac/uni_OneDrive/honours/data/a4_human/alns/a4_ancestors_extants_no_dupes.fasta", encode=False)

# Seeds Python's `random` module only.
# NOTE(review): reproducible only if st.sample_extant_ancestors draws from `random` - confirm.
random.seed(42)

for p in extant_proportions:
    for r in range(1, 6):
        sample_ids = st.sample_extant_ancestors(
            clusters,
            SAMPLE_SIZE,
            extant_proportion=p,
        )
        # Keep only the alignment rows whose id was sampled.
        sample_seqs = aln[aln["id"].isin(sample_ids)]
        st.write_fasta_file(f"./a4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta", sample_seqs)

Reading the alignment: /Users/sebs_mac/uni_OneDrive/honours/data/a4_human/alns/a4_ancestors_extants_no_dupes.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Number of seqs: 32125


In [43]:
# Re-read every sampled A4 FASTA, compute sequence weights for it, and pickle
# the alignment together with the new "weights" column.
# The 0.2 passed to reweight_by_seq_similarity is presumably its similarity
# threshold - confirm in seq_tools.
for p in extant_proportions:
    for r in range(1, 6):
        fasta_path = f"./a4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta"
        pickle_path = f"a4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}_encoded_weighted.pkl"

        samp_aln = st.read_aln_file(fasta_path)
        numpy_aln, _, _ = st.convert_msa_numpy_array(samp_aln)

        weights = st.reweight_by_seq_similarity(numpy_aln, 0.2)
        samp_aln["weights"] = weights

        samp_aln.to_pickle(pickle_path)

Reading the alignment: ./a4_ancestors_extants_no_dupes_clustered_r1_extant_0.2185.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 10000
Sequence weight numpy array created with shape (num_seqs, columns):  (10000, 770)
Reading the alignment: ./a4_ancestors_extants_no_dupes_clustered_r2_extant_0.2185.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 10000
Sequence weight numpy array created with shape (num_seqs, columns):  (10000, 770)
Reading the alignment: ./a4_ancestors_extants_no_dupes_clustered_r3_extant_0.2185.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 10000
Sequence weight numpy array created with shape (num_seqs, columns):  (10000, 770)
Reading the alignment: ./a4_ancestors_extants_no_dupes_clustered_r4_extant_0.2185.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Numb

# GCN4 clustering

In [57]:
# Cluster assignments for GCN4: a headerless, tab-separated file with two
# columns, named here "cluster" and "sequence".
clustering_results = pd.read_csv("/Users/sebs_mac/uni_OneDrive/honours/data/gcn4/mmseqs_clustering/gcn4_an_ex_cluster.tsv", sep="\t", header=None)
clustering_results.columns = ["cluster", "sequence"]


def mark_ancestors(x):
    """Return 1 when the id ``x`` contains the substring "tree", otherwise 0."""
    return int("tree" in x)


# Flag every member sequence whose id contains "tree" as an ancestor.
is_ancestor = clustering_results["sequence"].apply(mark_ancestors)
clustering_results["is_ancestor"] = is_ancestor

# Distinct values of the "cluster" column, in order of first appearance.
representative_ids = clustering_results["cluster"].unique()

clustering_results


Unnamed: 0,cluster,sequence,is_ancestor
0,UniRef100_H2B208/41-248,UniRef100_H2B208/41-248,0
1,UniRef100_H2B208/41-248,UniRef100_J7RCV7/44-270,0
2,UniRef100_H2B208/41-248,N334_gcn4_tree_1,1
3,UniRef100_H2B208/41-248,N35_gcn4_tree_7,1
4,UniRef100_H2B208/41-248,N32_gcn4_tree_9,1
...,...,...,...
11899,N316_gcn4_tree_12,N296_gcn4_tree_12,1
11900,N315_gcn4_tree_11,N315_gcn4_tree_11,1
11901,N316_gcn4_tree_10,N316_gcn4_tree_10,1
11902,N315_gcn4_tree_86,N315_gcn4_tree_86,1


In [58]:
# Split the table into one DataFrame of member rows per cluster,
# in the order of representative_ids.
clusters = [
    clustering_results[clustering_results["cluster"].eq(rep_id)]
    for rep_id in representative_ids
]
len(clusters)

76

In [55]:
# remove duplicates (one-off preprocessing, left commented out)

# aln = st.read_aln_file("/Users/sebs_mac/uni_OneDrive/honours/data/gcn4/alns/gcn4_ancestors_extants.fasta", encode=False)
# aln = aln.drop_duplicates(subset=["sequence"])
# aln
# st.write_fasta_file("/Users/sebs_mac/uni_OneDrive/honours/data/gcn4/alns/gcn4_ancestors_extants_no_dupes.fasta", aln)

Reading the alignment: /Users/sebs_mac/uni_OneDrive/honours/data/gcn4/alns/gcn4_extants.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Number of seqs: 348


In [63]:

# Draw five replicate samples of the clustered GCN4 sequences per extant
# proportion and write each sample to its own FASTA file.
SAMPLE_SIZE = 5_000
extant_proportions = [0.0662, 0.05, 0.025, 0.01, 0.0]

# Full ancestor + extant alignment, loaded without encoding.
aln = st.read_aln_file("/Users/sebs_mac/uni_OneDrive/honours/data/gcn4/alns/gcn4_ancestors_extants_no_dupes.fasta", encode=False)

# Seeds Python's `random` module only.
# NOTE(review): reproducible only if st.sample_extant_ancestors draws from `random` - confirm.
random.seed(42)

for p in extant_proportions:
    for r in range(1, 6):
        sample_ids = st.sample_extant_ancestors(
            clusters,
            SAMPLE_SIZE,
            extant_proportion=p,
        )
        # Keep only the alignment rows whose id was sampled.
        sample_seqs = aln[aln["id"].isin(sample_ids)]
        st.write_fasta_file(f"./gcn4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta", sample_seqs)



Reading the alignment: /Users/sebs_mac/uni_OneDrive/honours/data/gcn4/alns/gcn4_ancestors_extants_no_dupes.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Number of seqs: 11904


In [64]:
# Re-read every sampled GCN4 FASTA, compute sequence weights for it, and pickle
# the alignment together with the new "weights" column.
# The 0.2 passed to reweight_by_seq_similarity is presumably its similarity
# threshold - confirm in seq_tools.
for p in extant_proportions:
    for r in range(1, 6):
        fasta_path = f"./gcn4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta"
        pickle_path = f"gcn4_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}_encoded_weighted.pkl"

        samp_aln = st.read_aln_file(fasta_path)
        numpy_aln, _, _ = st.convert_msa_numpy_array(samp_aln)

        weights = st.reweight_by_seq_similarity(numpy_aln, 0.2)
        samp_aln["weights"] = weights

        samp_aln.to_pickle(pickle_path)

Reading the alignment: ./gcn4_ancestors_extants_no_dupes_clustered_r1_extant_0.0662.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 5000
Sequence weight numpy array created with shape (num_seqs, columns):  (5000, 281)
Reading the alignment: ./gcn4_ancestors_extants_no_dupes_clustered_r2_extant_0.0662.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 5000
Sequence weight numpy array created with shape (num_seqs, columns):  (5000, 281)
Reading the alignment: ./gcn4_ancestors_extants_no_dupes_clustered_r3_extant_0.0662.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 5000
Sequence weight numpy array created with shape (num_seqs, columns):  (5000, 281)
Reading the alignment: ./gcn4_ancestors_extants_no_dupes_clustered_r4_extant_0.0662.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Nu

# Alternate clustering: incomplete

In [None]:

# Load a small test alignment and convert it to a numpy array for adj_matrix.
# NOTE: this rebinds `test`, which an earlier cell used for a pickled GB1 replicate.
test = st.read_aln_file("../data/pair_test.aln")
test_msa, _, _ = st.convert_msa_numpy_array(test)

@njit(parallel=True)
def adj_matrix(msa) -> np.ndarray:
    """Build a symmetric pairwise similarity matrix for an alignment.

    Entry ``[i, j]`` (for ``i != j``) is
    ``1 - mt.hamming_distance(msa[i], msa[j]) / seq_len``; the diagonal stays
    at 1.0 from the initial ``np.ones`` fill.

    Parameters
    ----------
    msa
        Array with one sequence per row; ``msa.shape[0]`` rows and
        ``msa.shape[1]`` positions per row.

    Returns
    -------
    np.ndarray
        Array of shape ``(msa.shape[0], msa.shape[0])``.

    Notes
    -----
    NOTE(review): dividing by ``seq_len`` assumes ``mt.hamming_distance``
    returns a mismatch count rather than a fraction, and compiling in nopython
    mode requires that helper to be numba-compatible - confirm both in
    ``evoVAE.utils.metrics``.
    """
    num_seqs = msa.shape[0]
    # Read the row length from the shape so row 0 is never indexed when the
    # alignment is empty.
    seq_len = msa.shape[1]

    sim_matrix = np.ones((num_seqs, num_seqs))

    # Only the outer loop is parallel: numba serializes a prange nested inside
    # another prange, so the inner loop is written as a plain range.
    for i in prange(num_seqs):
        for j in range(i + 1, num_seqs):
            similarity = 1 - (mt.hamming_distance(msa[i], msa[j]) / seq_len)
            # Each unordered pair (i, j) is visited once and mirrored.
            sim_matrix[i, j] = similarity
            sim_matrix[j, i] = similarity

    return sim_matrix

# Run adj_matrix on the small test alignment; the returned matrix is the cell output.
adj_matrix(test_msa)


In [None]:
# Similarity matrix for whatever `msa` is currently bound to.
# NOTE(review): the only visible assignment of `msa` is in the GB1 loading cell,
# so this relies on that cell having been run - confirm the intended input.
mat = adj_matrix(msa)

In [None]:
# NOTE(review): `clustering` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel. Presumably it was a fitted clustering
# object from a since-removed cell - restore that cell or delete this one.
clustering.n_leaves_

In [18]:
# NOTE(review): neither `data` nor `seqs` is defined in the visible notebook.
# `data` is concatenated with a file name here, so it is presumably a directory
# prefix string - confirm, and define both names before this cell.
st.write_fasta_file(data+"a4_ancestors_extants_no_dupes.fasta", seqs)

In [60]:
# Display `data`. The saved output is a table with id, sequence and encoding
# columns, whereas the cell above concatenates `data` with a string.
# NOTE(review): the name appears to be reused for two different kinds of value.
data

Unnamed: 0,id,sequence,encoding
0,N0_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,N1_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,N2_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,N3_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,N4_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...
5223,N5223_a4_tree_2,----------------------------------------------...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5224,N5224_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5225,N5225_a4_tree_2,----------------------------------------------...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5226,N5226_a4_tree_2,----LALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
