In [1]:
import os
import random

import numpy as np
import pandas as pd
from numba import jit, njit, prange
from sklearn.cluster import AgglomerativeClustering

import evoVAE.utils.metrics as mt
import evoVAE.utils.seq_tools as st



In [2]:
# Load the pickled alignment, keep one row per unique sequence, and drop the
# pre-computed "encoding"/"weights" columns (the output below shows 2 columns remain).
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
aln: pd.DataFrame = (
    pd.read_pickle("/Users/sebs_mac/uni_OneDrive/honours/data/gb1/encoded_weighted/gb1_ancestors_extants_encoded_weighted_no_dupes.pkl")
    .drop_duplicates(subset=["sequence"])
    .drop(columns=["encoding", "weights"])
)
# For quick experiments the frame can be subsampled here, e.g. with sample(frac=0.2).
print(aln.shape)

# Convert the alignment to a numpy array with one row per sequence and one column
# per alignment position (per the helper's log message). seq_key / key_label are
# the helper's other two return values — TODO document their meaning.
msa, seq_key, key_label = st.convert_msa_numpy_array(aln)
msa.shape

(14276, 2)
Sequence weight numpy array created with shape (num_seqs, columns):  (14276, 448)


(14276, 448)

In [41]:
# Export the de-duplicated alignment as FASTA.
# NOTE(review): this cell carries execution count 41, i.e. it was run out of order.
# Presumably the file feeds the external clustering run that produced
# gb1_an_ex_cluster.tsv — confirm and record the command used.
st.write_fasta_file("gb1_ancestors_extants_no_dupes.fasta", aln)

In [3]:
def mark_ancestors(x):
    """Return 1 if the sequence id contains "tree", otherwise 0.

    In the table shown below, ids such as N21_gb1_tree_1 are flagged 1 and
    UniRef100_* ids are flagged 0.
    """
    return 1 if "tree" in x else 0


# Header-less, tab-separated cluster table; the two columns are named here as
# cluster id and member sequence id.
results = pd.read_csv("gb1_an_ex_cluster.tsv", sep="\t", header=None)
results.columns = ["cluster", "sequence"]

# Flag every member row using the id-based rule above.
is_ancestor = results["sequence"].map(mark_ancestors)
results["is_ancestor"] = is_ancestor

results

Unnamed: 0,cluster,sequence,is_ancestor
0,N21_gb1_tree_1,N21_gb1_tree_1,1
1,N21_gb1_tree_1,N22_gb1_tree_1,1
2,N21_gb1_tree_1,N23_gb1_tree_1,1
3,N21_gb1_tree_1,N24_gb1_tree_1,1
4,N21_gb1_tree_1,N25_gb1_tree_1,1
...,...,...,...
14271,N1356_gb1_tree_1,UniRef100_L7ZAY0/5-397,0
14272,N852_gb1_tree_8,N852_gb1_tree_8,1
14273,N852_gb1_tree_8,N851_gb1_tree_8,1
14274,N852_gb1_tree_8,N955_gb1_tree_11,1


In [4]:
# Distinct values of the "cluster" column, in order of first appearance.
representative_ids = pd.unique(results["cluster"])
representative_ids.shape

(55,)

In [5]:
# One DataFrame per cluster, ordered by each cluster's first appearance in `results`
# (sort=False keeps that order). A single groupby pass replaces one full boolean
# scan of the table per cluster id and needs no variable from another cell.
clusters = [members for _, members in results.groupby("cluster", sort=False)]

In [14]:
SAMPLE_SIZE = 7000   # sample size passed to the sampler for every draw
N_REPLICATES = 5     # number of samples written per extant proportion (r = 1..5)
SEED = 42
CLUSTER_DIR = "./clusters"
extant_proportions = [0.22, 0.17, 0.12, 0.07, 0.02, 0.0]

random.seed(SEED)
# NOTE(review): seeded as well in case the sampler draws from numpy's global RNG —
# confirm which generator st.sample_extant_ancestors actually uses.
np.random.seed(SEED)

# The FASTA writes below fail if the output directory is missing.
os.makedirs(CLUSTER_DIR, exist_ok=True)

for p in extant_proportions:
    for r in range(1, N_REPLICATES + 1):
        sample_ids = st.sample_extant_ancestors(clusters, SAMPLE_SIZE, extant_proportion=p)
        # Keep only the alignment rows whose id was selected by the sampler.
        sample_seqs = aln.loc[aln["id"].isin(sample_ids)]
        st.write_fasta_file(f"{CLUSTER_DIR}/gb1_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta", sample_seqs)

In [15]:
# Smoke test: read back only the first written sample (first proportion, replicate 1),
# attach similarity-based weights, and print the first rows.
for p in extant_proportions[:1]:
    r = 1
    samp_aln = st.read_aln_file(f"./clusters/gb1_ancestors_extants_no_dupes_clustered_r{r}_extant_{p}.fasta")
    weights = st.reweight_by_seq_similarity(samp_aln["sequence"], 0.2)
    samp_aln["weights"] = weights
    print(samp_aln.head())

Reading the alignment: ./clusters/gb1_ancestors_extants_no_dupes_clustered_r1_extant_0.22.fasta
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 7000
               id                                           sequence  \
0   N3_gb1_tree_1  MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...   
1   N9_gb1_tree_1  MEKNKKVSYFLRQSAVGLASVSAAFLVGTTSVGALDAEPTTAFIRE...   
2  N21_gb1_tree_1  MEKEKKVKYFLRKSAFGLASVSAAFLVGTTSVNAADASADPATIQQ...   
3  N22_gb1_tree_1  MEKEKKVKYFLRKSAFGLASVSAAFLVGTTSVNAADASAEPATIQA...   
4  N23_gb1_tree_1  MEKEKKVKYFLRKSAFGLASVSAAFLVGTTLENTITVSAEPATIPA...   

                                            encoding  
0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
1  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
2  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
3  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
4  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  


In [16]:

# Tiny fixture alignment used to sanity-check adj_matrix in this cell.
test = st.read_aln_file("../data/pair_test.aln")
# Only the array (first returned item) is needed. Indexing avoids binding `_`,
# which would shadow IPython's last-output variable for the rest of the session.
test_msa = st.convert_msa_numpy_array(test)[0]

@njit(parallel=True)
def adj_matrix(msa) -> np.ndarray:
    """Build the symmetric pairwise similarity matrix of an alignment.

    Entry (i, j) is ``1 - mt.hamming_distance(msa[i], msa[j]) / n_cols``. The
    result is therefore a similarity, not a distance (assuming hamming_distance
    returns a mismatch count, 1.0 means identical rows). The diagonal is 1.0.

    msa: 2-D array with one aligned sequence per row.
    Returns a square float array with one row/column per sequence.
    """
    n_seqs = msa.shape[0]
    n_cols = msa.shape[1]
    similarity = np.ones((n_seqs, n_seqs))

    # Only the outermost loop is run in parallel; each pair is visited once and
    # mirrored across the diagonal.
    for row in prange(n_seqs):
        for col in range(row + 1, n_seqs):
            score = 1 - (mt.hamming_distance(msa[row], msa[col]) / n_cols)
            similarity[row, col] = score
            similarity[col, row] = score

    return similarity

# Sanity check on the 3-sequence fixture: the result should be symmetric with
# ones on the diagonal (see the displayed array).
adj_matrix(test_msa)


Reading the alignment: ../data/pair_test.aln
Checking for bad characters: ['B', 'J', 'X', 'Z', 'U']
Performing one hot encoding
Number of seqs: 3
Sequence weight numpy array created with shape (num_seqs, columns):  (3, 4)


array([[1.  , 1.  , 0.25],
       [1.  , 1.  , 0.25],
       [0.25, 0.25, 1.  ]])

In [17]:
# Pairwise matrix for the full alignment. With 14276 sequences this is a
# 14276 x 14276 float64 array (~1.6 GB), so the cell is memory- and time-heavy.
# Values are similarities (1.0 on the diagonal), not distances.
mat = adj_matrix(msa)

In [33]:
# metric="precomputed" expects a DISTANCE matrix, but `mat` holds similarities
# (1.0 = identical). Fitting on `mat` directly would treat the most similar
# sequences as the farthest apart, so convert first. This allocates a second
# array of the same size as `mat`.
dist_mat = 1.0 - mat

# NOTE(review): n_clusters is left at the scikit-learn default (2) — set
# n_clusters or distance_threshold explicitly for the intended granularity.
model = AgglomerativeClustering(metric="precomputed", linkage="complete")
clustering = model.fit(dist_mat)

In [38]:
# n_leaves_ is the number of leaves in the fitted hierarchy; it matches the number
# of input sequences (14276) and says nothing about the clusters found — inspect
# clustering.labels_ / clustering.n_clusters_ for that.
clustering.n_leaves_

14276