# Extrinsic assessment - population structure

In [None]:
#Numeric
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
sys.path.append('/home/jmalagont/Documentos/GWord2Vec/algorithms/utils/')
import DataGenerator as dg
#Tokenizers
import sentencepiece as spm
#bioinformatics
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import AlignIO
from Bio import Phylo

## 📁 Define Paths and Database Parameters

We define variables for:

- MongoDB database and collection names.
- BPE tokenizer model.
- A csv with the required train, tune, test partition IDs.
- Path to the trained GeneticPieces2Vec model.
- Reference genome sequences.

In [1]:
# MongoDB database and collection that hold the gene registers queried below.
db_name = "------"
collection_name = "------"
# Path prefix of the tokenizer model; ".model" is appended when it is loaded.
tokenizer_model_path = "------"
# CSV with the train/tune/test partition IDs (not read by the cells in this notebook).
partitions_path = "------"
# Directory that contains the saved "W2Vstructure.keras" model file.
model_path = "------"
# Directory where haplotype_1.fasta / haplotype_2.fasta are written and later re-read.
fasta_path = "------"

## Load model

This custom Keras layer computes the dot product between two embedding tensors using Einstein summation.
It is used to calculate similarity between target and context embeddings.

In [None]:
class dotlayer(keras.layers.Layer):
    """Keras layer that contracts two rank-3 tensors with an einsum.

    The contraction ``'bfc,bec->be'`` keeps the first axis shared by both
    inputs and the second axis of ``x2``; the second axis of ``x1`` and the
    shared last axis are summed out.
    """

    def __init__(self, **kwargs):
        # All configuration is delegated to the base Layer.
        super().__init__(**kwargs)

    def call(self, x1, x2):
        """Return ``einsum('bfc,bec->be', x1, x2)``."""
        contraction = 'bfc,bec->be'
        return keras.ops.einsum(contraction, x1, x2)

## Load GeneticPieces2Vec Model

In [None]:
# Reset Keras' global state before deserializing the saved model.
keras.backend.clear_session()

# `dotlayer` is passed by name so Keras can rebuild the custom layer.
model = keras.models.load_model(
    f'{model_path}/W2Vstructure.keras',
    custom_objects={'dotlayer': dotlayer},
)

# NOTE(review): index 2 is assumed to be the embedding sub-model — confirm
# against the summary printed below.
embedding_model = model.layers[2]
embedding_model.summary()

## Classical distance matrix

In [None]:
# Connect to the MongoDB server on this machine (port 27017).
client = MongoClient("mongodb://localhost:27017/")

# Resolve the database and collection named by db_name / collection_name.
db = client[db_name]
collection = db[collection_name]

Here we compute the average variant counts per gene across a dataset:

1. Retrieves a list of unique gene IDs from the database collection.
2. Iterates through each gene ID, querying all associated records.
3. For each record, calculates the normalized variant count by dividing the variant count by the length of the first haplotype sequence.
4. Computes the mean of these normalized variant counts for each gene.
5. Stores these mean values in an array for further analysis.

This approach provides a classical metric reflecting the average genetic variation per gene, useful for comparing genetic diversity across genes.

The array is then sorted, and the 10 genes that exhibit the greatest genetic variation are selected.

In [None]:
# One entry per distinct gene ID stored in the collection.
genes_IDs = np.array(collection.distinct("gene_ID"))
n_genes = len(genes_IDs)

# Only these two fields are needed to normalize the variant count.
projection = {'variant_counts': 1, 'haplotype_1': 1}

per_gene_means = []
for position, gene_ID in enumerate(genes_IDs, start=1):
    # Carriage return keeps the progress counter on a single line.
    print(f'{position} of {n_genes}', end='\r')

    # Variant count of each register divided by the length of its first haplotype.
    normalized_counts = []
    for register in collection.find({'gene_ID': gene_ID}, projection):
        normalized_counts.append(register['variant_counts'] / len(register['haplotype_1']))

    per_gene_means.append(np.mean(normalized_counts))

mean_variant_counts = np.array(per_gene_means)

In [None]:
# Ascending argsort reversed, so the most variable genes come first.
descending_order = np.argsort(mean_variant_counts)[::-1]

# Keep the ten gene IDs with the highest mean normalized variant count.
more_variant_genes = genes_IDs[descending_order][:10]
print(more_variant_genes)

### Extracting and Saving Haplotype Sequences


1. Queries the database for all records of the second most variable gene from the `more_variant_genes` list.
2. Iterates over each record to extract the DNA sequences of two haplotypes (`haplotype_1` and `haplotype_2`).
3. Converts each haplotype sequence to uppercase and wraps it in a `SeqRecord` object, labeling it with the organism ID.
4. Collects all `SeqRecord` objects for both haplotypes into separate lists.
5. Writes these collections of haplotype sequences to separate FASTA files (`haplotype_1.fasta` and `haplotype_2.fasta`), enabling downstream sequence analysis or alignment.

This process organizes genetic data by haplotype, facilitating further comparative or evolutionary studies.


In [None]:
# Registers of the second gene in the variability ranking; only the three
# fields used below are fetched instead of the whole documents.
query = {'gene_ID': more_variant_genes[1]}
gene_registers = collection.find(
    query,
    {'organism_ID': 1, 'haplotype_1': 1, 'haplotype_2': 1},
)

haplotype_1_seqs, haplotype_2_seqs = [], []

for register in gene_registers:
    # SeqRecord requires a string id; str() is a no-op when the stored ID
    # already is one.
    organism_id = str(register['organism_ID'])

    # description="" keeps the FASTA header as ">organism_id"; with the
    # SeqRecord default the writer appends "<unknown description>".
    haplotype_1_seqs.append(
        SeqRecord(Seq(register['haplotype_1'].upper()), id=organism_id, description="")
    )
    haplotype_2_seqs.append(
        SeqRecord(Seq(register['haplotype_2'].upper()), id=organism_id, description="")
    )

# NOTE(review): these files are later read with AlignIO, which presumably needs
# all sequences of a file to have the same length — confirm the stored
# haplotypes are already aligned.
with open(f"{fasta_path}/haplotype_1.fasta", "w") as file:
    SeqIO.write(haplotype_1_seqs, file, "fasta")

with open(f"{fasta_path}/haplotype_2.fasta", "w") as file:
    SeqIO.write(haplotype_2_seqs, file, "fasta")

This function transforms a given distance matrix into a symmetric 2D NumPy array.

In [None]:
def to_numpy_matrix(distance_matrix):
    """Convert an indexable distance matrix into a symmetric 2D NumPy array.

    Only the lower triangle (including the diagonal) of ``distance_matrix`` is
    read and mirrored onto the upper triangle, so both full square inputs and
    lower-triangular nested sequences (row ``i`` holding ``i + 1`` values) are
    accepted.

    Args:
        distance_matrix: Object supporting ``len()`` and ``[i][j]`` indexing
            for every ``j <= i``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(n, n)`` with
        ``result[i, j] == result[j, i] == distance_matrix[max(i, j)][min(i, j)]``.
    """
    size = len(distance_matrix)
    matrix = np.zeros([size, size])
    for i in range(size):
        # Fetch the row once: some matrix types build a new list on every
        # row access, which made the per-cell lookup needlessly expensive.
        row = distance_matrix[i]
        for j in range(i + 1):
            matrix[i, j] = matrix[j, i] = row[j]
    return matrix

### Phylogenetic Distance Calculation and Conversion

- Reads haplotype sequence alignments from FASTA files for two haplotypes.
- Uses Biopython's `DistanceCalculator` to compute pairwise evolutionary distances between sequences based on their alignments.
- Obtains distance matrices representing the dissimilarities between sequences within each haplotype group.
- Converts the resulting distance matrices into symmetric NumPy arrays for easier manipulation and downstream analysis.

The resulting matrices are then visualized as heatmaps.

In [None]:
# Built without arguments, so Biopython's default distance model is used.
distance_calculator = DistanceCalculator()

# Haplotype 1: read the alignment, compute pairwise distances, convert to NumPy.
haplotype_1_seqs = AlignIO.read(f"{fasta_path}/haplotype_1.fasta", "fasta")
teorical_haplotype_1_distances = to_numpy_matrix(
    distance_calculator.get_distance(haplotype_1_seqs)
)

# Haplotype 2: same pipeline on the second FASTA file.
haplotype_2_seqs = AlignIO.read(f"{fasta_path}/haplotype_2.fasta", "fasta")
teorical_haplotype_2_distances = to_numpy_matrix(
    distance_calculator.get_distance(haplotype_2_seqs)
)

In [None]:
# Side-by-side heatmaps of the alignment-based distance matrices.
fig, axes = plt.subplots(1, 2)

panels = (
    ('haplotype 1', teorical_haplotype_1_distances),
    ('haplotype 2', teorical_haplotype_2_distances),
)
for ax, (title, distances) in zip(axes, panels):
    ax.imshow(distances)
    ax.axis('off')  # hide ticks and frame
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Embedding distance matrix

### Embedding Generation for Haplotype Sequences

- Loads the standard codon translation table and the pre-trained tokenizer model.
- Loads aligned FASTA sequences for haplotype 1 and haplotype 2.
- Initializes empty arrays to store embedding vectors for each organism.
- For each organism:
  - Converts the DNA sequence into a list of token IDs using the tokenizer.
  - Passes the tokenized sequence through a trained embedding model.
  - Computes the average embedding vector across the sequence and stores it.
- The process is performed separately for both haplotype 1 and haplotype 2 sequences, resulting in two embedding matrices representing the genetic profiles of the organisms.


In [None]:
# CodonTable is not among the notebook's top-level imports, so bring it in here.
from Bio.Data import CodonTable

# Table id 1 is the standard genetic code.
standard_table = CodonTable.unambiguous_dna_by_id[1]

# The tokenizer object was never created before `.load` was called on it;
# instantiate the SentencePiece processor, then load the trained model file.
tokenizer = spm.SentencePieceProcessor()
tokenizer.load(f'{tokenizer_model_path}.model')

In [None]:
haplotype_1_seqs = AlignIO.read(f"{fasta_path}/haplotype_1.fasta", "fasta")
haplotype_2_seqs = AlignIO.read(f"{fasta_path}/haplotype_2.fasta", "fasta")

# One row per organism; the organism count is taken from the haplotype 1 file.
# NOTE(review): 512 is assumed to equal the embedding width of the model — confirm.
n_organism = len(haplotype_1_seqs)
embeddings_haplotype_1 = np.zeros([n_organism, 512])
embeddings_haplotype_2 = np.zeros([n_organism, 512])

# Each alignment is paired with the matrix that receives its embeddings.
haplotype_targets = (
    (haplotype_1_seqs, embeddings_haplotype_1),
    (haplotype_2_seqs, embeddings_haplotype_2),
)

for row in range(n_organism):
    for alignment, embeddings in haplotype_targets:
        # Tokenize the sequence and wrap it in a batch of size one.
        token_ids = np.array([tokenizer.encode_as_ids(str(alignment[row].seq))])
        # Take the single batch item and average its vectors over axis 0.
        token_embeddings = embedding_model(token_ids)[0].numpy()
        embeddings[row] = np.mean(token_embeddings, axis=0)

## Cosine Similarity Matrix Calculation and Visualization

This function computes the cosine similarity matrix for a set of feature vectors, which here correspond to our sequence embeddings.

The function is then used to obtain the GeneticPieces2Vec matrices: one cosine similarity matrix of sequence embeddings per haplotype.

The resulting matrices are then visualized as heatmaps, allowing for easy comparison and interpretation of the similarity relationships between them and the theoretical haplotype matrices.

In [None]:
def cos_matrix(features):
    """Return the pairwise cosine-similarity matrix of the rows of ``features``.

    Args:
        features: 2D array with one feature vector per row.

    Returns:
        Square array whose entry ``(i, j)`` is the dot product of rows ``i``
        and ``j`` divided by the product of their Euclidean norms. A small
        constant (1e-10) is added to the denominator so all-zero rows give 0
        instead of a division by zero.
    """
    row_norms = np.linalg.norm(features, axis=1)
    # Outer product of the norms: entry (i, j) is |row_i| * |row_j|.
    norm_products = row_norms[:, np.newaxis] * row_norms[np.newaxis, :]
    gram = features @ features.T
    return gram / (norm_products + 1e-10)

In [None]:
# Cosine-similarity matrices of the per-organism embeddings.
haplotype_1_distances = cos_matrix(embeddings_haplotype_1)
haplotype_2_distances = cos_matrix(embeddings_haplotype_2)

# Side-by-side heatmaps, one panel per haplotype.
fig, axes = plt.subplots(1, 2)

panels = (
    ('haplotype 1', haplotype_1_distances),
    ('haplotype 2', haplotype_2_distances),
)
for ax, (title, similarities) in zip(axes, panels):
    ax.imshow(similarities)
    ax.axis('off')  # hide ticks and frame
    ax.set_title(title)

plt.tight_layout()
plt.show()

### Mantel Test Function

Implements the Mantel test to evaluate the correlation between the matrices.

- Accepts as input:
  - `X`: a feature matrix from which a cosine similarity matrix is computed.
  - `distY`: a precomputed reference distance matrix.
  - `permutations`: number of permutations for significance testing (default: 999).
  - `tail`: specifies the type of statistical test (`'two-sided'`, `'upper'`, or `'lower'`).
- Flattens both distance matrices to compute their Pearson correlation.
- Executes multiple random permutations of `X` to generate a null distribution of correlation values.
- Computes the p-value based on how extreme the observed correlation is relative to the null distribution.
- Returns the observed Mantel statistic and the associated p-value, quantifying the similarity between the structures represented by `X` and `distY`.


In [None]:
import numpy as np
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform

def mantel(X, distY, permutations=999, tail='two-sided'):
    """Mantel permutation test between embedding similarities and a reference matrix.

    The cosine-similarity matrix of the rows of ``X`` is correlated (Pearson)
    with ``distY`` using only the entries strictly above the diagonal. The
    diagonal is excluded because it is constant by construction and would bias
    the correlation; the lower triangle is excluded because it duplicates the
    upper one.

    The null distribution is obtained by applying the same random permutation
    to the rows and columns of the similarity matrix while ``distY`` stays
    fixed.

    Because ``cos_matrix`` yields similarities while ``distY`` holds distances,
    agreement between the two structures shows up as a negative statistic.

    Args:
        X: 2D array with one feature vector per sample (row).
        distY: Reference matrix for the same samples in the same order, either
            of shape ``(n, n)`` or flattened to length ``n * n``.
        permutations: Number of random permutations used for the p-value.
        tail: ``'two-sided'``, ``'upper'`` or ``'lower'``.

    Returns:
        Tuple ``(mantel_stat_obs, p_value)``: the observed Pearson correlation
        and the permutation p-value ``(extreme + 1) / (permutations + 1)``,
        which counts the observed arrangement and therefore is never exactly 0.

    Raises:
        ValueError: If ``tail`` is not one of the accepted values, or if
            ``distY`` does not contain ``n * n`` entries.
    """
    # Validate before doing any work, so a typo does not cost a full run.
    if tail not in ('two-sided', 'upper', 'lower'):
        raise ValueError("La cola debe ser 'two-sided', 'upper' o 'lower'.")

    X = np.asarray(X)
    n_samples = X.shape[0]

    similarity_X = cos_matrix(X)
    reference = np.asarray(distY, dtype=float).reshape(n_samples, n_samples)

    # Index pairs (i, j) with i < j: every unordered pair once, no diagonal.
    pairs = np.triu_indices(n_samples, k=1)
    reference_pairs = reference[pairs]

    # Observed Mantel statistic.
    mantel_stat_obs, _ = pearsonr(similarity_X[pairs], reference_pairs)

    # Permuting rows and columns of the similarity matrix is equivalent to
    # permuting the rows of X and recomputing it, without the recomputation.
    perm_stats = np.empty(permutations)
    for i in range(permutations):
        order = np.random.permutation(n_samples)
        permuted = similarity_X[np.ix_(order, order)]
        perm_stats[i], _ = pearsonr(permuted[pairs], reference_pairs)

    # Count permutations at least as extreme as the observed statistic.
    if tail == 'two-sided':
        n_extreme = np.sum(np.abs(perm_stats) >= np.abs(mantel_stat_obs))
    elif tail == 'upper':
        n_extreme = np.sum(perm_stats >= mantel_stat_obs)
    else:  # 'lower'
        n_extreme = np.sum(perm_stats <= mantel_stat_obs)

    p_value = (n_extreme + 1) / (permutations + 1)

    return mantel_stat_obs, p_value

The Mantel test is then applied to quantify the correlation between the embedded distances generated by the GeneticPieces2Vec and the theoretical distances derived from multiple sequence alignments.

#### Results:

  - **Mantel statistic**: A value reflecting the degree of linear correlation between the two distance matrices.
  - **p-value**: The probability of observing a statistic as extreme as the one obtained under the null hypothesis of no correlation.

These values help assess whether the embeddings produced by the model preserve, to some extent, the evolutionary structure captured by alignment-based distance metrics.

In [None]:
# Run the Mantel test for each haplotype: embeddings vs. alignment-based distances.
mantel_statistic_h1, p_value_h1 = mantel(
    embeddings_haplotype_1, teorical_haplotype_1_distances, permutations=1000
)
mantel_statistic_h2, p_value_h2 = mantel(
    embeddings_haplotype_2, teorical_haplotype_2_distances, permutations=1000
)

# Report haplotype 1 first, then haplotype 2.
results = (
    (mantel_statistic_h1, p_value_h1),
    (mantel_statistic_h2, p_value_h2),
)
for statistic, significance in results:
    print(f"Estadístico de Mantel: {statistic:.4f}, Valor p: {significance:.4f}")