# Extrinsic assessment - chemical contexts

In [None]:
#Numeric
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
sys.path.append('/home/jmalagont/Documentos/GWord2Vec/algorithms/utils/')
import DataGenerator as dg
#Tokenizers
import sentencepiece as spm
#bioinformatics
from Bio.Seq import Seq
from Bio.Data import CodonTable
from aaindex import aaindex1, aaindex2

## 📁 Define Paths and Database Parameters

We define variables for:

- MongoDB database and collection names.
- BPE tokenizer model.
- A csv with the required train, tune, test partition IDs.
- Path to the trained GeneticPieces2Vec model.

In [None]:
# Placeholder settings: replace every "------" with a real value before running.

# Directory that holds the saved GeneticPieces2Vec model (W2Vstructure.keras).
model_path = "------"
# SentencePiece model path WITHOUT the ".model" suffix (it is appended on load).
tokenizer_model_path = "------"
# CSV listing the train / tune / test partition IDs.
partitions_path = "------"
# MongoDB database and collection names.
db_name = "------"
collection_name = "------"

## Load model

This custom Keras layer computes the dot product between two embedding tensors using Einstein summation.
It is used to calculate similarity between target and context embeddings.

In [None]:
class dotlayer(keras.layers.Layer):
    """Keras layer that contracts two rank-3 tensors into a rank-2 score tensor.

    The layer holds no weights; it only applies the einsum
    ``'bfc,bec->be'`` to its two inputs.
    """

    def __init__(self, **kwargs):
        # No layer-specific configuration: forward everything to keras' Layer.
        super().__init__(**kwargs)

    def call(self, x1, x2):
        """Return ``einsum('bfc,bec->be', x1, x2)``.

        For every batch item ``b`` and every position ``e`` of ``x2`` the
        products are summed over both the ``f`` axis of ``x1`` and the shared
        ``c`` axis, so the result is indexed by ``(b, e)``.
        """
        contraction = 'bfc,bec->be'
        return keras.ops.einsum(contraction, x1, x2)

## Load GeneticPieces2Vec Model

In [None]:
# Start from a clean Keras global state before deserializing the model.
keras.backend.clear_session()

# The saved model references the custom `dotlayer` class, so it has to be
# handed to the loader explicitly.
custom_objects = dict(dotlayer=dotlayer)
model = keras.models.load_model(
    f'{model_path}/W2Vstructure.keras',
    custom_objects=custom_objects,
)

# NOTE(review): index 2 is taken to be the embedding sub-model of the saved
# architecture -- confirm against the printed summary.
embedding_model = model.layers[2]
embedding_model.summary()

## Load codons

Initialize the codon-to-amino-acid translation table and load the pre-trained tokenizer used to convert sequences into tokenized representations.

In [None]:
# Pre-trained SentencePiece tokenizer; the ".model" suffix is appended here.
tokenizer = spm.SentencePieceProcessor()
tokenizer.load(f'{tokenizer_model_path}.model')

# Unambiguous DNA codon table with id 1, used below to translate codon tokens.
standard_table = CodonTable.unambiguous_dna_by_id[1]

### Extract Codon-to-Amino Acid Mapping from Vocabulary

Filter the tokenizer vocabulary to identify valid codons (3-letter nucleotide sequences) and translate them into amino acids using the standard genetic code.


In [None]:
# (piece, id) pairs for every entry of the tokenizer vocabulary.
vocabulary = [
    (tokenizer.id_to_piece(piece_id), piece_id)
    for piece_id in range(tokenizer.get_piece_size())
]

# Keep only 3-character pieces that do not start with the word-boundary marker
# '▁' or with '<', translate each one with the codon table, and store
# [amino_acid, token_id] rows. np.array turns the mixed rows into a string
# array, so the IDs are cast back to integers where they are used.
codons = np.array([
    [str(Seq(piece).translate(table=standard_table)), piece_id]
    for piece, piece_id in vocabulary
    if len(piece) == 3 and piece[0] not in ('▁', '<')
])

Here we compute an average embedding vector for each unique amino acid based on the embeddings of its corresponding codons.

In [None]:
# One entry per distinct translation found among the codon tokens.
AA_list = np.unique(codons[:, 0])

# Collect one mean vector per amino acid. The embedding width is taken from
# the model output instead of being hard-coded (it used to be fixed at 512).
per_aa_means = []
for AA in AA_list:
    # Column 1 of `codons` holds the token IDs as strings. int32 is used
    # because int16 cannot represent IDs above 32767. The extra leading axis
    # turns the IDs into a batch of one sequence for the model call.
    AA_index = codons[codons[:, 0] == AA][:, 1].astype(np.int32)[np.newaxis, :]
    # [0] drops the batch axis again -> one row per synonymous codon.
    codon_vectors = embedding_model(AA_index)[0].numpy()
    per_aa_means.append(codon_vectors.mean(axis=0))

# float64 matches the dtype of the zero-initialised array used previously.
codons_embeddings = np.array(per_aa_means, dtype=np.float64)
    

## Cosine Similarity Matrix Calculation and Visualization

This function computes the cosine similarity matrix for a set of feature vectors, which here correspond to amino acid embeddings.

The function is then used to obtain `GeneticPieces2Vec_distance_matrix`, the cosine similarity matrix of the amino acid embeddings, and a second cosine similarity matrix based on the biochemical properties of the amino acids. The latter represents how closely related the amino acids are according to their measured properties.

The resulting matrices are then visualized as heatmaps, allowing for easy comparison and interpretation of the similarity relationships between both of them.

In [None]:
def cos_matrix(features):
    """Return the pairwise cosine similarity between the rows of ``features``.

    Parameters:
    features (ndarray): 2-D array with one row per object.

    Returns:
    ndarray: square matrix whose entry ``[i, j]`` is the dot product of rows
    ``i`` and ``j`` divided by the product of their Euclidean norms. A small
    constant (1e-10) is added to the denominator so all-zero rows give 0
    instead of a division by zero.
    """
    row_norms = np.linalg.norm(features, axis=1)
    # Entry [i, j] of the denominator is ||row_i|| * ||row_j||.
    norm_products = row_norms[:, np.newaxis] * row_norms[np.newaxis, :]
    gram = features @ features.T
    return gram / (norm_products + 1e-10)

In [None]:
# Cosine similarity between the mean embeddings of the amino acids.
GeneticPieces2Vec_distance_matrix = cos_matrix(codons_embeddings)

# Heatmap with one labelled row and column per entry of AA_list.
tick_positions = np.arange(len(AA_list))
plt.imshow(GeneticPieces2Vec_distance_matrix)
plt.xticks(ticks=tick_positions, labels=AA_list)
plt.yticks(ticks=tick_positions, labels=AA_list)
plt.show()

In [None]:
# AAindex1 record code -> human-readable description.
codes_dict = {
    record_code: aaindex1[record_code]['description']
    for record_code in aaindex1.record_codes()
}


def get_properties(aa):
    """Return the AAindex1 values of ``aa``, one per record in ``codes_dict``."""
    return np.array([aaindex1[record_code]['values'][aa] for record_code in codes_dict])


# One row per entry of AA_list, one column per AAindex1 record.
propierties = np.array([get_properties(str(residue)) for residue in AA_list])
propierties.shape

In [None]:
# Cosine similarity between the AAindex1 property vectors of the amino acids.
propierties_distance_matrix = cos_matrix(propierties)

# Heatmap with one labelled row and column per entry of AA_list.
tick_positions = np.arange(len(AA_list))
plt.imshow(propierties_distance_matrix)
plt.xticks(ticks=tick_positions, labels=AA_list)
plt.yticks(ticks=tick_positions, labels=AA_list)
plt.show()

## Mantel Test Implementation for Comparing Distance Matrices

This function calculates the Mantel statistic, which measures the correlation between two distance matrices derived from datasets representing the same objects. 

### Parameters:
- `X` and `Y`: Input data matrices where rows correspond to the same set of objects.
- `permutations`: Number of random permutations to perform for significance testing (default is 999).
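- `tail`: Specifies the type of hypothesis test:
  - `'two-sided'`: Tests for any difference in correlation (positive or negative).
  - `'upper'`: Tests whether the correlation is greater than expected by chance.
  - `'lower'`: Tests whether the correlation is smaller than expected by chance.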

### Process:
1. Computes cosine distance matrices from the input datasets and flattens them into vectors.
2. Calculates the observed Mantel statistic as the Pearson correlation between these vectors.
3. Performs permutation testing by randomly shuffling rows (and corresponding columns) of one matrix to generate a null distribution of Mantel statistics.
4. Calculates the p-value based on the proportion of permuted statistics more extreme than the observed value, according to the chosen tail.

### Output:
- Returns the Mantel statistic and the associated p-value, providing a measure of how similar the two datasets are in terms of their pairwise distances.

In [None]:
import numpy as np
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform

def mantel(X, Y, permutations=999, tail='two-sided'):
    """
    Run a Mantel permutation test on the cosine distances of two datasets.

    The pairwise cosine distances between the rows of X and between the rows
    of Y are computed, and the Mantel statistic is the Pearson correlation
    between the two sets of distances. Only the pairs i < j are used: the
    diagonal (always 0) and the mirrored lower triangle carry no information
    and would otherwise bias the correlation. Because cosine distance is
    1 - cosine similarity, the statistic is the same whether distances or
    similarities are correlated.

    Significance is assessed by jointly permuting the rows and columns of the
    X distance matrix. Permutations are drawn with ``np.random.permutation``,
    so seed the global NumPy RNG (``np.random.seed``) for reproducible
    p-values.

    Parameters:
    X (array_like): First data matrix (rows correspond to objects).
    Y (array_like): Second data matrix (rows are the same objects as in X).
    permutations (int, optional): Number of permutations used for the
        significance test. Defaults to 999.
    tail (str, optional): Type of test ('two-sided', 'upper', 'lower').
        Defaults to 'two-sided'.

    Returns:
    tuple: (Mantel statistic, p-value). The p-value is
    ``(n_extreme + 1) / (permutations + 1)``; the observed arrangement counts
    as one of the permutations, so the p-value is never exactly 0. If the
    statistic is undefined (NaN, e.g. constant distances or an all-zero row),
    ``(nan, nan)`` is returned.

    Raises:
    ValueError: If ``tail`` is not a supported value, if X or Y is not 2-D,
        if they have different numbers of rows, or if there are fewer than
        3 objects.
    """
    # Fail fast on a bad `tail` instead of after running every permutation.
    if tail not in ('two-sided', 'upper', 'lower'):
        raise ValueError("La cola debe ser 'two-sided', 'upper' o 'lower'.")

    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if X.ndim != 2 or Y.ndim != 2:
        raise ValueError("X and Y must be 2-D arrays (objects x features).")
    n_objects = X.shape[0]
    if Y.shape[0] != n_objects:
        raise ValueError("X and Y must have the same number of rows (objects).")
    if n_objects < 3:
        raise ValueError("A Mantel test needs at least 3 objects.")

    # Condensed distance vectors: one entry per pair i < j, in row-major order.
    dist_x = pdist(X, metric='cosine')
    dist_y = pdist(Y, metric='cosine')

    # Observed Mantel statistic (Pearson correlation between the distances).
    mantel_stat_obs, _ = pearsonr(dist_x, dist_y)
    if np.isnan(mantel_stat_obs):
        return mantel_stat_obs, np.nan

    # The distances are computed once; each permutation only relabels the
    # objects, i.e. reorders the rows and columns of the square matrix.
    square_x = squareform(dist_x)
    upper = np.triu_indices(n_objects, k=1)  # same pair order as pdist

    perm_stats = np.empty(permutations)
    for i in range(permutations):
        order = np.random.permutation(n_objects)
        permuted_dist_x = square_x[np.ix_(order, order)][upper]
        perm_stat, _ = pearsonr(permuted_dist_x, dist_y)
        perm_stats[i] = perm_stat

    # Count the permuted statistics at least as extreme as the observed one.
    if tail == 'two-sided':
        extreme = np.abs(perm_stats) >= np.abs(mantel_stat_obs)
    elif tail == 'upper':
        extreme = perm_stats >= mantel_stat_obs
    else:  # 'lower'
        extreme = perm_stats <= mantel_stat_obs

    p_value = (np.count_nonzero(extreme) + 1) / (permutations + 1)

    return mantel_stat_obs, p_value

# Example usage:
# Generate example data
# NOTE(review): data1 / data2 are not used by the mantel call below. They still
# consume draws from the seeded global RNG that mantel's permutations use, so
# removing them would change the reported p-value -- confirm whether they are
# needed elsewhere.
np.random.seed(42)
data1 = np.random.rand(10, 3)
data2 = np.random.rand(10, 3) + 1.5 * data1

# Run the Mantel test
mantel_statistic, p_value = mantel(codons_embeddings, propierties, permutations=1000)

print(f"Estadístico de Mantel: {mantel_statistic:.4f}")
print(f"Valor p: {p_value:.4f}")