# Intrinsic assessment

In [None]:
#Numeric
import numpy as np
import pandas as pd
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
sys.path.append('/home/jmalagont/Documentos/GWord2Vec/algorithms/utils/')
import DataGenerator as dg

## 📁 Define Paths and Database Parameters

We define variables for:

- MongoDB database and collection names.
- BPE tokenizer model.
- A CSV file listing the organism IDs assigned to the train, tune, and test partitions.
- The path to the model output files.

In [None]:
# Notebook configuration. Every value below is a "------" placeholder and must
# be replaced with a real setting before the following cells are run.

# MongoDB database and collection names handed to the data generator.
db_name = "------"
collection_name = "------"
# Path to the BPE tokenizer model file handed to the data generator.
tokenizer_model_path = "------"
# CSV file read with pandas; the code below uses its 'partition' and 'ID' columns.
partitions_path = "------"
# Directory that contains the saved model file 'W2Vstructure.keras'.
model_path = "------"

## Load organism IDs

In [None]:
# Read the partition table. Two columns are used: 'partition' (the split label)
# and 'ID' (the organism identifier).
partitions = pd.read_csv(partitions_path)

# Collect the organism IDs of each split as plain Python lists.
training_IDs = list(partitions[partitions['partition'] == 'Train']['ID'])
tuning_IDs = list(partitions[partitions['partition'] == 'Tune']['ID'])
testing_IDs = list(partitions[partitions['partition'] == 'Test']['ID'])

# Report how many organisms ended up in each split.
print(f' Training: {len(training_IDs)} \n Tune: {len(tuning_IDs)} \n Test: {len(testing_IDs)}')

# Model pieces

This custom Keras layer computes the dot product between two embedding tensors using Einstein summation.
It is used to calculate similarity between target and context embeddings.

In [None]:
class dotlayer(keras.layers.Layer):
    """Keras layer that contracts two rank-3 tensors into a rank-2 score tensor.

    The layer has no weights and no configuration of its own. The lowercase
    class name is intentional: this notebook registers the class under the
    key 'dotlayer' when loading the saved model.
    """

    def __init__(self, **kwargs):
        # Nothing layer-specific to set up; just forward the standard Layer kwargs.
        super().__init__(**kwargs)

    def call(self, x1, x2):
        """Return ``out[b, e] = sum_{f, c} x1[b, f, c] * x2[b, e, c]``.

        Both inputs are indexed as (batch, items, channels). The shared
        channel axis ``c`` and ``x1``'s item axis ``f`` are summed out, so
        when ``f == 1`` each output entry is the dot product between the
        single ``x1`` vector and one of the ``e`` vectors in ``x2``.
        """
        scores = keras.ops.einsum('bfc,bec->be', x1, x2)
        return scores

## Load GeneticPieces2Vec Model

In [None]:
# Reset Keras' global state so leftovers from earlier runs do not interfere.
keras.backend.clear_session()

# The saved model refers to the custom layer by name, so the class has to be
# supplied to the loader under that same name.
custom_objects = dict(dotlayer=dotlayer)
model = keras.models.load_model(
    f'{model_path}/W2Vstructure.keras',
    custom_objects=custom_objects,
)

# Print the layer-by-layer overview of the loaded architecture.
model.summary()

## Model evaluation

The following function manually computes the sensitivity and specificity of the predictions at each decision threshold. These are the values needed to plot a Receiver Operating Characteristic (ROC) curve, which is useful for evaluating the performance of the model.

In [None]:
def ROC_curve(y_true, y_pred, thresholds):
    """Compute sensitivity and specificity at each decision threshold.

    A sample is predicted positive when ``y_pred >= threshold``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 count as positives and
        entries equal to 0 count as negatives.
    y_pred : array-like
        Predicted scores, same shape as ``y_true``.
    thresholds : iterable of float
        Decision thresholds to evaluate, in the order they should be reported.

    Returns
    -------
    (S, E) : tuple of numpy.ndarray
        ``S[k]`` is the sensitivity (true-positive rate) and ``E[k]`` the
        specificity (true-negative rate) at ``thresholds[k]``. The ROC curve
        is obtained by plotting ``S`` against ``1 - E``.
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Class masks and class sizes do not depend on the threshold: compute once.
    positives = (y_true == 1)
    negatives = (y_true == 0)
    n_positives = np.sum(positives)  # equals vp + fn at every threshold
    n_negatives = np.sum(negatives)  # equals vn + fp at every threshold

    # Small constant that keeps the ratios defined when a class is absent.
    eps = 1e-10

    S = []
    E = []
    for threshold in thresholds:
        predicted_positive = (y_pred >= threshold)

        vp = np.sum(positives & predicted_positive)   # true positives
        vn = np.sum(negatives & ~predicted_positive)  # true negatives

        S.append(vp / (n_positives + eps))
        E.append(vn / (n_negatives + eps))

    return (np.array(S), np.array(E))

This section evaluates the model performance on a test dataset using ROC curves over multiple iterations to estimate average behavior and variability.

In [None]:
# Batch generator over the held-out test organisms.
test_gen = dg.DataGenerator(db_name, collection_name, organism_IDs = testing_IDs, tokenizer_path = tokenizer_model_path,
                            shuffle = True, batch_size = 100, context_size = 9, negative_samples = 5,
                            vocab_size = 12000, max_pair = 100)
n_batches = len(test_gen)
n_tries = 5                                # number of repeated evaluations
thresholds = np.arange(0, 1.001, 0.001)    # decision thresholds 0.000 .. 1.000

fprs = []   # one false-positive-rate curve per repetition
tprs = []   # one true-positive-rate curve per repetition
for j in range(n_tries):
    print(f'Numero de ejecuciones: {j+1}', end='\r')

    predictions = []
    labels = []
    for i in range(n_batches):
        X_batch, Y_batch = test_gen[i]
        prediction_batch = model(X_batch)

        # extend() appends in place; `list = list + list` re-copies everything
        # accumulated so far on every batch.
        labels.extend(Y_batch.flatten())
        predictions.extend(prediction_batch.numpy().flatten())

    # ROC_curve returns (sensitivity, specificity), in that order.
    # TPR is the sensitivity and FPR is 1 - specificity.
    sensitivity, specificity = ROC_curve(np.array(labels), np.array(predictions), thresholds)
    fprs.append(1 - specificity)
    tprs.append(sensitivity)

    # Presumably reshuffles / resamples the pairs so the next repetition sees
    # different data -- confirm against DataGenerator.on_epoch_end.
    test_gen.on_epoch_end()

fprs = np.array(fprs)
tprs = np.array(tprs)

# Per-threshold mean and spread across the repetitions.
mean_fpr, std_fpr = np.mean(fprs, axis=0), np.std(fprs, axis=0)
mean_tpr, std_tpr = np.mean(tprs, axis=0), np.std(tprs, axis=0)

plt.plot(mean_fpr, mean_tpr, 'b-')                       # mean ROC curve
plt.plot([0,1], [0,1], color='orange')                   # chance diagonal
plt.plot(mean_fpr - std_fpr, mean_tpr - std_tpr, 'r--')  # mean - 1 std
plt.plot(mean_fpr + std_fpr, mean_tpr + std_tpr, 'r--')  # mean + 1 std
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')

In [None]:
# Run Keras' built-in evaluation over the test generator. As the last
# expression of the cell, its return value (presumably the loss and any
# compiled metrics -- depends on how the model was compiled) is displayed
# by the notebook.
model.evaluate(test_gen)