# GeneticPieces2Vec Model

In [None]:
#Numeric
import numpy as np
import pandas as pd
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
sys.path.append('utils/')
import DataGenerator as dg

## 📁 Define Paths and Database Parameters

We define variables for:

- MongoDB database and collection names.
- BPE tokenizer model.
- A csv with the required train, tune, test partition IDs.
- Paths to save the model output files.

In [None]:
# Placeholders ("------"): replace each one with the real value before running.
db_name = "------"  # MongoDB database name
collection_name = "------"  # MongoDB collection name
tokenizer_model_path = "------"  # BPE tokenizer model
partitions_path = "------"  # CSV with the train / tune / test partition IDs
model_path = "------"  # where the model output files are saved

## Load organism IDs

In [None]:
# Read the partition table and collect the organism IDs belonging to each split.
partitions = pd.read_csv(partitions_path)
split_of = partitions['partition']
training_IDs = list(partitions.loc[split_of.eq('Train'), 'ID'])
tuning_IDs = list(partitions.loc[split_of.eq('Tune'), 'ID'])
testing_IDs = list(partitions.loc[split_of.eq('Test'), 'ID'])

# Report how many organisms ended up in each split.
print(f' Training: {len(training_IDs)} \n Tune: {len(tuning_IDs)} \n Test: {len(testing_IDs)}')

## Build model pieces

This custom Keras layer computes the dot product between two embedding tensors using Einstein summation.
It is used to calculate similarity between target and context embeddings.


In [None]:
class dotlayer(keras.layers.Layer):
    """Dot-product similarity between two embedding tensors.

    The two inputs are contracted with an Einstein summation; the layer has
    no weights and no options of its own.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x1, x2):
        # x1 has axes (b, f, c) and x2 has axes (b, e, c): the shared c axis and
        # x1's f axis are summed out, leaving one score per (b, e) entry.
        contraction = 'bfc,bec->be'
        return keras.ops.einsum(contraction, x1, x2)

## Build a HyperModel

## HyperModel Definition for Word2Vec Training
Defines a KerasTuner `HyperModel` class to build and train a Skip-gram Word2Vec model.
Includes support for hyperparameter tuning of:
- Embedding size
- Learning rate
- L2 regularization lambda
- Context window size
- Number of negative samples

Uses a custom `DataGenerator` to stream training and validation data from MongoDB.


In [None]:
class W2VHyperModel(kt.HyperModel):
    """KerasTuner hypermodel for a skip-gram Word2Vec model.

    ``build`` samples the hyperparameters, creates the training and validation
    data generators (context window and number of negative samples are tuned
    as well) and returns the compiled model. ``fit`` trains that model on the
    generators created by the most recent ``build`` call, so ``build`` must be
    called before ``fit``.

    Reads the module-level ``db_name``, ``collection_name`` and
    ``tokenizer_model_path`` settings.
    """

    def __init__(self, train_ids, val_ids, vocab_size=12000, batch_size=100):
        """Store the data split and the shared generator/model settings.

        Args:
            train_ids: organism IDs handed to the training data generator.
            val_ids: organism IDs handed to the validation data generator.
            vocab_size: vocabulary size, used for both the data generators and
                the embedding table so the two can never disagree.
            batch_size: batch size given to both data generators.
        """
        # The base initialiser used to be skipped; run it so the HyperModel
        # base class is set up the way KerasTuner expects.
        super().__init__()
        keras.backend.clear_session(free_memory=True)
        self.train_ids = train_ids
        self.val_ids = val_ids
        self.vocab_size = vocab_size
        self.batch_size = batch_size

    def _make_generator(self, organism_ids, shuffle, max_pair, context_size, negative_samples):
        """Create a DataGenerator over ``organism_ids`` with the shared settings."""
        return dg.DataGenerator(
            db_name,
            collection_name,
            organism_IDs=organism_ids,
            tokenizer_path=tokenizer_model_path,
            shuffle=shuffle,
            batch_size=self.batch_size,
            context_size=context_size,
            negative_samples=negative_samples,
            vocab_size=self.vocab_size,
            max_pair=max_pair,
        )

    def build(self, hp):
        """Sample hyperparameters, create the data generators and compile the model.

        Args:
            hp: ``kt.HyperParameters`` container to sample from.

        Returns:
            The compiled ``keras.Model`` taking ``[target, context]`` inputs.
        """
        # With sampling="log", `step` is the multiplicative factor between candidates.
        embedding_size = hp.Int('embedding size', min_value=32, max_value=512, step=2, sampling="log")
        lr = hp.Float("learning rate", min_value=1e-3, max_value=1e-1, step=5, sampling="log")
        reg = hp.Float("lambda", min_value=1e-5, max_value=1e-3, step=10, sampling="log")
        context_size = hp.Choice('context size', values=[5, 9, 13, 17, 21, 25])
        negative_samples = hp.Choice('negative samples', values=[3, 5, 7])

        # The generators depend on tuned values, so they are rebuilt on every build() call.
        self.train_gen = self._make_generator(
            self.train_ids, shuffle=True, max_pair=1000,
            context_size=context_size, negative_samples=negative_samples)
        self.val_gen = self._make_generator(
            self.val_ids, shuffle=False, max_pair=100,
            context_size=context_size, negative_samples=negative_samples)

        # One embedding tower, shared by the target and the context tokens.
        embedding_model = keras.models.Sequential()
        embedding_model.add(keras.layers.Input(shape=(None,)))
        embedding_model.add(keras.layers.Embedding(
            self.vocab_size, embedding_size, embeddings_regularizer=keras.regularizers.L2(reg)))
        embedding_model.add(keras.layers.BatchNormalization())

        target_input = keras.layers.Input(shape=(1,))
        context_input = keras.layers.Input(shape=(None,))
        target_emb = embedding_model(target_input)
        context_emb = embedding_model(context_input)
        # Softmax over the target/context dot products; the loss below therefore
        # receives probabilities (from_logits=False).
        out = keras.activations.softmax(dotlayer()(target_emb, context_emb))

        model = keras.Model(inputs=[target_input, context_input], outputs=out)
        model.compile(
            optimizer=keras.optimizers.Adam(lr),
            loss=keras.losses.CategoricalCrossentropy(from_logits=False),
            metrics=['AUC'],
        )
        return model

    def fit(self, hp, model, epochs=1, callbacks=None, **kwargs):
        """Train ``model`` on the generators created by the last ``build`` call.

        Args:
            hp: hyperparameters of the current trial (not used here).
            model: compiled model returned by ``build``.
            epochs: number of training epochs.
            callbacks: Keras callbacks to use during training.
            **kwargs: further ``model.fit`` options (e.g. ``verbose``). They
                used to be dropped silently and are now forwarded.

        Returns:
            The ``History`` object returned by ``model.fit``.
        """
        # The data (and its batching) is owned by the generators, so data-related
        # options are not forwarded; they were ignored before as well.
        owned_by_generators = ("x", "y", "validation_data", "batch_size")
        extra_options = {k: v for k, v in kwargs.items() if k not in owned_by_generators}
        return model.fit(
            self.train_gen,
            validation_data=self.val_gen,
            epochs=epochs,
            callbacks=callbacks,
            **extra_options,
        )

### Training example

## Train the Model and Visualize Performance
Trains the model using early stopping and plots:
- Categorical Cross-Entropy Loss over epochs
- AUC (Area Under the Curve) metric over epochs

These metrics help evaluate model performance and overfitting.


In [None]:
# Build a single model from a fresh hyperparameter container and print its layers.
hp = kt.HyperParameters()
hm = W2VHyperModel(train_ids=training_IDs, val_ids=tuning_IDs)
w2vmodel = hm.build(hp)
w2vmodel.summary()

In [None]:
# Render the model graph (with tensor shapes) to a PNG under model_path.
keras.utils.plot_model(
    w2vmodel,
    to_file=f'{model_path}/W2V structure.png',
    show_shapes=True,
)

In [None]:
# Stop when the validation loss has not improved for 3 epochs and keep the best weights.
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
    verbose=0,
)
# Short example run: a single epoch on the generators created by hm.build().
train_history = hm.fit(hp, w2vmodel, epochs=1, callbacks=[early_stopping_callback])

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns

# One panel per logged metric: (history key, y-axis label, title).
panels = [
    ('loss', 'loss (categorical cross entropy)', 'Loss during training'),
    ('AUC', 'AUC (ROC)', 'AUC during training'),
]

for ax, (metric, ylabel, title) in zip((ax1, ax2), panels):
    ax.plot(train_history.history[metric], color='blue', label='Train')
    # The val_* curves come from the validation (Tune) split, not the held-out
    # Test split, so they are labelled accordingly.
    ax.plot(train_history.history[f'val_{metric}'], color='red', label='Validation')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)
    ax.legend()

plt.tight_layout()

plt.show()


## Hyperparameter search

Running in a tmux session

Performs Random Search using `keras_tuner` to find optimal hyperparameters for the Word2Vec model.
Key configurations:
- `objective`: validation AUC
- `max_trials`: number of different hyperparameter combinations
- `executions_per_trial`: repeat each combination for stability
- Uses EarlyStopping and TensorBoard for monitoring

This script is designed to be run in a background tmux session for long experiments.


In [None]:
%%writefile /tmp/HP_search.py

############################################################################################################################## Imports and configurations #########################################################################################################################################

#Numeric
import numpy as np
import pandas as pd
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
sys.path.append("------")
import DataGenerator as dg

############################################################################################################################## Important paths #####################################################################################################################################################

# Placeholders ("------"): replace each one with the real value before running.
db_name = "------"  # MongoDB database name
collection_name = "------"  # MongoDB collection name
tokenizer_model_path = "------"  # BPE tokenizer model
partitions_path = "------"  # CSV with the train / tune / test partition IDs
model_path = "------"  # where the model output files are saved

############################################################################################################################## Load data ###########################################################################################################################################################

# Read the partition table and collect the organism IDs belonging to each split.
partitions = pd.read_csv(partitions_path)
split_of = partitions['partition']
training_IDs = list(partitions.loc[split_of.eq('Train'), 'ID'])
tuning_IDs = list(partitions.loc[split_of.eq('Tune'), 'ID'])
testing_IDs = list(partitions.loc[split_of.eq('Test'), 'ID'])

# Report how many organisms ended up in each split.
print(f' Training: {len(training_IDs)} \n Tune: {len(tuning_IDs)} \n Test: {len(testing_IDs)}')

############################################################################################################################## Model builder ########################################################################################################################################################

class dotlayer(keras.layers.Layer):
    """Dot-product similarity between two embedding tensors.

    The two inputs are contracted with an Einstein summation; the layer has
    no weights and no options of its own.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x1, x2):
        # x1 has axes (b, f, c) and x2 has axes (b, e, c): the shared c axis and
        # x1's f axis are summed out, leaving one score per (b, e) entry.
        contraction = 'bfc,bec->be'
        return keras.ops.einsum(contraction, x1, x2)

class W2VHyperModel(kt.HyperModel):
    """KerasTuner hypermodel for a skip-gram Word2Vec model.

    ``build`` samples the hyperparameters, creates the training and validation
    data generators (context window and number of negative samples are tuned
    as well) and returns the compiled model. ``fit`` trains that model on the
    generators created by the most recent ``build`` call, so ``build`` must be
    called before ``fit``.

    Reads the module-level ``db_name``, ``collection_name`` and
    ``tokenizer_model_path`` settings.
    """

    def __init__(self, train_ids, val_ids, vocab_size=12000, batch_size=100):
        """Store the data split and the shared generator/model settings.

        Args:
            train_ids: organism IDs handed to the training data generator.
            val_ids: organism IDs handed to the validation data generator.
            vocab_size: vocabulary size, used for both the data generators and
                the embedding table so the two can never disagree.
            batch_size: batch size given to both data generators.
        """
        # The base initialiser used to be skipped; run it so the HyperModel
        # base class is set up the way KerasTuner expects.
        super().__init__()
        keras.backend.clear_session(free_memory=True)
        self.train_ids = train_ids
        self.val_ids = val_ids
        self.vocab_size = vocab_size
        self.batch_size = batch_size

    def _make_generator(self, organism_ids, shuffle, max_pair, context_size, negative_samples):
        """Create a DataGenerator over ``organism_ids`` with the shared settings."""
        return dg.DataGenerator(
            db_name,
            collection_name,
            organism_IDs=organism_ids,
            tokenizer_path=tokenizer_model_path,
            shuffle=shuffle,
            batch_size=self.batch_size,
            context_size=context_size,
            negative_samples=negative_samples,
            vocab_size=self.vocab_size,
            max_pair=max_pair,
        )

    def build(self, hp):
        """Sample hyperparameters, create the data generators and compile the model.

        Args:
            hp: ``kt.HyperParameters`` container to sample from.

        Returns:
            The compiled ``keras.Model`` taking ``[target, context]`` inputs.
        """
        # With sampling="log", `step` is the multiplicative factor between candidates.
        embedding_size = hp.Int('embedding size', min_value=32, max_value=512, step=2, sampling="log")
        lr = hp.Float("learning rate", min_value=1e-3, max_value=1e-1, step=5, sampling="log")
        reg = hp.Float("lambda", min_value=1e-5, max_value=1e-3, step=10, sampling="log")
        context_size = hp.Choice('context size', values=[5, 9, 13, 17, 21, 25])
        negative_samples = hp.Choice('negative samples', values=[3, 5, 7])

        # The generators depend on tuned values, so they are rebuilt on every build() call.
        self.train_gen = self._make_generator(
            self.train_ids, shuffle=True, max_pair=1000,
            context_size=context_size, negative_samples=negative_samples)
        self.val_gen = self._make_generator(
            self.val_ids, shuffle=False, max_pair=100,
            context_size=context_size, negative_samples=negative_samples)

        # One embedding tower, shared by the target and the context tokens.
        embedding_model = keras.models.Sequential()
        embedding_model.add(keras.layers.Input(shape=(None,)))
        embedding_model.add(keras.layers.Embedding(
            self.vocab_size, embedding_size, embeddings_regularizer=keras.regularizers.L2(reg)))
        embedding_model.add(keras.layers.BatchNormalization())

        target_input = keras.layers.Input(shape=(1,))
        context_input = keras.layers.Input(shape=(None,))
        target_emb = embedding_model(target_input)
        context_emb = embedding_model(context_input)
        # Softmax over the target/context dot products; the loss below therefore
        # receives probabilities (from_logits=False).
        out = keras.activations.softmax(dotlayer()(target_emb, context_emb))

        model = keras.Model(inputs=[target_input, context_input], outputs=out)
        model.compile(
            optimizer=keras.optimizers.Adam(lr),
            loss=keras.losses.CategoricalCrossentropy(from_logits=False),
            metrics=['AUC'],
        )
        return model

    def fit(self, hp, model, epochs=1, callbacks=None, **kwargs):
        """Train ``model`` on the generators created by the last ``build`` call.

        Args:
            hp: hyperparameters of the current trial (not used here).
            model: compiled model returned by ``build``.
            epochs: number of training epochs.
            callbacks: Keras callbacks to use during training.
            **kwargs: further ``model.fit`` options (e.g. ``verbose``). They
                used to be dropped silently and are now forwarded.

        Returns:
            The ``History`` object returned by ``model.fit``.
        """
        # The data (and its batching) is owned by the generators, so data-related
        # options are not forwarded; they were ignored before as well.
        owned_by_generators = ("x", "y", "validation_data", "batch_size")
        extra_options = {k: v for k, v in kwargs.items() if k not in owned_by_generators}
        return model.fit(
            self.train_gen,
            validation_data=self.val_gen,
            epochs=epochs,
            callbacks=callbacks,
            **extra_options,
        )

############################################################################################################################## Hyperparameter search ################################################################################################################################################

# Callbacks
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
    verbose=0,
)
tensorboard_callback = keras.callbacks.TensorBoard(f"{model_path}/HP_search/TensorBoard")

# Tuner: random search over the hyperparameters declared in W2VHyperModel.build,
# ranked by validation AUC. overwrite=True starts the project from scratch.
search_settings = {
    "objective": "val_AUC",
    "max_trials": 20,
    "executions_per_trial": 3,
    "overwrite": True,
    "directory": f"{model_path}/HP_search",
    "project_name": "Genetic Word2Vec",
}
tuner = kt.RandomSearch(
    hypermodel=W2VHyperModel(training_IDs, tuning_IDs),
    **search_settings,
)

tuner.search_space_summary()

# Start search
search_callbacks = [early_stopping_callback, tensorboard_callback]
tuner.search(epochs=20, callbacks=search_callbacks)

## Main train

Loads the best hyperparameters found in the tuning stage.
Trains the final model on the full training set using early stopping and saves:
- Full model architecture and weights in `.keras` format
- Weights separately in `.h5` format

This final model is used for downstream genomic analysis or phenotype prediction.

In [None]:
# Re-open the finished search (overwrite=False keeps the stored results) using the
# same settings as the search script, then show the three best trials.
search_settings = {
    "objective": "val_AUC",
    "max_trials": 20,
    "executions_per_trial": 3,
    "overwrite": False,
    "directory": f"{model_path}/HP_search",
    "project_name": "Genetic Word2Vec",
}
tuner = kt.RandomSearch(
    hypermodel=W2VHyperModel(training_IDs, tuning_IDs),
    **search_settings,
)

tuner.results_summary(3)

Run in a tmux session

In [None]:
%%writefile /tmp/final_train.py
############################################################################################################################## Imports and configurations #########################################################################################################################################

#Numeric
import numpy as np
import pandas as pd
#DL
import keras
import keras_tuner as kt
import tensorflow as tf
#System
from pymongo import MongoClient
import sys
#Tokenizers
import sentencepiece as spm
#Graphic
import matplotlib.pyplot as plt
#Custom
# Placeholder: replace with the directory that contains DataGenerator.py.
# (The value used to be wrapped in a second pair of quotes, which made the
# quote characters part of the path.)
sys.path.append("------")
import DataGenerator as dg

############################################################################################################################## Important paths #####################################################################################################################################################

# Placeholders ("------"): replace each one with the real value before running.
db_name = "------"  # MongoDB database name
collection_name = "------"  # MongoDB collection name
tokenizer_model_path = "------"  # BPE tokenizer model
partitions_path = "------"  # CSV with the train / tune / test partition IDs
model_path = "------"  # where the model output files are saved

############################################################################################################################## Load data ###########################################################################################################################################################

# Read the partition table and collect the organism IDs belonging to each split.
partitions = pd.read_csv(partitions_path)
split_of = partitions['partition']
training_IDs = list(partitions.loc[split_of.eq('Train'), 'ID'])
tuning_IDs = list(partitions.loc[split_of.eq('Tune'), 'ID'])
testing_IDs = list(partitions.loc[split_of.eq('Test'), 'ID'])

# Report how many organisms ended up in each split.
print(f' Training: {len(training_IDs)} \n Tune: {len(tuning_IDs)} \n Test: {len(testing_IDs)}')

############################################################################################################################## Model builder ########################################################################################################################################################

class dotlayer(keras.layers.Layer):
    """Dot-product similarity between two embedding tensors.

    The two inputs are contracted with an Einstein summation; the layer has
    no weights and no options of its own.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x1, x2):
        # x1 has axes (b, f, c) and x2 has axes (b, e, c): the shared c axis and
        # x1's f axis are summed out, leaving one score per (b, e) entry.
        contraction = 'bfc,bec->be'
        return keras.ops.einsum(contraction, x1, x2)

class W2VHyperModel(kt.HyperModel):
    """KerasTuner hypermodel for a skip-gram Word2Vec model.

    ``build`` samples the hyperparameters, creates the training and validation
    data generators (context window and number of negative samples are tuned
    as well) and returns the compiled model. ``fit`` trains that model on the
    generators created by the most recent ``build`` call, so ``build`` must be
    called before ``fit``.

    Reads the module-level ``db_name``, ``collection_name`` and
    ``tokenizer_model_path`` settings.
    """

    def __init__(self, train_ids, val_ids, vocab_size=12000, batch_size=100):
        """Store the data split and the shared generator/model settings.

        Args:
            train_ids: organism IDs handed to the training data generator.
            val_ids: organism IDs handed to the validation data generator.
            vocab_size: vocabulary size, used for both the data generators and
                the embedding table so the two can never disagree.
            batch_size: batch size given to both data generators.
        """
        # The base initialiser used to be skipped; run it so the HyperModel
        # base class is set up the way KerasTuner expects.
        super().__init__()
        keras.backend.clear_session(free_memory=True)
        self.train_ids = train_ids
        self.val_ids = val_ids
        self.vocab_size = vocab_size
        self.batch_size = batch_size

    def _make_generator(self, organism_ids, shuffle, max_pair, context_size, negative_samples):
        """Create a DataGenerator over ``organism_ids`` with the shared settings."""
        return dg.DataGenerator(
            db_name,
            collection_name,
            organism_IDs=organism_ids,
            tokenizer_path=tokenizer_model_path,
            shuffle=shuffle,
            batch_size=self.batch_size,
            context_size=context_size,
            negative_samples=negative_samples,
            vocab_size=self.vocab_size,
            max_pair=max_pair,
        )

    def build(self, hp):
        """Sample hyperparameters, create the data generators and compile the model.

        Args:
            hp: ``kt.HyperParameters`` container to sample from.

        Returns:
            The compiled ``keras.Model`` taking ``[target, context]`` inputs.
        """
        # With sampling="log", `step` is the multiplicative factor between candidates.
        embedding_size = hp.Int('embedding size', min_value=32, max_value=512, step=2, sampling="log")
        lr = hp.Float("learning rate", min_value=1e-3, max_value=1e-1, step=5, sampling="log")
        reg = hp.Float("lambda", min_value=1e-5, max_value=1e-3, step=10, sampling="log")
        context_size = hp.Choice('context size', values=[5, 9, 13, 17, 21, 25])
        negative_samples = hp.Choice('negative samples', values=[3, 5, 7])

        # The generators depend on tuned values, so they are rebuilt on every build() call.
        self.train_gen = self._make_generator(
            self.train_ids, shuffle=True, max_pair=1000,
            context_size=context_size, negative_samples=negative_samples)
        self.val_gen = self._make_generator(
            self.val_ids, shuffle=False, max_pair=100,
            context_size=context_size, negative_samples=negative_samples)

        # One embedding tower, shared by the target and the context tokens.
        embedding_model = keras.models.Sequential()
        embedding_model.add(keras.layers.Input(shape=(None,)))
        embedding_model.add(keras.layers.Embedding(
            self.vocab_size, embedding_size, embeddings_regularizer=keras.regularizers.L2(reg)))
        embedding_model.add(keras.layers.BatchNormalization())

        target_input = keras.layers.Input(shape=(1,))
        context_input = keras.layers.Input(shape=(None,))
        target_emb = embedding_model(target_input)
        context_emb = embedding_model(context_input)
        # Softmax over the target/context dot products; the loss below therefore
        # receives probabilities (from_logits=False).
        out = keras.activations.softmax(dotlayer()(target_emb, context_emb))

        model = keras.Model(inputs=[target_input, context_input], outputs=out)
        model.compile(
            optimizer=keras.optimizers.Adam(lr),
            loss=keras.losses.CategoricalCrossentropy(from_logits=False),
            metrics=['AUC'],
        )
        return model

    def fit(self, hp, model, epochs=1, callbacks=None, **kwargs):
        """Train ``model`` on the generators created by the last ``build`` call.

        Args:
            hp: hyperparameters of the current trial (not used here).
            model: compiled model returned by ``build``.
            epochs: number of training epochs.
            callbacks: Keras callbacks to use during training.
            **kwargs: further ``model.fit`` options (e.g. ``verbose``). They
                used to be dropped silently and are now forwarded.

        Returns:
            The ``History`` object returned by ``model.fit``.
        """
        # The data (and its batching) is owned by the generators, so data-related
        # options are not forwarded; they were ignored before as well.
        owned_by_generators = ("x", "y", "validation_data", "batch_size")
        extra_options = {k: v for k, v in kwargs.items() if k not in owned_by_generators}
        return model.fit(
            self.train_gen,
            validation_data=self.val_gen,
            epochs=epochs,
            callbacks=callbacks,
            **extra_options,
        )
############################################################################################################################## Final train ################################################################################################################################################
# Load tuner (overwrite=False re-opens the results stored by the search script;
# the settings must match the ones used there).
tuner = kt.RandomSearch(
        hypermodel=W2VHyperModel(training_IDs, tuning_IDs),
        objective="val_AUC",
        max_trials=20,
        executions_per_trial=3,
        overwrite=False,
        directory=f"{model_path}/HP_search",
        project_name="Genetic Word2Vec")

# Output directory: the structure plot below is written into it before training
# starts, so make sure it exists instead of failing on a fresh model_path.
import os  # local to this section; only needed for the directory creation
final_dir = f"{model_path}/final model"
os.makedirs(final_dir, exist_ok=True)

# Callbacks (an improvement in val_loss must exceed min_delta to count)
early_stopping_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='min',restore_best_weights=True, min_delta=0.01)
tensorboard_callback = keras.callbacks.TensorBoard(f"{final_dir}/TensorBoard")

# Build model from the best hyperparameters found by the search
best_hp = tuner.get_best_hyperparameters()[0]
hm = W2VHyperModel(training_IDs, tuning_IDs)
w2vmodel = hm.build(best_hp)
keras.utils.plot_model(w2vmodel, show_shapes=True, to_file=f'{final_dir}/W2V structure.png')

# Start train: the epoch count is effectively unbounded, early stopping ends the run
hm.fit(best_hp, w2vmodel, epochs=1_000_000, callbacks=[early_stopping_callback, tensorboard_callback])

# Save the full model and, separately, its weights
w2vmodel.save(f"{final_dir}/W2Vstructure.keras")
w2vmodel.save_weights(f"{final_dir}/W2V.weights.h5")