# Leveraging ancestral sequence reconstruction for protein representation learning

In [1]:
import sys
sys.path.append("./")

import pandas as pd
from pathlib import Path

from lase_tx import (
    data_processing,
    lase_model,
    training,
)

In [2]:
# LASE encoder hyperparameters.
# The architecture values are passed positionally to both
# training.build_encoder and lase_model.Encoder in later cells.
n_layers, hidden_dim, num_heads = 6, 128, 4
dropout_pr = 0.1

# Only handed to lase_model.Encoder; training.build_encoder is called without it.
vocab_size = 20

In [3]:
# Regression data (CSV read later with pandas)
reg_path = Path("./data/PTE_training_dset.csv")

# PLM training data and the directory handed to train_encoder as weight_dir
save_dir = Path("../output/")
asr_fasta = Path("../All_combined_processed_ancs_NR100.fasta")

# The second axis of the prepared ASR sequence array is used as the
# maximum sequence length for the encoder.
asr_seq_arr = data_processing.prepare_seqs(fasta_path=asr_fasta)
max_seq_len = asr_seq_arr.shape[1]

## PLM training

In [27]:
# Construct the encoder from the notebook-level hyperparameters and the
# maximum sequence length derived from the ASR sequences.
encoder = training.build_encoder(n_layers, hidden_dim, num_heads, max_seq_len, dropout_pr)

# NOTE(review): the output recorded under this cell ends in an
# InvalidArgumentError raised by an Embedding layer (index 21 is outside
# [0, 21)) — confirm that the token ids produced for the FASTA fit the
# encoder's embedding table before re-running.

# Train on the ASR FASTA and save the model; save_dir is passed as weight_dir.
training.train_encoder(
    max_epochs=25,
    init_seed=0,
    weight_dir=save_dir,
    fasta_path=asr_fasta,
    encoder=encoder,
)



Model: "encoder_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 positional_embedding_2 (Po  multiple                  2688      
 sitionalEmbedding)                                              
                                                                 
 encoder_layer_12 (EncoderL  multiple                  396032    
 ayer)                                                           
                                                                 
 encoder_layer_13 (EncoderL  multiple                  396032    
 ayer)                                                           
                                                                 
 encoder_layer_14 (EncoderL  multiple                  396032    
 ayer)                                                           
                                                                 
 encoder_layer_15 (EncoderL  multiple                  39

0it [00:00, ?it/s]


InvalidArgumentError: Exception encountered when calling layer 'embedding_2' (type Embedding).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[22,166] = 21 is not in [0, 21) [Op:ResourceGather] name: 

Call arguments received by layer 'embedding_2' (type Embedding):
  • inputs=tf.Tensor(shape=(32, 365), dtype=int32)

## Extract representations

In [5]:
# Read the regression dataset: labels come from the "Fitness" column and
# sequences from the "Sequence" column.
df = pd.read_csv(reg_path)
reg_label_arr = df["Fitness"].to_numpy()
reg_seq_ls = df["Sequence"].tolist()

# Convert the raw sequences with the same helper used for the ASR FASTA.
reg_seq_arr = data_processing.prepare_seqs(seq_ls=reg_seq_ls)

In [None]:
# load pre-trained model

# Instantiate the encoder from the notebook-level hyperparameters.
encoder_model = lase_model.Encoder(
    n_layers,
    hidden_dim,
    num_heads,
    vocab_size,
    max_seq_len,
    dropout_pr,
)

# save_dir is a pathlib.Path, so `save_dir + ""` raises TypeError.
# Join with `/` instead and pass the result on as a plain string.
# TODO: set weights_name to the file/prefix written by training.train_encoder.
weights_name = ""
encoder_model.load_weights(str(save_dir / weights_name))

# extract representations for regression data
# The call returns three values; only the middle one is kept as rep_arr.
_, rep_arr, _ = encoder_model(reg_seq_arr)

# Train Scikit-learn regression models

Scikit-learn regression models are trained using the shell script `./sklearn_regressors/sklearn_parallel.sh`.

# *In Silico* Evolution