## Acknowledgments

Parts of this codebase were adapted from:

- https://github.com/vishakhpk/iter-extrapolation — which implements the iterative controlled extrapolation method
- https://github.com/huggingface/transformers — for model loading, fine-tuning, and tokenization

We thank the original authors for making their work openly available.

In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
from utils.model_generation_modules import load_model
from utils.model_generation_modules import get_tgt_seq, get_mut_fromseq, get_seq_frommut, generating_muts

  from .autonotebook import tqdm as notebook_tqdm


#### Load Model

In [3]:
# Foundation model ID and dataset name (both are handed to load_model later)
model_id = "Rostlab/prot_t5_xl_uniref50"
dataset_name = "PbrR"
# Pick the device for the model: first CUDA GPU when available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [4]:
# Load the Trained Model
# load_model (from utils.model_generation_modules) is called with the foundation
# model ID, the dataset name and the target device, and returns a
# (tokenizer, model) pair.
# NOTE(review): the module summary printed under this cell shows a
# PeftModelForSeq2SeqLM wrapping a T5ForConditionalGeneration with LoRA layers;
# confirm in load_model that this printout comes from there.
tokenizer, loaded_model = load_model(model_id, dataset_name, device)



PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(130, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(130, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(
                    in_features=1024, out_features=4096, bias=False
                    (lora_dropout): Dropout(p=0.05, inplace=False)
                    (lora_A): Linear(in_features=1024, out_features=16, bias=False)
                    (lora_B): Linear(in_features=16, out_features=4096, bias=False)
                  )
                  (k): Linear(in_features=1024, out_features=4096, bias=False)
                  (v): Linear(
                    in_features=1024, out_features=4096, bias=False
                    (lora_dropout): Dropout(p=0.05, inplace=False)
                    (lora_A):

#### Generate Sequences

In [5]:
# Read the experimental table and assign fixed column names
data = pd.read_csv('input.csv')
data.columns = ["Variant", "Pb", "Zn", "seq"]
# Seed sequences (noted as better than WT): rows with Pb above 2.0 and Zn below 0.5
starting_points = data[(data["Pb"] > 2.0) & (data["Zn"] < 0.5)]

# Variants that have already been seen in experiments, in canonical form:
# the '_'-separated parts of each variant name are sorted before re-joining,
# so the same set of mutations always maps to the same string
sorted_mutant_list = [
    '_'.join(sorted(variant.split('_')))
    for variant in data["Variant"].tolist()
]

In [6]:
# Generate new mutations from the seed sequences, passing along the list of
# already-seen variants.
# NOTE(review): the meaning of the first two positional arguments (0 and 1.0)
# is not visible here; check generating_muts in utils.model_generation_modules.
generated_mutants = generating_muts(
    0,
    1.0,
    starting_points,
    tokenizer,
    device,
    loaded_model,
    sorted_mutant_list,
)



Processed 0 sequences, number of new mutations = 0
Processed 1 sequences, number of new mutations = 0
Processed 2 sequences, number of new mutations = 0
Processed 3 sequences, number of new mutations = 0
Processed 4 sequences, number of new mutations = 1
Processed 5 sequences, number of new mutations = 1
Processed 6 sequences, number of new mutations = 1
Processed 7 sequences, number of new mutations = 1


In [7]:
# Show the generated mutants; as the cell's last expression, the notebook
# echoes its value (an array of variant-name strings in the output below)
generated_mutants

array(['D135W_V46I'], dtype='<U10')