# SEMA-1D 
SEMA-1D is a fine-tuned ESM-2 model aimed to predict epitope resiudes based on antigen protein sequence

In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import os
# set cuda params
# 'TORCH_HOME'directory will be used to save origenal esm-1v weights
os.environ['TORCH_HOME'] = "./torch_hub"
#os.environ['CUDA_VISIBLE_DEVICES'] = "4"

#create direactory to weights storage
if not os.path.exists("./models/"):
    os.makedirs("./models/")

In [2]:
# load pre-trained model
import esm
# https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t36_3B_UR50D.pt
# 	https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt

# database UniRef90
model_esm1v, esm1v_alphabet = esm.pretrained.esm1v_t33_650M_UR90S_1()

batch_converter = esm1v_alphabet.get_batch_converter()



In [3]:

from custom_dataset import PdbDataset

agg_names = {'resi_pos': list, 'resi_aa': list, 'contact_number': list}

# train data
train_set = pd.read_csv('./data/sema_2.0/train_set.csv')
agg_names = {'resi_pos': list, 'resi_aa': list, 'contact_number': list}
train_set = train_set.groupby('pdb_id_chain').agg(agg_names).reset_index()
train_ds = PdbDataset(batch_converter, train_set[['resi_aa', 'contact_number']])

# test data
test_set = pd.read_csv('./data/sema_2.0/test_set.csv')
agg_names = {'resi_pos': list, 'resi_aa': list, 'contact_number_binary': list}
test_set = test_set.groupby('pdb_id_chain').agg(agg_names).reset_index()
test_ds = PdbDataset(batch_converter, test_set[['resi_aa', 'contact_number_binary']])

print(len(train_ds), len(test_ds))

1544 101


In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results_fold' ,          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    learning_rate=1e-05,             # learning rate
    weight_decay=0.0,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # How often to print logs
    save_strategy = "no",
    do_train=True,                   # Perform training
    do_eval=True,                    # Perform evaluation
    eval_strategy="epoch",     # evalute after each epoch
    gradient_accumulation_steps=1,  # total number of steps before back propagation
    fp16=True,                       # Use mixed precision
    run_name="PDB_binary",      # experiment name
    seed=42,                         # Seed for experiment reproducibility
    load_best_model_at_end=False,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    use_cpu = True
    #remove_unused_columns=False
)

2024-11-11 14:02:41.021957: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-11 14:02:41.028496: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-11 14:02:41.035428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-11 14:02:41.037496: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 14:02:41.043273: I tensorflow/core/platform/cpu_feature_guar

In [13]:
from train_esm import MaskedRegressTrainer
from transformers import Trainer, TrainingArguments, EvalPrediction

from custom_model import *
from compute_metrics import ComputeMetrics


model = ESM2t30(pretrained_no = 1).cuda()

trainer = MaskedRegressTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator = lambda x: x[0] if len(x)==1 else x,
    compute_metrics = ComputeMetrics.text_regression,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Pearson R,Mse,R2 Score
1,0.1599,0.148248,0.280917,0.188549,-0.13799
2,0.1352,0.148446,0.275824,0.189982,-0.146637


TrainOutput(global_step=3088, training_loss=0.1548413087666961, metrics={'train_runtime': 2111.7323, 'train_samples_per_second': 1.462, 'train_steps_per_second': 1.462, 'total_flos': 0.0, 'train_loss': 0.1548413087666961, 'epoch': 2.0})

In [15]:
#save weights
torch.save(trainer.model.state_dict(), "./models/sema_1d_ESM2_t30.pth")