In [1]:
# %%
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import trange
import random
import math
import statistics 
import os

import torch
import torch.nn as nn
# torch.set_float32_matmul_precision("medium")

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, auc

from matplotlib import collections

import torch
from transformers import AutoModel, AutoTokenizer, EsmModel, AutoModelForMaskedLM


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# Prefer the first GPU when one is visible; otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("using device: ", device)

# One shared seed for every random number generator this notebook touches.
seed = 1
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Request deterministic cuDNN algorithm selection.
torch.backends.cudnn.deterministic = True

# %%
# pretrain_name = "esm2_t6_8M_UR50D"
# pretrain_tcr_name = "facebook/esm2_t12_35M_UR50D"
# pretrain_name = "esm2_t30_150M_UR50D"

# pretrain_peptide_name = "ibm/MoLFormer-XL-both-10pct"

#%%
# python train.py --split_id 0 --epochs 10 --save False
# import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument("--split_id", type = int, default = 0)
# parser.add_argument("--pretrain_name", type = str, default = "moleformer")
# parser.add_argument("--neg_generate_mode", type = str, default = "only-sampled-negs")

# parser.add_argument("--epochs", type = int, default = 10)
# parser.add_argument("--save", type = bool, default = True)
# parser.add_argument("--lr", type = float, default = 0.02)

# args = parser.parse_args()

from dotmap import DotMap

# Run configuration, replacing the argparse CLI sketched in the comments above.
# Only the options read further down are set; epochs, save and lr from that
# sketch are left out.
# pretrain_peptide_name may be "moleformer" or "SMILES_BERT".
args = DotMap(
    dict(
        split_id = 0,
        pretrain_tcr_name = "facebook/esm2_t12_35M_UR50D",
        pretrain_peptide_name = "moleformer",
        neg_generate_mode = "only-neg-assays",
    )
)

#%%
# Pick the train/validation data directory for the selected negative-sampling mode.
if args.neg_generate_mode == "only-neg-assays":
    DATA_BASE = "TEINet-master/data/tcrgen/traingen/genval/1neg/"
elif args.neg_generate_mode == "only-sampled-negs":
    DATA_BASE = "TEINet-master/data/tcrgen/traingen/genval/2neg/"
else:
    # Fail here with a clear message instead of a NameError on DATA_BASE below.
    raise ValueError(
        "neg_generate_mode should be one of only-neg-assays, only-sampled-negs; "
        f"got {args.neg_generate_mode!r}"
    )

# Root directory under which the embedding arrays are written.
EMEBEDS_BASE = "tc-hard/embeddings/TEIGen/"

# Per-split CSV locations.
# NOTE(review): the mode name is appended again beneath DATA_BASE (which already
# ends in 1neg/ or 2neg/) — confirm this matches the on-disk layout.
train_df_path = os.path.join(DATA_BASE, args.neg_generate_mode, f"train-{args.split_id}.csv")
validation_df_path = os.path.join(DATA_BASE, args.neg_generate_mode, f"validation-{args.split_id}.csv")
# The test split is read from a separate dataset tree.
test_df_path = os.path.join("tc-hard/dataset/new_split/pep+cdr3b/test", args.neg_generate_mode, f"test-{args.split_id}.csv")

# %%
from rdkit import Chem
from rdkit.Chem import AllChem
def amino_acid_to_smiles(sequence):
    """Convert an amino-acid sequence string to a SMILES string via RDKit.

    Args:
        sequence: sequence text handed to ``Chem.MolFromSequence``.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises:
        ValueError: if ``Chem.MolFromSequence`` returns ``None`` for ``sequence``.
    """
    molecule = Chem.MolFromSequence(sequence)
    if molecule is None:
        # Without this guard the failure surfaces inside MolToSmiles, with an
        # error that does not mention the offending sequence.
        raise ValueError(f"could not parse amino-acid sequence: {sequence!r}")
    return Chem.MolToSmiles(molecule)

# %%
# Read the three split CSVs in train / validation / test order.
train_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_df_path, validation_df_path, test_df_path)
)

using device:  cuda:0


In [3]:
# (rows, columns) of each split, in train / validation / test order.
tuple(frame.shape for frame in (train_df, validation_df, test_df))

((161187, 3), (43434, 3), (40480, 3))

In [4]:
# Map the raw CSV headers onto the short column names used by the embedding code.
COLUMN_RENAMES = {
    "CDR3.beta": "tcrb",
    "Epitope": "peptide",
    "Label": "label",
}

train_df = train_df.rename(columns = COLUMN_RENAMES)
validation_df = validation_df.rename(columns = COLUMN_RENAMES)

# test_df is later read through df["tcrb"] / df["peptide"] / df["label"] as well,
# so it gets the same mapping. rename() skips keys that are not present, so this
# is a no-op if the test CSV already uses the short names.
# NOTE(review): confirm the header names of the test CSV.
test_df = test_df.rename(columns = COLUMN_RENAMES)

In [5]:
# %%
# TCR encoder: tokenizer and ESM model for the checkpoint in args.pretrain_tcr_name.
tcr_tokenizer = AutoTokenizer.from_pretrained(args.pretrain_tcr_name)
tcr_embed_model = EsmModel.from_pretrained(args.pretrain_tcr_name)

# Peptide encoder: chosen by args.pretrain_peptide_name.
if args.pretrain_peptide_name == "moleformer":
    # trust_remote_code=True runs model code shipped with the checkpoint.
    peptide_tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)
    peptide_embed_model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)

elif args.pretrain_peptide_name == "SMILES_BERT":
    peptide_tokenizer = AutoTokenizer.from_pretrained("JuIm/SMILES_BERT")
    peptide_embed_model = AutoModelForMaskedLM.from_pretrained("JuIm/SMILES_BERT")

else:
    # Without this branch an unknown name only shows up later as a NameError
    # on peptide_embed_model.
    raise ValueError(
        "pretrain_peptide_name should be one of moleformer, SMILES_BERT; "
        f"got {args.pretrain_peptide_name!r}"
    )

# Move both encoders to the GPU when one is available.
if torch.cuda.is_available():
    tcr_embed_model = tcr_embed_model.cuda()
    peptide_embed_model = peptide_embed_model.cuda()

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t12_35M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# %%
def get_representations(sequence, tokenizer, embed_model):
    """Embed each sequence as the mean of its last-layer token states.

    Args:
        sequence: iterable of strings; each one is tokenized and encoded on its own.
        tokenizer: callable returning the model inputs for one string; it is
            called with ``return_tensors="pt"`` and its result must support
            ``.to(device)`` and ``**`` unpacking.
        embed_model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.

    Returns:
        Tensor with one row per input sequence, stacked along dim 0. It stays on
        the model's device and carries no autograd history.

    Raises:
        RuntimeError: from ``torch.stack`` when ``sequence`` is empty.
    """
    # Follow the model's own device so inputs and weights always match.
    first_param = next(embed_model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    sequence_representations = []
    # Inference only: without no_grad every forward pass keeps its autograd
    # graph alive until the caller detaches the stacked result.
    with torch.no_grad():
        for protein in sequence:
            inputs = tokenizer(protein, return_tensors="pt").to(target_device)
            outputs = embed_model(**inputs, output_hidden_states = True)
            # [0] picks the single sequence in this batch; the mean over dim 0 of
            # that slice averages across every token the tokenizer produced.
            sequence_representations.append(outputs.last_hidden_state[0].mean(dim = 0))

    return torch.stack(sequence_representations, dim = 0)


def get_embeddings(df, save_path, feature, split_set, batch_size = 1000):
    """Compute embeddings for one column of ``df`` and save them as a ``.npy`` file.

    For ``feature == "tcrb"`` every row of the column is embedded, in row order,
    ``batch_size`` rows at a time. For ``feature == "peptide"`` only the distinct
    values are embedded, in ``Series.unique()`` order, so the saved array has one
    row per unique peptide rather than one per row of ``df``.

    The file is written to
    ``<save_path>/<split_set>-<args.split_id>.<feature>.npy``.

    Args:
        df: DataFrame holding a column named ``feature``.
        save_path: directory for the output file; created if it does not exist.
        feature: ``"tcrb"`` or ``"peptide"``.
        split_set: ``"train"``, ``"validation"`` or ``"test"``.
        batch_size: rows per ``get_representations`` call for ``"tcrb"``.

    Raises:
        ValueError: on an unknown ``split_set`` / ``feature`` or an empty column.
            All of these are raised before any model is run.
        KeyError: if ``df`` has no column named ``feature``.
    """
    # Validate up front so a typo does not cost a full embedding pass.
    if split_set not in ("train", "validation", "test"):
        raise ValueError("split_set should be one of train, validation, test")

    feat_seq = df[feature]

    if feature not in ("tcrb", "peptide"):
        raise ValueError("feature should be one of tcrb, peptide")
    if len(feat_seq) == 0:
        raise ValueError(f"column {feature!r} has no rows to embed")

    if feature == "tcrb":
        # Step through the column by start offset so no empty trailing batch is
        # produced when the row count is an exact multiple of batch_size.
        chunks = []
        for start in trange(0, len(feat_seq), batch_size):
            batch = feat_seq.iloc[start: start + batch_size]
            batch_embeddings = get_representations(batch, tcr_tokenizer, tcr_embed_model)
            chunks.append(batch_embeddings.detach().cpu().numpy())
        # Join once at the end instead of re-copying the growing array per batch.
        result = np.concatenate(chunks, axis = 0)

    else:
        # NOTE(review): peptides are tokenized exactly as stored in df; confirm
        # whether they should first go through amino_acid_to_smiles for these
        # SMILES-based encoders.
        peptide_uniq = list(feat_seq.unique())

        if args.pretrain_peptide_name == "SMILES_BERT":
            with torch.no_grad():
                inputs = peptide_tokenizer(peptide_uniq, return_tensors="pt", truncation=True, padding=True)
                inputs = inputs.to(device)
                outputs = peptide_embed_model(**inputs, output_hidden_states = True)
            # Use the final layer and average over the token axis (dim 1) so there
            # is one vector per peptide; averaging over dim 0 would mix peptides.
            hidden = outputs.hidden_states[-1]
            if "attention_mask" in inputs:
                # Keep padding positions out of the average.
                mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                embeddings = (hidden * mask).sum(dim = 1) / mask.sum(dim = 1).clamp(min = 1)
            else:
                embeddings = hidden.mean(dim = 1)

        else:
            embeddings = get_representations(peptide_uniq, peptide_tokenizer, peptide_embed_model)

        result = embeddings.detach().cpu().numpy()

    # np.save does not create missing directories.
    if save_path:
        os.makedirs(save_path, exist_ok = True)
    out_file = os.path.join(save_path, f"{split_set}-{args.split_id}." + feature + ".npy")

    np.save(out_file, result)
    print("embeddings saved to ", out_file)

In [None]:
# %%
# Output directory: <embeddings root>/<peptide encoder name>/<negative-sampling mode>.
save_path = os.path.join(EMEBEDS_BASE, args.pretrain_peptide_name, args.neg_generate_mode)

def embedding_step():
    """Write TCR embeddings, peptide embeddings and labels for every split.

    For each of train / validation / test this calls ``get_embeddings`` for the
    "tcrb" and "peptide" columns, then saves each split's "label" column as
    ``<split>-<args.split_id>.label.npy`` under ``save_path``.
    """
    # np.save does not create missing directories; make sure the target exists
    # before any expensive embedding work starts.
    os.makedirs(save_path, exist_ok = True)

    splits = (("train", train_df), ("validation", validation_df), ("test", test_df))

    for split_set, split_df in splits:
        for feature in ("tcrb", "peptide"):
            get_embeddings(df = split_df, save_path = save_path, feature = feature, split_set = split_set)

    for split_set, split_df in splits:
        np.save(os.path.join(save_path, f"{split_set}-{args.split_id}.label.npy"), split_df["label"].values)

# %%
embedding_step()