In [15]:
import torch
import esm
import joblib
import numpy as np
from pathlib import Path

# Load ESM model once
esm_model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()
esm_model.eval()

# Load fitted MinMaxScaler
scaler_path = Path("saved_models/esm_scaler.pkl")
scaler = joblib.load(scaler_path)


def compute_esm(seq: str):
    """
    Compute normalized ESM embedding for a single sequence.
    """
    formatted_input = [("seq1", seq)]
    batch_labels, batch_strs, batch_tokens = batch_converter(formatted_input)
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    with torch.no_grad():
        results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)

    token_representations = results["representations"][6]
    rep = token_representations[0, 1 : batch_lens[0] - 1].mean(0)
    rep_np = rep.numpy().reshape(1, -1)  # reshape for scaler

    rep_scaled = scaler.transform(rep_np)  # normalize
    return rep_scaled[0].tolist()

In [16]:
a = compute_esm("IVY")

In [18]:
# change a into pandas dataframe
import pandas as pd
df = pd.DataFrame([a], columns=[f"esm_{i}" for i in range(len(a))])

In [19]:
df

Unnamed: 0,esm_0,esm_1,esm_2,esm_3,esm_4,esm_5,esm_6,esm_7,esm_8,esm_9,...,esm_310,esm_311,esm_312,esm_313,esm_314,esm_315,esm_316,esm_317,esm_318,esm_319
0,0.665039,0.635418,0.562497,0.50322,0.571498,0.479303,0.4677,0.336468,0.447972,0.344916,...,0.652191,0.231072,0.705864,0.297982,0.680502,0.408164,0.335687,0.179551,0.78728,0.593365


In [20]:
import pandas as pd
import numpy as np
import re
import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesRegressor
from sklearn.model_selection import LeaveOneOut, cross_validate, cross_val_score, StratifiedShuffleSplit, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import confusion_matrix
import math
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree
from sklearn.model_selection import GridSearchCV, LeaveOneOut  # Model selection tools


import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [21]:
def esm_embeddings(peptide_sequence_list):
  # NOTICE: ESM for embeddings is quite RAM usage, if your sequence is too long, 
  #         or you have too many sequences for transformation in a single converting, 
  #         you conputer might automatically kill the job.
  import torch
  import esm
  import collections
  # load the model
  # NOTICE: if the model was not downloaded in your local environment, it will automatically download it.
  model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
  batch_converter = alphabet.get_batch_converter()
  model.eval()  # disables dropout for deterministic results

  # load the peptide sequence list into the bach_converter
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
  ## batch tokens are the embedding results of the whole data set

  # Extract per-residue representations (on CPU)
  with torch.no_grad():
      # Here we export the last layer of the EMS model output as the representation of the peptides
      # model'esm2_t6_8M_UR50D' only has 6 layers, and therefore repr_layers parameters is equal to 6
      results = model(batch_tokens, repr_layers=[6], return_contacts=True)  
  token_representations = results["representations"][6]

  # Generate per-sequence representations via averaging
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
  sequence_representations = []
  for i, tokens_len in enumerate(batch_lens):
      sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
  # save dataset
  # sequence_representations is a list and each element is a tensor
  embeddings_results = collections.defaultdict(list)
  for i in range(len(sequence_representations)):
      # tensor can be transformed as numpy sequence_representations[0].numpy() or sequence_representations[0].to_list
      each_seq_rep = sequence_representations[i].tolist()
      for each_element in each_seq_rep:
          embeddings_results[i].append(each_element)
  embeddings_results = pd.DataFrame(embeddings_results).T
  return embeddings_results

In [36]:
peptide_sequence_list = []
for seq in ["IVY"]:
    format_seq = [seq,seq] # the setting is just following the input format setting in ESM model, [name,sequence]
    tuple_sequence = tuple(format_seq)
    peptide_sequence_list.append(tuple_sequence) # build a summarize list variable including all the sequence information

In [37]:
peptide_sequence_list

[('IVY', 'IVY')]

In [38]:
embeddings_results = esm_embeddings(peptide_sequence_list)

In [39]:
embeddings_results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
0,0.086059,-0.258805,0.391053,0.326945,0.011611,-0.061046,-0.268043,-0.203712,-0.078116,-0.179269,...,0.209114,-0.141731,0.192254,-0.111785,0.11312,0.026103,0.011826,-0.032042,0.387591,0.001597


In [40]:
embeddings_results = scaler.transform(embeddings_results) # normalize X to 0-1 range 

In [41]:
df.columns = range(df.shape[1])

In [42]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
0,0.665039,0.635418,0.562497,0.50322,0.571498,0.479303,0.4677,0.336468,0.447972,0.344916,...,0.652191,0.231072,0.705864,0.297982,0.680502,0.408164,0.335687,0.179551,0.78728,0.593365


In [48]:
embeddings_results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
0,0.665039,0.635418,0.562497,0.50322,0.571498,0.479303,0.4677,0.336468,0.447972,0.344916,...,0.652191,0.231072,0.705864,0.297982,0.680502,0.408164,0.335687,0.179551,0.78728,0.593365


In [45]:
import pandas as pd

embeddings_results = pd.DataFrame(embeddings_results)

In [47]:
import pandas as pd

# Assume df1 and df2 are your two DataFrames
first_row_df1 = df.iloc[0]
first_row_df2 = embeddings_results.iloc[0]

# Element-wise comparison
comparison = first_row_df1 == first_row_df2

# Check if all elements match
all_equal = comparison.all()

print("Are the first rows equal?", all_equal)

Are the first rows equal? False
