In [None]:
# OPTIONAL: If using google colab, mount google drive for saving/loading files and pip install the required packages
from google.colab import drive
drive.mount('/content/drive')
!pip install datasets
!pip install transformers[torch]

In [None]:
# Locations of the four input CSV files: the full LazBF / LazDEF sequence sets
# and their smaller samples. Edit the right-hand side if the files live
# somewhere else; by default they are located in /LazBFDEF/data/.
(
    LazBF_sequences_path,
    LazBF_sample_path,
    LazDEF_sequences_path,
    LazDEF_sample_path,
) = (
    '../data/LazBF_sequences.csv',
    '../data/LazBF_sample.csv',
    '../data/LazDEF_sequences.csv',
    '../data/LazDEF_sample.csv',
)

In [None]:
# Trains LazDEF-ESM through MLM on LazDEF substrate data
# Extracts LazBF/DEF embeddings from LazDEF-ESM
# Embeddings will be saved as .npy files in ../Embeddings/

# Alternatively, if one does not wish to retrain LazDEF-ESM, one can
# skip this cell and proceed to the next one, which loads the
# pretrained LazDEF-ESM and extracts the LazBF/DEF embeddings from
# that pretrained model

# Imports
import torch
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import ndcg_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, SVC
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from datasets import Dataset
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Read the four CSV files; each one supplies a 'sequences' and a 'labels' column.
# `df` is reused as scratch space and ends up holding the LazDEF sample frame.
df = pd.read_csv(LazBF_sequences_path)
LazBF_sequences, LazBF_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazBF_sample_path)
LazBF_sample, LazBF_sample_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazDEF_sequences_path)
LazDEF_sequences, LazDEF_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazDEF_sample_path)
LazDEF_sample, LazDEF_sample_labels = df['sequences'].tolist(), df['labels'].tolist()

# Base ESM-2 masked-LM checkpoint that will be fine-tuned, moved to `device`
model = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t33_650M_UR50D")
model = model.to(device)

# Matching tokenizer, plus a collator configured for MLM with probability 0.15
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training set: all LazDEF sequences, shuffled with a fixed seed.
# Evaluation set: the LazDEF sample, kept in file order.
train = Dataset.from_dict(tokenizer(LazDEF_sequences)).shuffle(seed=42)
test = Dataset.from_dict(tokenizer(LazDEF_sample))

import inspect


def _pick_kwarg(target, preferred, legacy):
    """Return `preferred` if `target` accepts it as a keyword, else `legacy`.

    transformers renamed some keywords between releases
    (`evaluation_strategy` -> `eval_strategy`, `tokenizer` -> `processing_class`).
    Choosing the name from the installed signature keeps this cell working
    with whichever version the unpinned pip install resolved.
    """
    accepted = inspect.signature(target).parameters
    return preferred if preferred in accepted else legacy


# Hyperparameters for one epoch of masked-language-model fine-tuning.
# Evaluation and checkpointing both happen once per epoch, which
# load_best_model_at_end requires (the two strategies must match).
args = TrainingArguments(
    output_dir='../Models/LazDEF_ESM',
    save_strategy='epoch',
    learning_rate=3e-6,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=False,
    # Mixed precision needs CUDA; enable it only when a GPU is present so the
    # CPU fallback chosen for `device` does not fail at construction time.
    fp16=torch.cuda.is_available(),
    # 256 sequences per device x 2 accumulation steps per optimizer update
    gradient_accumulation_steps=2,
    report_to='none',
    **{_pick_kwarg(TrainingArguments, 'eval_strategy', 'evaluation_strategy'): 'epoch'},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    **{_pick_kwarg(Trainer, 'processing_class', 'tokenizer'): tokenizer},
)
# Do MLM for one epoch on LazDEF MLM data set
trainer.train()

# Mean-pooled sequence embedding from the LazDEF-ESM model held in `model`
def get_mean_rep(sequence):
    """Return the mean last-layer embedding of `sequence` as a NumPy array.

    The sequence is tokenized with the global `tokenizer`, passed through the
    global `model` with gradient tracking disabled, and the final hidden layer
    of the first (only) batch item is averaged over all token positions.

    NOTE(review): the average covers every position the tokenizer emits,
    including any special tokens it adds - confirm this is intended.
    """
    encoded = tokenizer(sequence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(encoded.input_ids, output_hidden_states=True)
    last_layer = outputs.hidden_states[-1]
    return last_layer[0].mean(dim=0).cpu().numpy()

import os

# Put the model in evaluation mode before extracting embeddings
model.eval()

# Create the output directory first: np.save does not create missing parent
# directories, and failing after the long extraction loop would discard it.
os.makedirs('../Embeddings', exist_ok=True)

# Get LazBF embeddings from LazDEF-ESM (one mean embedding per sampled sequence)
LazBF_embs = np.array([get_mean_rep(seq) for seq in tqdm(LazBF_sample)])
np.save('../Embeddings/LazBF_mlm_LazDEF.npy', LazBF_embs)

# Get LazDEF embeddings from LazDEF-ESM
LazDEF_embs = np.array([get_mean_rep(seq) for seq in tqdm(LazDEF_sample)])
np.save('../Embeddings/LazDEF_mlm_LazDEF.npy', LazDEF_embs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
0,2.3908,2.380886


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight'].
100%|██████████| 50000/50000 [41:55<00:00, 19.88it/s]
100%|██████████| 50000/50000 [41:48<00:00, 19.94it/s]


In [None]:
# Loads the pretrained LazDEF-ESM model
# Extracts LazBF/DEF embeddings from LazDEF-ESM
# Embeddings will be saved as .npy files in ../Embeddings/

# Imports
import torch
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import ndcg_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, SVC
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from datasets import Dataset
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Read the four CSV files; each one supplies a 'sequences' and a 'labels' column.
# `df` is reused as scratch space and ends up holding the LazDEF sample frame.
df = pd.read_csv(LazBF_sequences_path)
LazBF_sequences, LazBF_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazBF_sample_path)
LazBF_sample, LazBF_sample_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazDEF_sequences_path)
LazDEF_sequences, LazDEF_labels = df['sequences'].tolist(), df['labels'].tolist()

df = pd.read_csv(LazDEF_sample_path)
LazDEF_sample, LazDEF_sample_labels = df['sequences'].tolist(), df['labels'].tolist()

# Load the pretrained LazDEF-ESM checkpoint and move it to `device`
model = AutoModelForMaskedLM.from_pretrained("ShuklaGroupIllinois/LazDEF_ESM2")
model = model.to(device)
# The tokenizer is taken from the base ESM-2 checkpoint
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

# Mean-pooled sequence embedding from the LazDEF-ESM model held in `model`
def get_mean_rep(sequence):
    """Return the mean last-layer embedding of `sequence` as a NumPy array.

    The sequence is tokenized with the global `tokenizer`, passed through the
    global `model` with gradient tracking disabled, and the final hidden layer
    of the first (only) batch item is averaged over all token positions.

    NOTE(review): the average covers every position the tokenizer emits,
    including any special tokens it adds - confirm this is intended.
    """
    encoded = tokenizer(sequence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(encoded.input_ids, output_hidden_states=True)
    last_layer = outputs.hidden_states[-1]
    return last_layer[0].mean(dim=0).cpu().numpy()

import os

# Put the model in evaluation mode before extracting embeddings
model.eval()

# Create the output directory first: np.save does not create missing parent
# directories, and failing after the long extraction loop would discard it.
os.makedirs('../Embeddings', exist_ok=True)

# Get LazBF embeddings from LazDEF-ESM (one mean embedding per sampled sequence)
LazBF_embs = np.array([get_mean_rep(seq) for seq in tqdm(LazBF_sample)])
np.save('../Embeddings/LazBF_mlm_LazDEF.npy', LazBF_embs)

# Get LazDEF embeddings from LazDEF-ESM
LazDEF_embs = np.array([get_mean_rep(seq) for seq in tqdm(LazDEF_sample)])
np.save('../Embeddings/LazDEF_mlm_LazDEF.npy', LazDEF_embs)