In [27]:
import numpy as np
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import torch.multiprocessing as mp
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

import random
import math




In [None]:
# Local ESM-2 checkpoint directory and the tokenizer that ships with it
model_src = "/storage/homefs/ts18c034/MasterKinase/ESM-2/esm2_t12_35M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_src, from_tf=False)

# Kinase table, 20-substrate file (was 80 before)
dfm = pd.read_csv(
    "/storage/homefs/ts18c034/MasterKinase/data/HumanKinomeSeqFamilyClassification_centered_20substrates.csv"
)

In [29]:
class KinaseDataset(Dataset):
    """Map-style dataset pairing pre-tokenized sequences with integer label ids.

    Parameters
    ----------
    tokenized : mapping
        Tokenizer output; every value is indexed with the sample position.
    labels : sequence
        One raw label per sample. They are fitted and encoded with a
        ``LabelEncoder`` that is kept on the instance as ``label_encoder``.
    device : torch.device or str
        Target that each returned tensor is moved to in ``__getitem__``.
    """

    def __init__(self, tokenized, labels, device):
        self.tokenized = tokenized
        self.labels = labels
        self.device = device

        # Fit the encoder on the raw labels and store the resulting ids as int64
        self.label_encoder = LabelEncoder()
        label_ids = self.label_encoder.fit_transform(self.labels)
        self.encoded_labels = torch.tensor(label_ids, dtype=torch.long)

    def __len__(self):
        """Return the number of encoded labels."""
        return len(self.encoded_labels)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for position ``idx``, both moved to ``self.device``."""
        sample = {}
        for field_name, field_values in self.tokenized.items():
            sample[field_name] = field_values[idx].to(self.device)

        target = self.encoded_labels[idx].to(self.device)
        return sample, target

In [30]:
# Device used for the model and for every tensor the dataset hands out
device = torch.device(type="cpu")

In [33]:
# Raw labels, one per row of the table (the column name really ends with a space)
labels = dfm["HGNC Name "].to_list()

# Tokenize every sequence in one call; no truncation and no padding is requested
tokenized = tokenizer(
    dfm["Centered_Kinase_Sequence"].to_list(),
    return_tensors="pt",
    truncation=False,
)

In [34]:
# AutoModel loads the bare model without a classifier head,
# so there is nothing to truncate before extracting embeddings.
torch_model = AutoModel.from_pretrained(
    model_src,
    from_tf=False,
)

# Make the forward pass return the hidden states alongside the regular outputs
torch_model.config.output_hidden_states = True

Some weights of EsmModel were not initialized from the model checkpoint at /storage/homefs/ts18c034/MasterKinase/ESM-2/esm2_t12_35M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Place the model on the configured device and switch it to evaluation mode
# (nn.Module.eval() returns the module itself, so the calls can be chained)
torch_model = torch_model.to(device).eval()

# Per-batch accumulators filled by the inference loop
all_embeddings, all_labels = [], []

In [36]:
# Wrap the tokenized sequences and labels, then batch them in their original order
dataset = KinaseDataset(tokenized, labels, device)
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)

In [37]:
# Start from empty accumulators so that re-running this cell does not append
# the same batches a second time (or fail if the names were rebound to tensors).
all_embeddings = []
all_labels = []

with torch.no_grad():  # inference only, no autograd graph needed
    for tokenized_batch, labels_batch in dataloader:
        # Pass the batch of tokenized sequences through the model
        torch_output = torch_model(**tokenized_batch)

        # Extract the final layer. `hidden_states` starts with the embedding
        # output followed by one entry per transformer layer, so for a 12-layer
        # checkpoint index 11 is the second-to-last layer; -1 is always the last.
        final_layer_embeddings = torch_output.hidden_states[-1]

        all_labels.append(labels_batch)
        all_embeddings.append(final_layer_embeddings)

In [38]:
# Sanity check: size of all batches concatenated along the first axis
torch.cat(all_embeddings, dim=0).size()

torch.Size([301, 1023, 480])

In [39]:
# Collapse the per-batch lists into single tensors along the first axis.
# The isinstance guards make the cell safe to re-run: torch.cat expects a
# sequence of tensors, and after the first run these names hold one tensor each.
if isinstance(all_embeddings, list):
    all_embeddings = torch.cat(all_embeddings, dim=0)
if isinstance(all_labels, list):
    all_labels = torch.cat(all_labels, dim=0)

In [40]:
# Size of the embedding tensor for the first sample
all_embeddings[0].size()

torch.Size([1023, 480])

In [None]:
# Persist the embedding tensor to disk
torch.save(
    obj=all_embeddings,
    f="/storage/homefs/ts18c034/MasterKinase/ESM-2/kinasetokenized/301kinases_35M_centered",
)

In [None]:
# Persist the integer-encoded labels under their own file name. The path used
# previously was identical to the one the embeddings are written to, so saving
# the labels replaced the embeddings file on disk.
torch.save(all_labels,"/storage/homefs/ts18c034/MasterKinase/ESM-2/kinasetokenized/301kinases_35M_centered_labels")