In [1]:
import torch
import esm
import pandas as pd

## 1. Load protein sequence data

In [2]:
# Read the input table; the 'target_id' and 'protein_sequence' columns are used below
df = pd.read_csv('protein_sequences.csv')
# Combine the identifier column and the sequence column into a list of (id, sequence) tuples
id_column = df['target_id']
sequence_column = df['protein_sequence']
sequences = [(target_id, protein_sequence) for target_id, protein_sequence in zip(id_column, sequence_column)]

## 2. Load ESM-2 model

In [3]:
# Fetch the pretrained ESM-2 checkpoint; the loader returns the network together with its alphabet
pretrained_pair = esm.pretrained.esm2_t33_650M_UR50D()
model, alphabet = pretrained_pair
# The alphabet provides the converter used later to turn (label, sequence) pairs into token batches
batch_converter = alphabet.get_batch_converter()
# Evaluation mode: disables dropout for deterministic results
model.eval()

ESM2(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bia

In [4]:
import numpy as np
import pandas as pd
import torch

# Embed every (label, sequence) pair in `sequences` with the loaded ESM-2 model and
# save one mean-pooled vector per sequence to a CSV file.
# Sequences longer than max_seq_len are cut into consecutive chunks; each chunk is
# embedded separately and the result is the mean over ALL residues of the full
# sequence, so a short trailing chunk does not get the same weight as a full one.

# Define batch size and max sequence length
batch_size = 1
max_seq_len = 1024
repr_layer = 33  # layer whose hidden states are read from the model output
output_path = 'protein_embeddings.csv'

# Number of sequences
num_sequences = len(sequences)

# Number of batches (ceiling division)
num_batches = (num_sequences + batch_size - 1) // batch_size

# Per-batch frames are collected here and concatenated once after the loop,
# instead of re-concatenating a growing DataFrame on every iteration
batch_frames = []

# Process each batch
for batch_idx in range(num_batches):
    print(f"Processing batch {batch_idx+1}/{num_batches}")
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, num_sequences)
    batch_sequences = sequences[start_idx:end_idx]

    sequence_representations = []
    sequence_labels = []

    for label, seq in batch_sequences:
        # Consecutive chunks of at most max_seq_len residues.
        # `or [seq]` keeps one (empty) chunk for an empty sequence, which then
        # yields a NaN vector exactly as a mean over zero residues would.
        subseqs = [seq[i:i + max_seq_len] for i in range(0, len(seq), max_seq_len)] or [seq]

        residue_sum = 0    # running sum of per-residue vectors over all chunks
        residue_count = 0  # number of residues summed so far
        for subseq in subseqs:
            _, _, batch_tokens = batch_converter([(label, subseq)])
            # Only one sequence is converted at a time, so row 0 is the only row
            tokens_len = int((batch_tokens != alphabet.padding_idx).sum(1)[0])

            # Extract per-residue representations (on CPU).
            # Contacts are not requested: only results["representations"] is read below.
            with torch.no_grad():
                results = model(batch_tokens, repr_layers=[repr_layer])

            # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
            # the slice also drops the last non-padding token.
            residue_representations = results["representations"][repr_layer][0, 1 : tokens_len - 1]

            residue_sum = residue_sum + residue_representations.sum(0)
            residue_count += residue_representations.shape[0]

        # Length-weighted average over the chunks == mean over every residue of the sequence
        sequence_representations.append((residue_sum / residue_count).numpy())
        sequence_labels.append(label)

    # Create DataFrame for this batch
    batch_df = pd.DataFrame(sequence_representations, index=sequence_labels)

    # Checkpoint: the first batch creates the file with a header, later batches append
    # their rows, so the file always holds all finished batches without being rewritten
    is_first_batch = batch_idx == 0
    batch_df.to_csv(output_path, mode='w' if is_first_batch else 'a', header=is_first_batch)

    batch_frames.append(batch_df)

# Assemble the complete table once
embeddings_df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

print("Embeddings have been saved to protein_embeddings.csv")

Processing batch 1/4252
Processing batch 2/4252
Processing batch 3/4252
Processing batch 4/4252
Processing batch 5/4252
Processing batch 6/4252
Processing batch 7/4252
Processing batch 8/4252
Processing batch 9/4252
Processing batch 10/4252
Processing batch 11/4252
Processing batch 12/4252
Processing batch 13/4252
Processing batch 14/4252
Processing batch 15/4252
Processing batch 16/4252
Processing batch 17/4252
Processing batch 18/4252
Processing batch 19/4252
Processing batch 20/4252
Processing batch 21/4252
Processing batch 22/4252
Processing batch 23/4252
Processing batch 24/4252
Processing batch 25/4252
Processing batch 26/4252
Processing batch 27/4252
Processing batch 28/4252
Processing batch 29/4252
Processing batch 30/4252
Processing batch 31/4252
Processing batch 32/4252
Processing batch 33/4252
Processing batch 34/4252
Processing batch 35/4252
Processing batch 36/4252
Processing batch 37/4252
Processing batch 38/4252
Processing batch 39/4252
Processing batch 40/4252
Processin

In [9]:
import pandas as pd

# Read both embedding tables, using the first CSV column of each as the row index
df1, df2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('protein_embeddings1.csv', 'protein_embeddings.csv')
)

# Modify df1 in place with the values from df2 (aligned on index and column labels)
df1.update(df2)

# Store the updated table under a new file name
updated_csv_path = 'updated_protein_embeddings.csv'
df1.to_csv(updated_csv_path)

print("Updated embeddings have been saved to updated_protein_embeddings.csv")

Updated embeddings have been saved to updated_protein_embeddings.csv


In [5]:
import numpy as np
import pandas as pd

# Embed every (label, sequence) pair in `sequences` with the loaded ESM-2 model.
# Sequences longer than max_seq_len are NOT run through the model; they receive an
# all-zero placeholder row so they can be identified afterwards.

# Define batch size and max sequence length
batch_size = 1
max_seq_len = 1024
repr_layer = 33       # layer whose hidden states are read from the model output
embedding_dim = 1280  # width of the placeholder vector for skipped sequences
output_path = 'protein_embeddings.csv'

# Number of sequences
num_sequences = len(sequences)

# Number of batches (ceiling division)
num_batches = (num_sequences + batch_size - 1) // batch_size

# Per-batch frames are collected here and concatenated once after the loop,
# instead of re-concatenating a growing DataFrame on every iteration
batch_frames = []

# Process each batch
for batch_idx in range(num_batches):
    print(f"Processing batch {batch_idx+1}/{num_batches}")
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, num_sequences)
    batch_sequences = sequences[start_idx:end_idx]

    sequence_representations = []
    sequence_labels = []

    for label, seq in batch_sequences:
        # Check the sequence length
        if len(seq) > max_seq_len:
            # Zero placeholder for sequences that are too long.
            # float32 presumably matches the dtype of the model-derived rows — TODO confirm
            sequence_representations.append(np.zeros(embedding_dim, dtype=np.float32))
            sequence_labels.append(label)
            continue  # skip to the next sequence

        _, _, batch_tokens = batch_converter([(label, seq)])
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        # Extract per-residue representations (on CPU).
        # Contacts are not requested: only results["representations"] is read below.
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[repr_layer])
        token_representations = results["representations"][repr_layer]

        # Generate per-sequence representations via averaging
        # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
        for i, tokens_len in enumerate(batch_lens):
            sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0).numpy())
        sequence_labels.append(label)

    # Create DataFrame for this batch
    batch_df = pd.DataFrame(sequence_representations, index=sequence_labels)

    # Checkpoint: the first batch creates the file with a header, later batches append
    # their rows, so the file always holds all finished batches without being rewritten
    is_first_batch = batch_idx == 0
    batch_df.to_csv(output_path, mode='w' if is_first_batch else 'a', header=is_first_batch)

    batch_frames.append(batch_df)

# Assemble the complete table once
embeddings_df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()


Processing batch 1/2123
Processing batch 2/2123
Processing batch 3/2123
Processing batch 4/2123
Processing batch 5/2123
Processing batch 6/2123
Processing batch 7/2123
Processing batch 8/2123
Processing batch 9/2123
Processing batch 10/2123
Processing batch 11/2123
Processing batch 12/2123
Processing batch 13/2123
Processing batch 14/2123
Processing batch 15/2123
Processing batch 16/2123
Processing batch 17/2123
Processing batch 18/2123
Processing batch 19/2123
Processing batch 20/2123
Processing batch 21/2123
Processing batch 22/2123
Processing batch 23/2123
Processing batch 24/2123
Processing batch 25/2123
Processing batch 26/2123
Processing batch 27/2123
Processing batch 28/2123
Processing batch 29/2123
Processing batch 30/2123
Processing batch 31/2123
Processing batch 32/2123
Processing batch 33/2123
Processing batch 34/2123
Processing batch 35/2123
Processing batch 36/2123
Processing batch 37/2123
Processing batch 38/2123
Processing batch 39/2123
Processing batch 40/2123
Processin

In [10]:
import pandas as pd
import numpy as np

# Load the saved embeddings; no index column is set, so the labels stay in the first regular column
df = pd.read_csv('protein_embeddings.csv')

# Flag rows in which every column after the first one equals 0
embedding_values = df.iloc[:, 1:]
mask = (embedding_values == 0).all(axis=1)

# Keep only the flagged rows
zero_embedding_proteins = df[mask]

print(zero_embedding_proteins)

     Uniprot    0    1    2    3    4    5    6    7    8  ...  1270  1271  \
13    P08581  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
21    P42345  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
30    P20930  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
38    P00533  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
39    O14746  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
2089  O15440  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2090  P52732  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2101  P49815  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2113  Q9P2J5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2122  Q9Y2K7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

      1272  1273  1274  1275  1276  1277  1278  1279  
13     0

In [11]:
# Map each label to its sequence so that sequences can be looked up by ID
sequences_dict = {label: seq for label, seq in sequences}

# IDs (the 'Uniprot' column) of the proteins whose embedding row is all zeros
zero_embedding_uniprots = list(zero_embedding_proteins['Uniprot'])

# Pair each of those IDs with its sequence
zero_embedding_sequences = []
for uniprot in zero_embedding_uniprots:
    zero_embedding_sequences.append((uniprot, sequences_dict[uniprot]))

print(zero_embedding_sequences)

[('P08581', 'MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNFTAETPIQNVILHEHHIFLGATNYIYVLNEEDLQKVAEYKTGPVLEHPDCFPCQDCSSKANLSGGVWKDNINMALVVDTYYDDQLISCGSVNRGTCQRHVFPHNHTADIQSEVHCIFSPQIEEPSQCPDCVVSALGAKVLSSVKDRFINFFVGNTINSSYFPDHPLHSISVRRLKETKDGFMFLTDQSYIDVLPEFRDSYPIKYVHAFESNNFIYFLTVQRETLDAQTFHTRIIRFCSINSGLHSYMEMPLECILTEKRKKRSTKKEVFNILQAAYVSKPGAQLARQIGASLNDDILFGVFAQSKPDSAEPMDRSAMCAFPIKYVNDFFNKIVNKNNVRCLQHFYGPNHEHCFNRTLLRNSSGCEARRDEYRTEFTTALQRVDLFMGQFSEVLLTSISTFIKGDLTIANLGTSEGRFMQVVVSRSGPSTPHVNFLLDSHPVSPEVIVEHTLNQNGYTLVITGKKITKIPLNGLGCRHFQSCSQCLSAPPFVQCGWCHDKCVRSEECLSGTWTQQICLPAIYKVFPNSAPLEGGTRLTICGWDFGFRRNNKFDLKKTRVLLGNESCTLTLSESTMNTLKCTVGPAMNKHFNMSIIISNGHGTTQYSTFSYVDPVITSISPKYGPMAGGTLLTLTGNYLNSGNSRHISIGGKTCTLKSVSNSILECYTPAQTISTEFAVKLKIDLANRETSIFSYREDPIVYEIHPTKSFISGGSTITGVGKNLNSVSVPRMVINVHEAGRNFTVACQHRSNSEIICCTTPSLQQLNLQLPLKTKAFFMLDGILSKYFDLIYVHNPVFKPFEKPVMISMGNENVLEIKGNDIDPEAVKGEVLKVGNKSCENIHLHSEAVLCTVPNDLLKLNSELNIEWKQAISSTVLGKVIVQPDQNFTGLIAGVVSISTALLLLLGFFLWLKKRKQIKDLGSELVRYDARVHTPHLDRLVSAR

In [13]:
import pandas as pd

# Build a two-column table from the (ID, sequence) tuples
column_names = ['Uniprot', 'Sequence']
zero_embedding_sequences_df = pd.DataFrame(data=zero_embedding_sequences, columns=column_names)

# Save the table to CSV without the row index
zero_embedding_sequences_df.to_csv('zero_embedding_sequences.csv', index=False)

In [22]:
import plotly.express as px
# Plot the histogram of sequence lengths for the proteins listed in
# 'zero_embedding_sequences.csv' (the sequences that received a zero embedding)
df = pd.read_csv('zero_embedding_sequences.csv')
# Drop rows without a sequence so that len() is only applied to strings
sequence = df['Sequence'].dropna()
# Log-scaled count axis; the figure is the cell's last expression so it is displayed
px.histogram(
    sequence.map(len),
    log_y=True,
    title='Sequence lengths of proteins with zero embeddings',
)

## 3. Obtain protein embeddings

In [4]:
import pandas as pd

# Embed every (label, sequence) pair in `sequences` with the loaded ESM-2 model and
# save one mean-pooled vector per sequence to a CSV file. No length limit is applied here.

# Define batch size
batch_size = 1
repr_layer = 33  # layer whose hidden states are read from the model output
output_path = 'protein_embeddings.csv'

# Number of sequences
num_sequences = len(sequences)

# Number of batches (ceiling division)
num_batches = (num_sequences + batch_size - 1) // batch_size

# Per-batch frames are collected here and concatenated once after the loop,
# instead of re-concatenating a growing DataFrame on every iteration
batch_frames = []

# Process each batch
for batch_idx in range(num_batches):
    print(f"Processing batch {batch_idx+1}/{num_batches}")
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, num_sequences)
    batch_sequences = sequences[start_idx:end_idx]

    batch_labels, batch_strs, batch_tokens = batch_converter(batch_sequences)
    # Count of non-padding tokens in each row of the batch
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    # Extract per-residue representations (on CPU).
    # Contacts are not requested: only results["representations"] is read below.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[repr_layer])
    token_representations = results["representations"][repr_layer]

    # Generate per-sequence representations via averaging
    # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
    sequence_representations = []
    sequence_labels = []
    for i, (tokens_len, label) in enumerate(zip(batch_lens, batch_labels)):
        sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0).numpy())
        sequence_labels.append(label)

    # Create DataFrame for this batch
    batch_df = pd.DataFrame(sequence_representations, index=sequence_labels)

    # Checkpoint: the first batch creates the file with a header, later batches append
    # their rows, so the file always holds all finished batches without being rewritten
    is_first_batch = batch_idx == 0
    batch_df.to_csv(output_path, mode='w' if is_first_batch else 'a', header=is_first_batch)

    batch_frames.append(batch_df)

# Assemble the complete table once
embeddings_df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

Processing batch 1/2123
Processing batch 2/2123
Processing batch 3/2123
Processing batch 4/2123
Processing batch 5/2123
Processing batch 6/2123
Processing batch 7/2123
Processing batch 8/2123
Processing batch 9/2123
Processing batch 10/2123
Processing batch 11/2123
Processing batch 12/2123
Processing batch 13/2123
Processing batch 14/2123
Processing batch 15/2123
Processing batch 16/2123
Processing batch 17/2123
Processing batch 18/2123
Processing batch 19/2123
Processing batch 20/2123
Processing batch 21/2123
Processing batch 22/2123
