# Encoding protein sequences for machine learning

In [None]:
# Mount Google Drive into the Colab runtime so the notebook's data files
# become reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/Colab\ Notebooks/

/content/drive/MyDrive/Colab Notebooks


## Step 1: Load the dataset

We begin by loading a CSV file that contains protein sequences.

All sequences in the dataset have the same length, with one amino acid per column (9 residues in the `pept_test.csv` table shown below).

In [None]:
import numpy as np
import pandas as pd

# Read the space-separated table of 3-letter residue codes (one peptide per row)
df = pd.read_csv('pept_test.csv', sep=' ')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,ILE,ILE,ILE,SER,ASP,TYR,SER,LYS,GLU
1,ILE,ILE,LEU,SER,ASP,TYR,SER,LYS,GLU
2,ILE,ILE,VAL,SER,ASP,TYR,SER,LYS,GLU
3,ILE,LEU,ILE,SER,ASP,TYR,SER,LYS,GLU
4,ILE,LEU,LEU,LYS,ASP,PHE,GLU,GLU,ARG
...,...,...,...,...,...,...,...,...,...
56,VAL,ILE,VAL,SER,ASP,TYR,SER,LYS,GLU
57,VAL,LEU,ILE,SER,ASP,TYR,SER,LYS,GLU
58,VAL,LEU,VAL,SER,ASP,TYR,SER,LYS,GLU
59,VAL,VAL,LEU,SER,ASP,TYR,SER,LYS,GLU


In [None]:
# Dictionary to convert 3-letter amino acid codes to 1-letter codes
aa_3to1 = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
    'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}

# Convert only the residue columns. Once this cell has run, `df` also holds the
# derived 'Sequence' column; feeding that column back through aa_3to1 would
# raise a KeyError when the cell is executed a second time.
residue_columns = [col for col in df.columns if col != 'Sequence']

# Build one 1-letter string per row by translating and joining its residues
sequences = [
    ''.join(aa_3to1[aa] for aa in residues)
    for residues in df[residue_columns].itertuples(index=False, name=None)
]

print(sequences)
# Add the list of sequences as a new column in the DataFrame
df['Sequence'] = sequences
df

['IIISDYSKE', 'IILSDYSKE', 'IIVSDYSKE', 'ILISDYSKE', 'ILLKDFEER', 'ILLKDFRKD', 'ILLKDFRKR', 'ILLKDHSDK', 'ILLKDHSED', 'ILLKDHTDR', 'ILLKDHTRK', 'ILLKDIDTD', 'ILLKDIEKE', 'ILLKDISER', 'ILLKDITEK', 'ILLKDLKED', 'ILLKDLKRK', 'ILLKDLKTD', 'ILLKDLSDK', 'ILLKDLSKK', 'ILLKDLTED', 'ILLKDLTEK', 'ILLKDLTKD', 'ILLKDLTKE', 'ILLKDLTKR', 'ILLKDLTRR', 'ILLKDNDDK', 'ILLKDNETD', 'ILLKDNKTR', 'ILLKDQDKR', 'ILLKDQESK', 'ILLKDQETE', 'ILLKDVERR', 'ILLKDYDEE', 'ILLKDYDSD', 'ILLKDYDTE', 'ILLKDYKEK', 'ILLKDYRSK', 'ILLKDYRTR', 'ILLKDYSEK', 'ILLKDYSKD', 'ILLKDYSKE', 'ILLKDYSRD', 'ILLKDYSRE', 'ILLKDYTRK', 'ILLSDYSKE', 'IVISDYSKE', 'LIISDYSKE', 'LILSDYSKE', 'LIVSDYSKE', 'LLISDYSKE', 'LLLSDYSKE', 'LLVSDYSKE', 'LVVSDYSKE', 'VIISDYSKE', 'VILSDYSKE', 'VIVSDYSKE', 'VLISDYSKE', 'VLVSDYSKE', 'VVLSDYSKE', 'VVVSDYSKE']


Unnamed: 0,1,2,3,4,5,6,7,8,9,Sequence
0,ILE,ILE,ILE,SER,ASP,TYR,SER,LYS,GLU,IIISDYSKE
1,ILE,ILE,LEU,SER,ASP,TYR,SER,LYS,GLU,IILSDYSKE
2,ILE,ILE,VAL,SER,ASP,TYR,SER,LYS,GLU,IIVSDYSKE
3,ILE,LEU,ILE,SER,ASP,TYR,SER,LYS,GLU,ILISDYSKE
4,ILE,LEU,LEU,LYS,ASP,PHE,GLU,GLU,ARG,ILLKDFEER
...,...,...,...,...,...,...,...,...,...,...
56,VAL,ILE,VAL,SER,ASP,TYR,SER,LYS,GLU,VIVSDYSKE
57,VAL,LEU,ILE,SER,ASP,TYR,SER,LYS,GLU,VLISDYSKE
58,VAL,LEU,VAL,SER,ASP,TYR,SER,LYS,GLU,VLVSDYSKE
59,VAL,VAL,LEU,SER,ASP,TYR,SER,LYS,GLU,VVLSDYSKE


## Step 2: One-hot Encoding

We will try two different approaches for converting protein sequences into one-hot encoded matrices:

- Manual implementation (using Python and NumPy)
- Using a library (sklearn.preprocessing.OneHotEncoder)

Each amino acid will be represented by a binary vector of length 20, where only one position is "1" and the rest are "0".

### Method 1: Manual One-hot encoding

In [None]:
# The 20 standard amino acids as 1-letter codes, in alphabetical order
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

# Lookup table: amino acid → its position in `amino_acids`
# (for example aa_to_index["A"] == 0)
aa_to_index = {letter: position for position, letter in enumerate(amino_acids)}
aa_to_index

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19}

In [None]:
# One-hot encoding function
def one_hot_encode(seq):
    """Encode a sequence of 1-letter amino-acid codes as a one-hot matrix.

    Parameters
    ----------
    seq : str
        Sequence of 1-letter amino-acid codes.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(seq), len(amino_acids))``. Row ``i``
        holds a single 1 in the column ``aa_to_index[seq[i]]``; a residue that
        is not a key of ``aa_to_index`` leaves its row all zeros.
    """
    # Start from an all-zero matrix: one row per residue, one column per amino acid
    one_hot = np.zeros((len(seq), len(amino_acids)), dtype=int)

    for i, aa in enumerate(seq):
        if aa in aa_to_index:
            # Row = position in the sequence, column = index of this amino acid.
            # (Assigning to one_hot[...] would overwrite the whole matrix.)
            one_hot[i, aa_to_index[aa]] = 1
    return one_hot

# Encode every sequence in the dataset with the manual one-hot function,
# keeping the results in the same order as `sequences`
one_hot_manual = [one_hot_encode(seq) for seq in sequences]

# Show the first sequence next to its encoding as a quick sanity check
print(f"Original sequence: {sequences[0]}")
print(f"Encoded sequence shape: {one_hot_manual[0].shape}")
print("One-hot encoding:\n", one_hot_manual[0])

Original sequence: DYDTDA
Encoded sequence shape: (6, 20)
One-hot encoding:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


### Method 2: Using `OneHotEncoder` from scikit-learn

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The categories are given explicitly, so the output columns always follow the
# order of `amino_acids`; residues outside that list are encoded as all-zero rows.
encoder = OneHotEncoder(categories=[amino_acids], handle_unknown='ignore', sparse_output=False)

# Fit once on the fixed alphabet instead of refitting for every sequence:
# the fitted state does not depend on the individual sequences.
encoder.fit(np.array(amino_acids).reshape(-1, 1))

one_hot_sklearn = []
for seq in sequences:
    # Split sequence into a column of single characters (one sample per residue)
    aa_list = np.array(list(seq)).reshape(-1, 1)
    one_hot_sklearn.append(encoder.transform(aa_list))

print(f"Encoded sequence shape: {one_hot_sklearn[0].shape}")
print(f"Original sequence: {sequences[0]}")
print("One-hot encoding:\n", one_hot_sklearn[0])

Encoded sequence shape: (6, 20)
Original sequence: DYDTDA
One-hot encoding:
 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


Most traditional machine learning models, such as Logistic Regression, Support Vector Machines (SVM), or Random Forests, expect 1D feature vectors as input — not 2D matrices.

However, after applying encodings like one-hot, VHSE8, or BLOSUM, each protein sequence is represented as a 2D matrix of shape `L × d` (sequence length × number of features per amino acid).
To feed this into a machine learning model, we need to flatten this matrix into a single vector:

In [None]:
# Collapse the first one-hot matrix row by row into a single 1D feature vector
first_flat = one_hot_sklearn[0].flatten()
print("Flattened shape:", first_flat.shape)
first_flat

Flattened shape: (120,)


array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [None]:
# Turn every one-hot matrix into one flat vector (rows concatenated in order)
one_hot_flattened = []
for matrix in one_hot_sklearn:
    one_hot_flattened.append(matrix.flatten())

In [None]:
# One row per sequence, one column per flattened feature, plus the original
# sequence as a trailing reference column
df_onehot = pd.DataFrame(one_hot_flattened).assign(Sequence=sequences)

# Write the table to disk without the row index
df_onehot.to_csv("onehot_flattened.csv", index=False)

## Step 3: VHSE8 Encoding

The VHSE8 encoding represents each amino acid as an 8-dimensional vector based on **principal components** of physicochemical properties (e.g., hydrophobicity, charge, steric effects).  
These descriptors capture biologically meaningful similarities between amino acids and are often used in classical ML models.

Since there is no convenient and standard library in Python that performs VHSE8 encoding, we implement it ourselves.

Steps:
1. Create a dictionary of VHSE8 vectors for standard amino acids.
2. Encode each sequence into an `L × 8` matrix.

In [None]:
# Define VHSE8 encoding table:
# 1-letter amino-acid code -> list of 8 descriptor values (one per VHSE component).
# NOTE(review): these numbers should be checked against the published VHSE
# descriptor table before being used for real modelling — TODO confirm source.
vhse8_table = {
    'A': [0.06, 0.32, -0.66, -0.22, 0.07, 0.09, -0.06, -0.12],
    'C': [0.01, 0.84, 0.79, -0.23, -0.12, -0.22, -0.14, -0.06],
    'D': [0.64, -0.02, 0.07, 0.67, -0.19, -0.20, -0.16, -0.25],
    'E': [0.46, -0.03, -0.04, 0.73, -0.05, -0.19, -0.15, -0.19],
    'F': [-0.55, -0.20, 1.25, -0.13, 0.08, 0.22, 0.12, -0.01],
    'G': [0.28, 0.26, -0.89, 0.07, -0.06, -0.01, -0.09, -0.17],
    'H': [0.12, 0.10, 0.23, 0.39, 0.09, 0.14, 0.07, 0.32],
    'I': [-0.59, -0.32, 0.31, -0.27, 0.13, 0.10, -0.02, -0.12],
    'K': [0.23, -0.03, -0.18, 1.11, 0.01, -0.06, 0.05, 0.07],
    'L': [-0.55, -0.26, 0.30, -0.23, 0.14, 0.12, -0.03, -0.10],
    'M': [-0.34, 0.00, 0.33, -0.06, 0.07, 0.09, 0.03, -0.08],
    'N': [0.50, 0.00, -0.05, 0.49, -0.16, -0.08, -0.10, -0.16],
    'P': [-0.01, 0.38, -0.27, -0.27, 0.07, 0.05, -0.04, -0.23],
    'Q': [0.31, -0.03, -0.01, 0.55, -0.08, -0.06, -0.07, -0.12],
    'R': [0.25, 0.00, -0.03, 1.10, -0.06, 0.01, 0.07, 0.23],
    'S': [0.21, 0.37, -0.48, 0.11, -0.07, -0.02, -0.10, -0.08],
    'T': [0.08, 0.24, -0.30, 0.01, 0.00, 0.00, -0.07, -0.06],
    'V': [-0.46, -0.18, 0.10, -0.24, 0.09, 0.06, -0.02, -0.08],
    'W': [-0.33, -0.13, 1.29, 0.08, 0.04, 0.27, 0.17, 0.27],
    'Y': [-0.25, -0.06, 0.86, 0.13, 0.05, 0.24, 0.16, 0.10]
}


In [None]:
# For every sequence, look up the VHSE8 vector of each residue and stack the
# vectors into one NumPy array (one row per residue)
vhse_encoded = [
    np.array([vhse8_table[aa] for aa in seq])
    for seq in sequences
]

# Show the first sequence next to its encoding as a quick sanity check
print(f"Example VHSE8 shape: {vhse_encoded[0].shape}")
print(f"Original sequence: {sequences[0]}")
print("VHSE8 encoding:\n", vhse_encoded[0])

Example VHSE8 shape: (6, 8)
Original sequence: DYDTDA
VHSE8 encoding:
 [[ 0.64 -0.02  0.07  0.67 -0.19 -0.2  -0.16 -0.25]
 [-0.25 -0.06  0.86  0.13  0.05  0.24  0.16  0.1 ]
 [ 0.64 -0.02  0.07  0.67 -0.19 -0.2  -0.16 -0.25]
 [ 0.08  0.24 -0.3   0.01  0.    0.   -0.07 -0.06]
 [ 0.64 -0.02  0.07  0.67 -0.19 -0.2  -0.16 -0.25]
 [ 0.06  0.32 -0.66 -0.22  0.07  0.09 -0.06 -0.12]]


In [None]:
# Turn every VHSE8 matrix into one flat vector (rows concatenated in order)
vhse_flattened = []
for matrix in vhse_encoded:
    vhse_flattened.append(matrix.flatten())

# One row per sequence, one column per flattened feature, plus the original
# sequence as a trailing reference column
df_vhse = pd.DataFrame(vhse_flattened).assign(Sequence=sequences)

# Write the table to disk without the row index
df_vhse.to_csv("vhse_flattened.csv", index=False)

## Step 4: BLOSUM Encoding

BLOSUM62 (Blocks Substitution Matrix) captures how frequently one amino acid substitutes for another in conserved blocks of protein sequences.  
Each amino acid is represented as a **20-dimensional vector** corresponding to substitution scores with the other 19 amino acids + itself.

Steps:
1. Create a BLOSUM62 matrix dictionary.
2. Map each sequence to a matrix of shape `L × 20`.

In [None]:
# List of amino acids in the order corresponding to rows and columns of the matrix
# (note: this order differs from the alphabetical `amino_acids` list used for one-hot)
aa_order = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
            'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

# BLOSUM62 matrix: each row corresponds to an amino acid in aa_order
# Each number represents the substitution score between the row amino acid and
# the column amino acid (following the same aa_order)
# Stored as a 20 x 20 list of lists; the trailing comment on each row names its amino acid.
blosum62_matrix = [
    [ 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0],  # A
    [-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3],  # R
    [-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3],  # N
    [-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3],  # D
    [ 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],  # C
    [-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2],  # Q
    [-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2],  # E
    [ 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3],  # G
    [-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3],  # H
    [-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3],  # I
    [-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1],  # L
    [-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2],  # K
    [-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1],  # M
    [-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1],  # F
    [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2],  # P
    [ 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2],  # S
    [ 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0],  # T
    [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3],  # W
    [-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1],  # Y
    [ 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4],  # V
]


In [None]:
# Pair each amino acid with its row of the matrix for quick lookup by letter
blosum62_dict = dict(zip(aa_order, blosum62_matrix))

# Print every amino acid next to its row of substitution scores
for aa in aa_order:
    print(aa, blosum62_dict[aa])

A [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0]
R [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3]
N [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3]
D [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3]
C [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1]
Q [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2]
E [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2]
G [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3]
H [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3]
I [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3]
L [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1]
K [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2]
M [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1]

In [None]:
# Worked example: look up the substitution score between 'A' and 'R'.
# The row for 'A' holds its scores against every amino acid, ordered as aa_order.
row = blosum62_dict['A']
# Position of 'R' in aa_order gives the column to read from that row
index_of_R = aa_order.index('R')
score = row[index_of_R]

print("A row:", row)
print("Index of R:", index_of_R)
print("Score for A->R:", score)

A row: [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0]
Index of R: 1
Score for A->R: -1


In [None]:
# For every sequence, look up the BLOSUM62 score row of each residue and stack
# the rows into one NumPy array (one row per residue)
blosum_encoded = [
    np.array([blosum62_dict[aa] for aa in seq])
    for seq in sequences
]

# Show the first sequence next to its encoding as a quick sanity check
print(f"Example BLOSUM62 shape: {blosum_encoded[0].shape}")
print(f"Original sequence: {sequences[0]}")
print("BLOSUM62 encoding:\n", blosum_encoded[0])

Example BLOSUM62 shape: (9, 20)
Original sequence: IIISDYSKE
BLOSUM62 encoding:
 [[-1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3]
 [-1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3]
 [-1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3]
 [ 1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2]
 [-2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3]
 [-2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1]
 [ 1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2]
 [-1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2]
 [-1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2]]


In [None]:
# Turn every BLOSUM62 matrix into one flat vector (rows concatenated in order)
blosum_flattened = []
for matrix in blosum_encoded:
    blosum_flattened.append(matrix.flatten())

# One row per sequence, one column per flattened feature, plus the original
# sequence as a trailing reference column
df_blosum = pd.DataFrame(blosum_flattened).assign(Sequence=sequences)

# Write the table to disk without the row index
df_blosum.to_csv("blosum_flattened.csv", index=False)

## Step 5: ProtVec Embedding

In [None]:
# Load the ProtVec file (each row: 3-mer + 100 numbers)
protvec_df = pd.read_csv('protVec_100d_3grams_clean.csv', delimiter='\t', header=None)

# Split the table into the 3-mer labels (first column) and the numeric
# embedding matrix (all remaining columns), converted in one vectorised step
kmers = protvec_df.iloc[:, 0]
vectors = protvec_df.iloc[:, 1:].to_numpy(dtype=float)

# Create a dictionary: {3-mer (e.g., 'AAA') : 100-dimensional vector}
protvec_dict = dict(zip(kmers, vectors))

# Get vector size (should be 100) from the table itself rather than from a
# variable left over from a loop, which would be undefined for an empty file
protvec_dim = vectors.shape[1]
print(protvec_dim)

100


In [None]:
# --- Function to convert sequence into overlapping 3-mers (trimers) ---
def get_trimers(seq):
    """Return every overlapping 3-character window of ``seq``, left to right.

    A sequence shorter than 3 characters yields an empty list.
    """
    window = 3
    return [seq[start:start + window] for start in range(len(seq) - window + 1)]

# --- Encode all sequences using ProtVec ---
# Each sequence becomes a 2D array with one row per overlapping 3-mer
protvec_encoded = []
for seq in sequences:
    trimers = get_trimers(seq)  # Overlapping 3-mers of this sequence
    print(seq, trimers)
    # Look up the embedding of every 3-mer and stack them into one array
    protvec_encoded.append(np.array([protvec_dict[tri] for tri in trimers]))

# --- Example output ---
print(f"Original sequence: {sequences[0]}")
print(f"ProtVec shape: {protvec_encoded[0].shape}")  # (length - 2, 100)
print("ProtVec encoding:\n", protvec_encoded[0])

DYDTDA ['DYD', 'YDT', 'DTD', 'TDA']
DYSGSA ['DYS', 'YSG', 'SGS', 'GSA']
DYSGDA ['DYS', 'YSG', 'SGD', 'GDA']
DYDGSA ['DYD', 'YDG', 'DGS', 'GSA']
DYSGDA ['DYS', 'YSG', 'SGD', 'GDA']
DYTADA ['DYT', 'YTA', 'TAD', 'ADA']
DYDYDA ['DYD', 'YDY', 'DYD', 'YDA']
DYTNTA ['DYT', 'YTN', 'TNT', 'NTA']
DYTGTA ['DYT', 'YTG', 'TGT', 'GTA']
DYDRSA ['DYD', 'YDR', 'DRS', 'RSA']
Original sequence: DYDTDA
ProtVec shape: (4, 100)
ProtVec encoding:
 [[-1.58384e-01  1.24239e-01  2.00550e-01 -8.87380e-02 -4.39530e-02
  -7.71500e-02  1.30011e-01  1.68733e-01 -1.02707e-01  6.51920e-02
  -2.16720e-02 -5.14540e-02  7.70190e-02 -8.48320e-02  2.95370e-02
   8.59940e-02  2.87633e-01 -6.81140e-02  1.37510e-01 -1.17242e-01
  -2.06480e-02  1.39288e-01 -1.76245e-01  1.65955e-01  2.04870e-01
   1.15679e-01  6.29910e-02 -7.73680e-02  3.29450e-02  1.33326e-01
   9.67980e-02 -7.34580e-02  2.03078e-01 -1.53608e-01 -7.79490e-02
  -4.88610e-02 -3.99820e-02  3.87539e-01  1.09483e-01  7.71050e-02
   5.65310e-02  2.75263e-01  3.8271

In [None]:
# Collapse each sequence to one fixed-size vector by summing its 3-mer
# embeddings (used here instead of flattening)
protvec_summed = []
for embedding in protvec_encoded:
    protvec_summed.append(embedding.sum(axis=0))

# One row per sequence, plus the original sequence as a trailing reference column
df_protvec = pd.DataFrame(protvec_summed).assign(Sequence=sequences)

df_protvec.to_csv("protvec_summed.csv", index=False)


## Step 6: ESM-2 Embedding

In [None]:
! pip install fair-esm

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


| Model Name               | Layers (transformers) | Parameters | Embedding Dim | Notes                        |
|--------------------------|------------|------------|----------------|------------------------------|
| `esm2_t6_8M_UR50D`       | 6          | 8M         | 320            | Smallest (very fast)         |
| `esm2_t12_35M_UR50D`     | 12         | 35M        | 480            | Small/medium                 |
| `esm2_t30_150M_UR50D`    | 30         | 150M       | 640            | Medium                       |
| `esm2_t33_650M_UR50D`    | 33         | 650M       | 1280           | Large                        |
| `esm2_t36_3B_UR50D`      | 36         | 3B         | 2560           | Very large (slow, high RAM) |
| `esm2_t48_15B_UR50D`     | 48         | 15B        | 5120           | Massive                      |


In [None]:
import torch
import esm

# Load a small pre-trained ESM-2 model (6 layers, 320-d embeddings) and put it
# into evaluation mode, since it is only used for inference here
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
model.eval()

# Helper that turns (label, sequence) pairs into the token tensor the model expects
batch_converter = alphabet.get_batch_converter()

# Pair every sequence in `sequences` with an ID of the form "seq<position>"
data = [(f"seq{i}", seq) for i, seq in enumerate(sequences)]
print(data, "\n")

# Tokenise the whole batch at once
batch_labels, batch_strs, batch_tokens = batch_converter(data)


# ---------------------------------------------------------------
# Inspect what the converter produced
print("Batch labels:", batch_labels, "\n")
print("Batch strings:", batch_strs, "\n")
token_to_index = dict(zip(alphabet.all_toks, range(len(alphabet.all_toks))))
print("Alphabet:", token_to_index, "\n")
print("Batch tokens:\n", batch_tokens)

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D-contact-regression.pt


[('seq0', 'DYDTDA'), ('seq1', 'DYSGSA'), ('seq2', 'DYSGDA'), ('seq3', 'DYDGSA'), ('seq4', 'DYSGDA'), ('seq5', 'DYTADA'), ('seq6', 'DYDYDA'), ('seq7', 'DYTNTA'), ('seq8', 'DYTGTA'), ('seq9', 'DYDRSA')] 

Batch labels: ['seq0', 'seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq6', 'seq7', 'seq8', 'seq9'] 

Batch strings: ['DYDTDA', 'DYSGSA', 'DYSGDA', 'DYDGSA', 'DYSGDA', 'DYTADA', 'DYDYDA', 'DYTNTA', 'DYTGTA', 'DYDRSA'] 

Alphabet: {'<cls>': 0, '<pad>': 1, '<eos>': 2, '<unk>': 3, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16, 'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 24, 'B': 25, 'U': 26, 'Z': 27, 'O': 28, '.': 29, '-': 30, '<null_1>': 31, '<mask>': 32} 

Batch tokens:
 tensor([[ 0, 13, 19, 13, 11, 13,  5,  2],
        [ 0, 13, 19,  8,  6,  8,  5,  2],
        [ 0, 13, 19,  8,  6, 13,  5,  2],
        [ 0, 13, 19, 13,  6,  8,  5,  2],
        [ 0, 13, 19,  8,  6, 13,  5,  2],
        [ 0, 13, 19, 11,  5, 

In [None]:
# Turn off gradient calculations – we are just making predictions
with torch.no_grad():
    # Get the model's output for each token in the sequence
    results = model(batch_tokens, repr_layers=[6], return_contacts=False)

# Extract the embeddings from layer 6 (this gives a tensor of shape [batch, seq_len, 320])
embeddings = results["representations"][6]
print(f"Embedding shape for first sequence: {embeddings[0].shape}")

# The token tensor also contains the begin/end-of-sequence markers and, for
# batches of unequal length, padding. A plain mean over dim=1 would mix those
# positions into the sequence embedding, so average over residue positions only.
residue_mask = (
    (batch_tokens != alphabet.cls_idx)
    & (batch_tokens != alphabet.eos_idx)
    & (batch_tokens != alphabet.padding_idx)
).unsqueeze(-1).to(embeddings.dtype)  # (batch, seq_len, 1); 1 for residues, 0 otherwise

# Masked mean: sum residue embeddings, divide by the number of residues
# (clamped so an empty sequence cannot cause a division by zero)
residue_counts = residue_mask.sum(dim=1).clamp(min=1)
sequence_representations = (embeddings * residue_mask).sum(dim=1) / residue_counts  # Shape: (batch_size, 320)
print(f"Sequence embedding shape (for batch - 10 seq): {sequence_representations.shape}")
sequence_representations

Embedding shape for first sequence: torch.Size([8, 320])
Sequence embedding shape (for batch - 10 seq): torch.Size([10, 320])


tensor([[ 0.0902, -0.0035,  0.3300,  ...,  0.2311,  0.0929, -0.2184],
        [ 0.0851, -0.1014,  0.1768,  ...,  0.3412,  0.1333, -0.1632],
        [ 0.1078, -0.0421,  0.2863,  ...,  0.3613,  0.0653, -0.2482],
        ...,
        [ 0.1078, -0.0874,  0.2605,  ...,  0.2599,  0.1684, -0.1297],
        [ 0.1645, -0.0259,  0.2483,  ...,  0.2858,  0.1580, -0.1891],
        [ 0.0779, -0.1248,  0.2838,  ...,  0.3294,  0.1245, -0.1402]])

In [None]:
# Move the embeddings to the CPU as a NumPy array so pandas can hold them
sequence_representations_np = sequence_representations.cpu().numpy()

# Recover the sequence strings from the (ID, sequence) pairs given to the model
sequences = [pair[1] for pair in data]

# One row per sequence: the sequence itself first, then its embedding values
df_embeddings = pd.DataFrame(sequence_representations_np)
df_embeddings.insert(0, 'Sequence', sequences)

# Save the embeddings to a CSV file
df_embeddings.to_csv('esm2_320d_sequence_embeddings.csv', index=False)

print("Saved 320-dimensional ESM-2 embeddings to 'esm2_320d_sequence_embeddings.csv'")

Saved 320-dimensional ESM-2 embeddings to 'esm2_320d_sequence_embeddings.csv'
