In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be reached through ordinary paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Change into ProtClassify folder
import os
os.chdir('/content/drive/MyDrive/ProtClassify')
print("PWD:", os.getcwd())
# Should list all your notebooks, data/, combined_with_pfam.parquet, etc.
!ls -l

PWD: /content/drive/MyDrive/ProtClassify
total 63623
-rw------- 1 root root   213993 Apr 17 23:27 Analysis_Preprocess_Assignment8.ipynb
-rw------- 1 root root  3800496 Apr 21 21:16 combined_with_pfam.parquet
-rw------- 1 root root   190991 Apr 24 15:58 Competition_Tuning_Assignment12.ipynb
drwx------ 2 root root     4096 Apr 24 17:05 data
drwx------ 2 root root     4096 Apr 24 17:05 docs
drwx------ 2 root root     4096 Apr 24 17:05 ensemble_output
drwx------ 2 root root     4096 Apr 24 17:05 external_tools
-rw------- 1 root root   835872 Apr 21 06:38 feature_scores_combined.csv
drwx------ 2 root root     4096 Apr 24 17:05 feature_selector
-rw------- 1 root root      734 Apr 18 08:53 feature_selector_env.yaml
-rw------- 1 root root   619649 Mar  9 21:57 metadata_org.csv
-rw------- 1 root root  7191855 Apr  1 03:51 metadata_org_w_features.csv
-rw------- 1 root root  1912656 Apr 22 04:09 Model_Eval_Assignment10.ipynb
-rw------- 1 root root   716874 Apr 20 06:59 Model_Eval_Assignment9.ipyn

In [4]:
!pip install -q fair-esm

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# ─── ESM2 Embedding Pipeline ─────────────────────────────────────────────
import esm
import pandas as pd
import numpy as np
import torch
from pathlib import Path

# Step 1: Read the training metadata and collect its distinct sequences
csv_path = "metadata_org_w_features.csv"
output_file = "esm2_train.npy"

df = pd.read_csv(csv_path)
if "CleanSequence" not in df.columns:
    raise ValueError("Missing 'CleanSequence' column in CSV")

# Keep the first row for every distinct sequence, then renumber rows from 0 so
# positions in `sequences` line up with positions in `df`.
# NOTE(review): the saved .npy carries no identifiers, so any consumer has to
# apply this same de-duplication to recover the row order.
df = df.drop_duplicates(subset="CleanSequence")
df = df.reset_index(drop=True)
sequences = df["CleanSequence"].to_list()

print(f"Loaded {len(sequences)} unique sequences")

# Step 2: Pick the compute device, then load the ESM2 model and its tokenizer
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model = model.to(device)
model = model.eval()  # eval mode, as in the original `.to(device).eval()` chain
batch_converter = alphabet.get_batch_converter()

# Step 3: Embedding function
def embed_esm2(seqs, batch_size=1, *, esm_model=None, esm_alphabet=None,
               run_device=None, repr_layer=33):
    """Embed sequences as mean-pooled ESM2 token representations.

    Every sequence is tokenized, passed through the model, and reduced to a
    single vector by averaging the ``repr_layer`` representations of its
    tokens. Padding positions and the first and last non-padding token of
    each row (the start/end markers) are left out of the average.

    Args:
        seqs: Iterable of non-empty sequence strings.
        batch_size: Number of sequences per forward pass; must be >= 1.
        esm_model: Model to call. Defaults to the notebook-level ``model``.
        esm_alphabet: Object exposing ``padding_idx`` and
            ``get_batch_converter()``. Defaults to the notebook-level
            ``alphabet`` together with its prebuilt ``batch_converter``.
        run_device: Device the token tensor is moved to. Defaults to the
            notebook-level ``device``.
        repr_layer: Index of the layer whose representations are pooled.

    Returns:
        A 2-D ``np.ndarray`` with one row per input sequence, in input order.

    Raises:
        ValueError: If ``batch_size`` < 1, ``seqs`` is empty, or a sequence is
            empty/whitespace (its mean would be NaN).
        TypeError: If an entry of ``seqs`` is not a string (e.g. a pandas NaN).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    seq_list = list(seqs)
    if not seq_list:
        raise ValueError("No sequences to embed")
    for pos, seq in enumerate(seq_list):
        if not isinstance(seq, str):
            raise TypeError(
                f"Sequence at position {pos} is not a string: {seq!r}"
            )
        if not seq.strip():
            # No residue tokens would be left between the start/end markers,
            # and averaging an empty slice silently yields a NaN vector.
            raise ValueError(f"Sequence at position {pos} is empty")

    # Fall back to the notebook-level objects only when nothing was passed in.
    net = model if esm_model is None else esm_model
    vocab = alphabet if esm_alphabet is None else esm_alphabet
    target_device = device if run_device is None else run_device
    to_tokens = (
        batch_converter if esm_alphabet is None else vocab.get_batch_converter()
    )

    pooled = []
    for start in range(0, len(seq_list), batch_size):
        chunk = seq_list[start:start + batch_size]
        # Labels are the global positions of the sequences, as strings.
        batch = [(str(start + k), seq) for k, seq in enumerate(chunk)]
        _, _, toks = to_tokens(batch)
        toks = toks.to(target_device)

        with torch.no_grad():
            result = net(toks, repr_layers=[repr_layer], return_contacts=False)
        token_reps = result["representations"][repr_layer]

        # Non-padding token count per row (start/end markers included).
        lengths = (toks != vocab.padding_idx).sum(dim=1).tolist()

        # Mean pool over tokens (excluding padding, start/end)
        for row, length in enumerate(lengths):
            pooled.append(token_reps[row, 1:length - 1].mean(0).cpu().numpy())

        if toks.is_cuda:
            # Return cached GPU blocks between batches, as the original did.
            torch.cuda.empty_cache()
    return np.stack(pooled)

# Step 4: Run embeddings
print("Embedding sequences with ESM2...")
embeddings = embed_esm2(sequences, batch_size=1)

# Step 5: Verify output before anything is written to disk
# NaN/inf values would slip past the duplicate-row check below, because NaN
# never compares equal to itself, so they are tested for explicitly.
if not np.isfinite(embeddings).all():
    raise ValueError("Embeddings contain NaN or infinite values — investigate!")

# Identical leading rows suggest the model returned the same vector for every
# input. A single row is trivially "all identical", so only compare when
# there are at least two rows.
leading_rows = embeddings[:5]
if len(leading_rows) > 1 and len(set(map(tuple, leading_rows))) == 1:
    raise ValueError("Embeddings appear to be identical — investigate!")

print("Embedding shape:", embeddings.shape)
np.save(output_file, embeddings)
print(f"Saved to {output_file}")


Loaded 679 unique sequences
Using device: cuda


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


Embedding sequences with ESM2...
Embedding shape: (679, 1280)
Saved to esm2_train.npy


In [6]:
# Read the training embeddings back from disk and print their shape, as a
# round-trip check on the saved file.
esm2_train = np.load("esm2_train.npy")
print("Shape:", esm2_train.shape)

Shape: (679, 1280)


In [7]:
# Count how many distinct vectors appear among the first five rows; rows are
# turned into tuples so they can be placed in a set.
leading_rows = esm2_train[:5]
unique_rows = len({tuple(vec) for vec in leading_rows})
print("Unique rows among first 5:", unique_rows)


Unique rows among first 5: 5


In [8]:
# Eyeball a small corner of the matrix: first five rows, first ten columns.
print("First 5 sequences (first 10 dims):")
print(esm2_train[:5, :10])


First 5 sequences (first 10 dims):
[[ 0.02434669 -0.05104493 -0.0653983   0.1614329  -0.08906472 -0.0185376
   0.11717623  0.00490847 -0.02523768  0.11615728]
 [-0.03797659 -0.11156673 -0.02147814 -0.00271799  0.03247521 -0.04840764
   0.07605141 -0.07524779 -0.05486202  0.04725526]
 [ 0.10650491 -0.07205261  0.003668    0.0389449  -0.03563187 -0.10197577
   0.01765189 -0.08673552  0.0266648   0.05346079]
 [ 0.02212254 -0.03688725 -0.00731855  0.00818602 -0.03290962 -0.06052278
   0.09439764  0.03983423  0.01361133  0.08567628]
 [ 0.01983873  0.02294585  0.04421835  0.10042888 -0.06179839 -0.0911576
   0.07863367  0.0614925   0.0382542   0.07216165]]


In [9]:
from numpy.linalg import norm

# Euclidean distance between the first two training embeddings; a clearly
# non-zero value shows the two rows are not copies of one another.
row_gap = esm2_train[0] - esm2_train[1]
dist = norm(row_gap)
print("L2 distance between row 0 and 1:", dist)


L2 distance between row 0 and 1: 2.9229147


In [10]:
# ─── ESM2 embeddings for the test set ────────────────────────────────────
# This cell reuses `model`, `alphabet`, `batch_converter`, `device` and
# `embed_esm2` from the training-embedding cell instead of loading the ESM2
# checkpoint a second time and redefining the same function. It already
# relied on that cell for its imports (pd, np, torch, esm).

# Step 1: Load the test CSV
test_csv = "testing_data_w_features.csv"
output_file = "esm2_eval.npy"

df_test = pd.read_csv(test_csv)
if "CleanSequence" not in df_test.columns:
    raise ValueError("Missing 'CleanSequence' column in test set.")

# Keep the first row per distinct sequence and renumber from 0, so row i of
# the saved matrix corresponds to df_test.iloc[i].
df_test = df_test.drop_duplicates(subset="CleanSequence").reset_index(drop=True)
test_sequences = df_test["CleanSequence"].tolist()
print(f"Loaded {len(test_sequences)} unique test sequences")

# Step 2: Make sure the shared ESM2 state from the training cell is present
_required_state = ("model", "alphabet", "batch_converter", "device", "embed_esm2")
_missing_state = [name for name in _required_state if name not in globals()]
if _missing_state:
    raise RuntimeError(
        "Run the ESM2 training-embedding cell first; missing: "
        + ", ".join(_missing_state)
    )
print(f"Using device: {device}")

# Step 3: Run and save
print("Embedding test sequences with ESM2...")
esm2_eval = embed_esm2(test_sequences, batch_size=1)

print("Embedding shape:", esm2_eval.shape)
np.save(output_file, esm2_eval)
print(f"Saved to {output_file}")

Loaded 170 unique test sequences
Using device: cuda
Embedding test sequences with ESM2...
Embedding shape: (170, 1280)
Saved to esm2_eval.npy


In [12]:
# Read the saved test-set embeddings back from disk
esm2_eval = np.load("esm2_eval.npy")

# Sanity figures: overall dimensions and the number of distinct vectors among
# the first five rows (rows become tuples so a set can hold them)
shape = esm2_eval.shape
leading_rows = esm2_eval[:5]
unique_rows = len({tuple(vec) for vec in leading_rows})

# Small labelled view of the top-left corner: five rows by ten dimensions
corner_columns = [f"dim_{i}" for i in range(10)]
sample_df = pd.DataFrame(esm2_eval[:5, :10], columns=corner_columns)

print(sample_df)
print("Shape:", shape)
print("Unique rows among first 5:", unique_rows)

      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \
0 -0.055286 -0.069871 -0.017458  0.062318 -0.057167 -0.097157  0.046300   
1 -0.031671 -0.013571 -0.046220  0.021139 -0.049918 -0.059739  0.042705   
2 -0.011763 -0.025749 -0.020235  0.151089 -0.050390 -0.115374  0.093277   
3  0.020814 -0.041365 -0.012924  0.012874  0.031504 -0.058197  0.086905   
4 -0.005824 -0.108517 -0.067089  0.153602 -0.095592  0.014355  0.191420   

      dim_7     dim_8     dim_9  
0 -0.142198 -0.004714 -0.025495  
1 -0.110206 -0.051680  0.067385  
2 -0.000147  0.016088  0.155641  
3  0.103312  0.050065  0.095398  
4  0.004750  0.014283  0.197335  
Shape: (170, 1280)
Unique rows among first 5: 5
