In [1]:
# Attach Google Drive to the Colab VM so the project files are reachable
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Change into ProtClassify folder
import os
os.chdir('/content/drive/MyDrive/ProtClassify')
print("PWD:", os.getcwd())
# Should list all your notebooks, data/, combined_with_pfam.parquet, etc.
!ls -l

PWD: /content/drive/MyDrive/ProtClassify
total 63623
-rw------- 1 root root   213993 Apr 17 23:27 Analysis_Preprocess_Assignment8.ipynb
-rw------- 1 root root  3800496 Apr 21 21:16 combined_with_pfam.parquet
-rw------- 1 root root   190991 Apr 24 15:58 Competition_Tuning_Assignment12.ipynb
drwx------ 2 root root     4096 Apr 24 17:08 data
drwx------ 2 root root     4096 Apr 24 17:08 docs
drwx------ 2 root root     4096 Apr 24 17:08 ensemble_output
drwx------ 2 root root     4096 Apr 24 17:08 external_tools
-rw------- 1 root root   835872 Apr 21 06:38 feature_scores_combined.csv
drwx------ 2 root root     4096 Apr 24 17:08 feature_selector
-rw------- 1 root root      734 Apr 18 08:53 feature_selector_env.yaml
-rw------- 1 root root   619649 Mar  9 21:57 metadata_org.csv
-rw------- 1 root root  7191855 Apr  1 03:51 metadata_org_w_features.csv
-rw------- 1 root root  1912656 Apr 22 04:09 Model_Eval_Assignment10.ipynb
-rw------- 1 root root   716874 Apr 20 06:59 Model_Eval_Assignment9.ipyn

In [3]:
import torch
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
from pathlib import Path


In [4]:
# Configuration
# Pretrained checkpoint identifier handed to the tokenizer/model loaders.
model_name = "Rostlab/prot_bert_bfd"

# Input tables (relative to the working directory); each is read with
# pandas and its "CleanSequence" column is embedded.
train_csv = "metadata_org_w_features.csv"
test_csv = "testing_data_w_features.csv"

# Number of sequences sent through the model per forward pass.
batch_size = 8

In [6]:
# Pick the compute device: GPU when CUDA is available, CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print("Using device:", device)

# Tokenizer is loaded case-sensitively (do_lower_case=False).
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)

# Load the pretrained weights, move them to the chosen device and switch
# the network to evaluation mode.
model = BertModel.from_pretrained(model_name)
model = model.to(device)
model = model.eval()

Using device: cuda


In [7]:
# Preprocessing function (maps rare residues to X, adds spaces between residues)
# Translation table sending the rare/ambiguous residue codes U, Z, O and B to
# X, as the ProtBERT model card prescribes before tokenization.
_RARE_RESIDUES_TO_X = str.maketrans("UZOB", "XXXX")


def preprocess(seqs):
    """Format raw amino-acid strings for the ProtBERT tokenizer.

    Each sequence has the residue codes U, Z, O and B replaced by X and is
    then split into single characters joined by one space, e.g.
    ``"MKU"`` becomes ``"M K X"``. Sequences without those four codes are only
    space-separated, exactly as before.

    Parameters
    ----------
    seqs : iterable of str
        Protein sequences, one character per residue, without spaces.

    Returns
    -------
    list of str
        One formatted string per input sequence, in input order. An empty
        input yields an empty list; an empty sequence yields ``""``.
    """
    # str.join iterates a string character by character, so no list() needed.
    return [" ".join(s.translate(_RARE_RESIDUES_TO_X)) for s in seqs]

In [8]:
# Embedding function
def embed_protbert(seqs, batch_size=8, max_length=None):
    """Embed protein sequences with ProtBERT using attention-masked mean pooling.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` objects and
    the ``preprocess`` helper.

    Parameters
    ----------
    seqs : iterable of str
        Raw amino-acid sequences (no spaces between residues).
    batch_size : int, default 8
        Number of sequences tokenized and passed through the model together.
    max_length : int or None, default None
        Optional cap on the number of tokens per sequence. When ``None`` no
        truncation is requested. This matches what the previous
        ``truncation=True`` call did in practice: without a maximum length
        Transformers falls back to no truncation and emits a warning.

    Returns
    -------
    np.ndarray
        Array with one row per input sequence, in input order, and one column
        per hidden unit of the model. Empty input gives a ``(0, hidden_size)``
        array.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once so generators and other non-sliceable iterables work.
    seqs = list(seqs)
    if not seqs:
        # np.vstack([]) would raise; return a well-formed empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, len(seqs), batch_size):
        batch_seqs = preprocess(seqs[i:i + batch_size])
        tokens = tokenizer(
            batch_seqs,
            return_tensors="pt",
            padding=True,  # pad to the longest sequence in this batch
            truncation=max_length is not None,  # only truncate when a cap is given
            max_length=max_length,
        )
        tokens = {k: v.to(device) for k, v in tokens.items()}

        with torch.no_grad():
            output = model(**tokens)
            last_hidden = output.last_hidden_state  # (batch, tokens, hidden)

        # Average the token vectors of each sequence, ignoring padding.
        # NOTE(review): the attention mask presumably also covers the special
        # tokens added by the tokenizer, so they contribute to the mean.
        mask = tokens["attention_mask"].unsqueeze(-1)
        pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
        all_embeddings.append(pooled.cpu().numpy())

    return np.vstack(all_embeddings)

In [9]:
# Training set: read the table, keep each distinct sequence once (first
# occurrence order), embed them and write the matrix to protbert_train.npy.
# Rows of the saved matrix follow the de-duplicated sequence list, not the
# rows of the CSV.
df_train = pd.read_csv(train_csv)
unique_train_seqs = df_train["CleanSequence"].drop_duplicates()
train_seqs = unique_train_seqs.tolist()
print(f"Embedding {len(train_seqs)} training sequences with ProtBERT...")

protbert_train = embed_protbert(train_seqs, batch_size=batch_size)

np.save("protbert_train.npy", protbert_train)
print("Saved protbert_train.npy:", protbert_train.shape)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embedding 679 training sequences with ProtBERT...
Saved protbert_train.npy: (679, 1024)


In [10]:
# Test set: read the table, keep each distinct sequence once (first
# occurrence order), embed them and write the matrix to protbert_eval.npy.
# Rows of the saved matrix follow the de-duplicated sequence list, not the
# rows of the CSV.
df_test = pd.read_csv(test_csv)
unique_test_seqs = df_test["CleanSequence"].drop_duplicates()
test_seqs = unique_test_seqs.tolist()
print(f"Embedding {len(test_seqs)} test sequences with ProtBERT...")

protbert_eval = embed_protbert(test_seqs, batch_size=batch_size)

np.save("protbert_eval.npy", protbert_eval)
print("Saved protbert_eval.npy:", protbert_eval.shape)

Embedding 170 test sequences with ProtBERT...
Saved protbert_eval.npy: (170, 1024)


In [11]:
from numpy.linalg import norm

# Read the two embedding matrices back from the .npy files written earlier.
protbert_train = np.load("protbert_train.npy")
protbert_eval = np.load("protbert_eval.npy")

# Report the dimensions of each matrix.
print("Train shape:", protbert_train.shape)
print("Eval shape: ", protbert_eval.shape)

# Count how many of the leading five rows are distinct vectors; fewer than
# five would mean some sequences received identical embeddings.
train_unique = len({tuple(row) for row in protbert_train[:5]})
eval_unique = len({tuple(row) for row in protbert_eval[:5]})
print("Unique rows (train, first 5):", train_unique)
print("Unique rows (eval, first 5): ", eval_unique)

# Show a small corner of the training matrix: 5 rows by 10 columns.
print("\nFirst 5 training sequences (first 10 dims):")
print(protbert_train[:5, :10])

# Euclidean distance between the first two rows of each matrix; a non-zero
# value shows the embeddings are not trivially identical.
train_gap = norm(protbert_train[0] - protbert_train[1])
print("\nTrain distance row 0–1:", train_gap)
eval_gap = norm(protbert_eval[0] - protbert_eval[1])
print("Eval distance row 0–1: ", eval_gap)

Train shape: (679, 1024)
Eval shape:  (170, 1024)
Unique rows (train, first 5): 5
Unique rows (eval, first 5):  5

First 5 training sequences (first 10 dims):
[[-0.00308354  0.01258058  0.00362734 -0.01812596  0.01308226  0.02219505
  -0.04585007 -0.04406652 -0.0002828  -0.01508848]
 [ 0.01189695  0.00605286  0.00143205  0.01612679  0.0310211   0.00919091
  -0.0135506  -0.03638414 -0.01489322 -0.0058881 ]
 [ 0.02996278  0.02738642 -0.01442665  0.00224002  0.00327666 -0.01447352
   0.00874932 -0.0274386  -0.01067536  0.03063629]
 [ 0.01004064  0.00632988 -0.02497391  0.00205142  0.0122145   0.00715791
  -0.01774822 -0.00409395  0.00128063  0.01881311]
 [ 0.00153544 -0.00827346 -0.00969256 -0.01578032  0.0480093   0.02024044
   0.01480491 -0.03880088  0.00665801 -0.00490303]]

Train distance row 0–1: 1.166804
Eval distance row 0–1:  1.51118
