In [1]:
import pandas as pd

df = pd.read_csv("prot_seq.csv")

In [2]:
# Using str.contains() with ~ operator to negate
df = df[~df["result"].str.contains("Error", na=False)].reset_index()
# protein_sequences = df[df["result"].str.len() < 10000]
protein_sequences = df
# Display basic information
print(f"Original number of rows: {len(df)}")
print(f"Number of rows after filtering: {len(protein_sequences)}")
print("\nFirst few rows of filtered data:")
print(protein_sequences.head())

Original number of rows: 16567
Number of rows after filtering: 16567

First few rows of filtered data:
   index       value                                             result
0      0  A0A024R2I8  MTPNSMTENGLTAWDKPKHCPDREHDWKLVGMSEACLHRKSHSERR...
1      1  A0A075B5G3  GVSDVPRDLEVVAATPTSLLISWPPPSHGYGYYRITYGETGGNSPV...
2      2  A0A075B6N1  MSNQVLCCVVLCLLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNL...
3      3  A0A075B6T6  MKSLRVLLVILWLQLSWVWSQQKEVEQNSGPLSVPEGAIASLNCTY...
4      4  A0A087WZ82  MELVLVFLCSLLAPMVLASAAEKEKEMDPFHYDYQTLRIGGLVFAV...


In [3]:
# vals = list(df['result'])
# vals = [(len(x),i) for i,x in enumerate(vals)]
# print(sorted(vals)[-10:])
remove_indices = [12668, 12667]

In [6]:
import torch
from transformers import EsmModel, EsmTokenizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load ESM-2 model
esm_model_name = "facebook/esm2_t33_650M_UR50D"
tokenizer = EsmTokenizer.from_pretrained(esm_model_name)
model = EsmModel.from_pretrained(esm_model_name)
model.to(device)
model.eval()

# Function to extract embeddings
def get_protein_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", truncation=True, padding="longest").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    torch.cuda.empty_cache()
    return outputs.last_hidden_state[:, 0, :].squeeze(0)

protein_embeddings = {}
for index, row in protein_sequences.iterrows():
    if index < 12660 or index in remove_indices:
        continue
    if index % 100 == 0:
        print(index)
    protein_id, sequence = row["value"], row["result"]
    protein_embeddings[protein_id] = get_protein_embedding(sequence)

# Convert to tensor matrix
embedding_dim = list(protein_embeddings.values())[0].shape[0]
protein_embedding_matrix = torch.stack(list(protein_embeddings.values()))

print("Embedding Matrix Shape:", protein_embedding_matrix.shape)  # Expected: (num_proteins, embedding_dim)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
Embedding Matrix Shape: torch.Size([3905, 1280])


In [7]:
loaded_embeddings = torch.load("protein_embeddings1.pt")

  loaded_embeddings = torch.load("protein_embeddings1.pt")


In [8]:
loaded_embeddings.update(protein_embeddings)

In [9]:
torch.save(loaded_embeddings, 'protein_embeddings.pt')

In [10]:
temp = protein_embeddings
protein_embeddings = loaded_embeddings

In [11]:
# Ensure a consistent protein ID ordering
sorted_protein_ids = sorted(protein_embeddings.keys())  # Sort alphabetically or numerically

# Convert the embeddings dictionary into an ordered list
X_train = torch.stack([protein_embeddings[pid] for pid in sorted_protein_ids])

print("X_train shape:", X_train.shape)  # Should be (num_proteins, embedding_dim)


X_train shape: torch.Size([16565, 1280])


In [12]:
orig = pd.read_csv('HomoSapiens_binary_hq.txt',sep='\t')

In [13]:
import pandas as pd
import numpy as np
from scipy import sparse

proteins = sorted_protein_ids
# Create mapping of protein IDs to matrix indices
protein_to_idx = {protein: idx for idx, protein in enumerate(proteins)}

# Create empty sparse matrix in COO format
rows = []
cols = []
data = []

# Fill the sparse matrix (only for proteins in Uniprot_A)
for _, row in orig.iterrows():
    protein_a = row['Uniprot_A']
    protein_b = row['Uniprot_B']
    
    if protein_a not in protein_to_idx:
        continue
    # Only add edge if both proteins are in our protein list
    if protein_b in protein_to_idx:
        i = protein_to_idx[protein_a]
        j = protein_to_idx[protein_b]
        # Add both directions for undirected graph
        rows.extend([i, j])
        cols.extend([j, i])
        data.extend([1, 1])

# Convert to CSR format
adj_matrix = sparse.csr_matrix((data, (rows, cols)), 
                             shape=(len(proteins), len(proteins)))

# Display basic information
print(f"Number of unique proteins from Uniprot_A: {len(proteins)}")
print(f"Shape of adjacency matrix: {adj_matrix.shape}")
print(f"Number of non-zero elements: {adj_matrix.nnz}")
print(f"Sparsity: {adj_matrix.nnz/(len(proteins)**2)*100:.2f}%")

dense_subset = adj_matrix.toarray()
multi_label_targets = torch.tensor(dense_subset, dtype=torch.float32)
multi_label_targets.shape
# # Save in compressed sparse format
# sparse.save_npz('protein_adjacency_sparse.npz', adj_matrix)

# # Save the protein index mapping
# with open('protein_index_mapping.json', 'w') as f:
#     json.dump(protein_to_idx, f)

Number of unique proteins from Uniprot_A: 16565
Shape of adjacency matrix: (16565, 16565)
Number of non-zero elements: 290497
Sparsity: 0.11%


torch.Size([16565, 16565])

In [14]:
Y_train = multi_label_targets

In [17]:
X_train.shape, Y_train.shape

(torch.Size([16565, 1280]), torch.Size([16565, 16565]))

In [19]:
torch.save({"X_train": X_train_set, "Y_train": Y_train_set, "X_val": X_val_set, "Y_val": Y_val_set}, "ppi_dataset.pt")

In [6]:
import torch
# Load dataset
data = torch.load("ppi_dataset.pt")

X_train, Y_train = data["X_train"], data["Y_train"]
X_val, Y_val = data["X_val"], data["Y_val"]

Y_train = torch.clamp(Y_train.float(), min=0.0, max=1.0)
Y_val = torch.clamp(Y_val.float(), min=0.0, max=1.0)


print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)


  data = torch.load("ppi_dataset.pt")


X_train shape: torch.Size([13252, 1280])
Y_train shape: torch.Size([13252, 16565])


In [16]:
from sklearn.model_selection import train_test_split

# Split indices for training (80%) and validation (20%)
train_idx, val_idx = train_test_split(range(X_train.shape[0]), test_size=0.2, random_state=42)

# Create Training and Validation Sets
X_train_set = X_train[train_idx]
Y_train_set = Y_train[train_idx][:, :]  # Keep full interaction labels (against all proteins)

X_val_set = X_train[val_idx]
Y_val_set = Y_train[val_idx][:, :]  # Keep interactions against ALL proteins

print(f"Training Set: {X_train_set.shape}, {Y_train_set.shape}")
print(f"Validation Set: {X_val_set.shape}, {Y_val_set.shape}")


Training Set: torch.Size([13252, 1280]), torch.Size([13252, 16565])
Validation Set: torch.Size([3313, 1280]), torch.Size([3313, 16565])


In [26]:
import torch.nn as nn
import torch.optim as optim

class PPI_MultiLabel(nn.Module):
    def __init__(self, embedding_dim, num_proteins):
        super(PPI_MultiLabel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_proteins),  # Output size = total proteins (not just validation set)
            # nn.Sigmoid()  # Multi-label classification
        )

    def forward(self, protein_embedding):
        return self.fc(protein_embedding)  # Output shape: (batch_size, num_proteins)


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Initialize model
embedding_dim = 1280  # Match X_train embedding size
num_proteins = Y_train.shape[1]  # Total number of proteins (train + val)
model = PPI_MultiLabel(embedding_dim, num_proteins)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
pos_weight = (Y_train == 0).sum() / (Y_train == 1).sum() / 10  # Ratio of negatives to positives

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight, device=device))

# Move data to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
X_train_set, Y_train_set = X_train.to(device), Y_train.to(device)
X_val_set, Y_val_set = X_val.to(device), Y_val.to(device)

# Training loop
epochs = 50
batch_size = 32

for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Mini-batch training
    for i in range(0, X_train_set.shape[0], batch_size):
        batch_X = X_train_set[i:i+batch_size]
        batch_Y = Y_train_set[i:i+batch_size]

        optimizer.zero_grad()
        preds = model(batch_X)
        loss = loss_fn(preds, batch_Y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    # **Validation Accuracy Check Every 5 Epochs**
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_set)  # Predict against all proteins
        val_preds_binary = (val_preds > 0.5).float()
        correct = (val_preds_binary == Y_val_set).sum().item()
        total = Y_val_set.numel()
        accuracy = correct / total


    val_preds_flat = val_preds_binary.cpu().numpy().flatten()
    Y_val_flat = Y_val_set.cpu().numpy().flatten()

    precision = precision_score(Y_val_flat, val_preds_flat)
    recall = recall_score(Y_val_flat, val_preds_flat)
    f1 = f1_score(Y_val_flat, val_preds_flat)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


  loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight, device=device))


Epoch 1/50, Loss: 119.3602
Precision: 0.0210, Recall: 0.0502, F1 Score: 0.0296
Epoch 2/50, Loss: 110.1490
Precision: 0.0204, Recall: 0.0571, F1 Score: 0.0300
Epoch 3/50, Loss: 108.6736
Precision: 0.0204, Recall: 0.0555, F1 Score: 0.0299
Epoch 4/50, Loss: 107.6066
Precision: 0.0205, Recall: 0.0544, F1 Score: 0.0298
Epoch 5/50, Loss: 106.1633
Precision: 0.0207, Recall: 0.0553, F1 Score: 0.0301
Epoch 6/50, Loss: 106.1867
Precision: 0.0212, Recall: 0.0554, F1 Score: 0.0306
Epoch 7/50, Loss: 105.2753
Precision: 0.0211, Recall: 0.0542, F1 Score: 0.0304
Epoch 8/50, Loss: 104.5715
Precision: 0.0212, Recall: 0.0542, F1 Score: 0.0305
Epoch 9/50, Loss: 104.0647
Precision: 0.0216, Recall: 0.0537, F1 Score: 0.0308
Epoch 10/50, Loss: 103.7409
Precision: 0.0223, Recall: 0.0512, F1 Score: 0.0310
Epoch 11/50, Loss: 103.6442
Precision: 0.0226, Recall: 0.0505, F1 Score: 0.0312
Epoch 12/50, Loss: 104.0662
