In [None]:
import pandas as pd

# Load the raw protein-protein interaction table and fail fast when a column
# used by the later cells is absent: computing the embeddings is slow, so a
# KeyError after that step would waste the whole run.
df = pd.read_csv("ppi_dataset.csv")

REQUIRED_COLUMNS = ("protein1_sequence", "protein2_sequence", "label")
missing_columns = [column for column in REQUIRED_COLUMNS if column not in df.columns]
if missing_columns:
    raise ValueError(f"ppi_dataset.csv is missing required column(s): {missing_columns}")

In [None]:
import torch
from transformers import EsmModel, EsmTokenizer

# Checkpoint identifier shared by the encoder and its tokenizer, so both are
# always loaded from the same pretrained model.
model_name = "facebook/esm2_t12_35M_UR50D"
# Encoder whose last hidden state is pooled into embeddings below.
model = EsmModel.from_pretrained(model_name)
# Tokenizer matching the encoder's vocabulary.
tokenizer = EsmTokenizer.from_pretrained(model_name)

# Function to generate embeddings for a protein sequence
def get_esm_embedding(sequence):
    """Return a mean-pooled embedding computed with the module-level `model`.

    Args:
        sequence: A protein sequence string, or a list of strings that is
            tokenized as one padded batch.

    Returns:
        torch.Tensor: The encoder's last hidden state averaged over the token
        axis, with singleton dimensions squeezed out (a 1-D vector for a
        single sequence, one row per sequence for a batch).
    """
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight each position by the attention mask so that padding added for
    # batched input does not dilute the average. For a single sequence the
    # mask is all ones, which makes this the same as a plain mean over all
    # tokens (special tokens included, as before).
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embedding = ((hidden * mask).sum(dim=1) / token_counts).squeeze()
    return embedding

# Embed both interaction partners. Each embedding is stored as a plain Python
# list of floats: a CSV cell only holds text, and a list's text form
# ("[0.1, 0.2]") can be parsed back into numbers, whereas a torch.Tensor would
# be written as its "tensor([...])" repr.
df["protein1_embedding"] = df["protein1_sequence"].apply(
    lambda sequence: get_esm_embedding(sequence).tolist()
)
df["protein2_embedding"] = df["protein2_sequence"].apply(
    lambda sequence: get_esm_embedding(sequence).tolist()
)

# Save the embeddings to a new file
df.to_csv("ppi_dataset_with_embeddings.csv", index=False)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Local imports keep this cell runnable on its own from the saved CSV.
import ast

import pandas as pd

# Load the dataset with embeddings
df = pd.read_csv("ppi_dataset_with_embeddings.csv")


def _parse_embedding(text):
    """Convert one CSV embedding cell into a 1-D float32 tensor.

    Accepts a bare list literal ("[0.1, 0.2]") as well as a tensor repr
    ("tensor([0.1, 0.2])") by parsing only the outermost bracketed span.
    `ast.literal_eval` is used instead of `eval` so file contents are never
    executed as code.

    Raises:
        ValueError: If the cell contains no bracketed list.
    """
    start, end = text.find("["), text.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"Cannot parse embedding cell: {text[:60]!r}")
    return torch.tensor(ast.literal_eval(text[start:end + 1]), dtype=torch.float32)


# Convert embeddings to tensors. The two columns are parsed separately
# (adding the string columns would glue the two texts together) and each pair
# is represented by both embeddings concatenated along the feature axis, so
# X has one row per pair and twice the single-embedding width.
protein1_embeddings = torch.stack([_parse_embedding(cell) for cell in df["protein1_embedding"]])
protein2_embeddings = torch.stack([_parse_embedding(cell) for cell in df["protein2_embedding"]])
X = torch.cat([protein1_embeddings, protein2_embeddings], dim=1)
y = torch.tensor(df["label"].values, dtype=torch.float32)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the FFNN model
class FFNN(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of outputs per row.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in this order so parameter initialisation and
        # the state_dict keys (fc1.*, fc2.*) stay stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to sigmoid outputs in (0, 1)."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Initialize the model: the input width is read from the feature matrix X,
# and a single output unit yields one probability per pair.
input_dim = X.size(1)
hidden_dim = 128
output_dim = 1
model = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Train the model with one full-batch gradient step per epoch.
num_epochs = 10
model.train()  # nothing in the loop leaves training mode, so set it once
targets = y_train.unsqueeze(1)  # (n,) -> (n, 1) to match the model output
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate the model on the held-out split without tracking gradients.
model.eval()
with torch.no_grad():
    # Threshold the predicted probabilities at 0.5 to get binary labels.
    y_pred = model(X_test).gt(0.5).float()

# Calculate metrics (each scorer is called as scorer(y_true, y_pred)).
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# Persist only the learned parameters.
torch.save(model.state_dict(), "ppi_prediction_model.pth")