In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import IsolationForest

import numpy as np

# from sklearn.svm import OneClassSVM


In [None]:
# Step 1: ProtBERT Tokenizer and Embedding Generation
# ----------------------------------------
# Load the ProtBERT tokenizer and encoder. ProtBERT's vocabulary is upper-case
# amino-acid letters, hence do_lower_case=False.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert")

# Example peptide sequences
peptides = ["AAGWDF", "VVKYPQ", "GKLSHF"]  # Replace with your actual peptides
# The tokenizer expects residues separated by single spaces ("A A G W D F").
formatted_peptides = [" ".join(seq) for seq in peptides]

# Tokenize (padding to the longest peptide in the batch) and run the encoder
# without tracking gradients; the embeddings are used as fixed features.
inputs = tokenizer(formatted_peptides, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# Per-token embeddings from the final encoder layer.
sequence_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_dim)

# Average over real tokens only. A plain .mean(dim=1) would also average the
# [PAD] positions added for shorter peptides, so the same peptide would get a
# different embedding depending on what it was batched with. Weighting by the
# attention mask removes that dependence; for a batch without padding the
# result equals the plain mean.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(sequence_embeddings.dtype)
tokens_per_sequence = token_mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
sequence_embeddings_avg = (
    (sequence_embeddings * token_mask).sum(dim=1) / tokens_per_sequence
).numpy()  # Shape: (batch_size, hidden_dim)

print(f"Shape of ProtBERT embeddings: {sequence_embeddings_avg.shape}")


In [None]:

# Step 2: Auxiliary Data
# ----------------------------------------
# Example auxiliary data (e.g., structural/functional info), set missing entries to 0.
# Built directly as a float32 tensor so the name is never bound to two
# different types within the same cell.
auxiliary_data = torch.tensor(
    [[0.1, 0.2, 0.3], [0.4, 0.6, 0.8], [0.5, 0.1, 0.0]],
    dtype=torch.float32,
)  # Shape: (batch_size, aux_dim)

# Step 3: One-class Isolation Forest on ProtBERT Embeddings
# ----------------------------------------
# `sequence_embeddings_avg` holds the embeddings of the known positive peptides;
# the forest learns what "typical" positives look like so that atypical
# sequences can be flagged as outliers.
# random_state is fixed because Isolation Forest builds randomised trees;
# without it the predictions change on every run of the notebook.
one_class_rf = IsolationForest(
    contamination=0.1,  # Set contamination as per your expected outliers
    random_state=42,
)
one_class_rf.fit(sequence_embeddings_avg)

# Score the training peptides. IsolationForest.predict returns +1 for inliers
# and -1 for outliers (not 0).
rf_predictions = one_class_rf.predict(sequence_embeddings_avg)
print(f"RF predictions: {rf_predictions}")  # +1 = inlier (likely bioactive), -1 = outlier



In [None]:
# Step 4: N1-NN Neural Network for Bioactivity Prediction with Auxiliary Data
# ----------------------------------------
class N1NN_BioactivePeptideClassifier(nn.Module):
    """Three-branch feed-forward classifier producing a bioactivity probability.

    Each input (sequence embedding, auxiliary features, one-class model
    feature) is projected to ``hidden_dim`` by its own linear layer with ReLU,
    the three projections are concatenated, passed through one more hidden
    layer, and squashed to a probability with a sigmoid.

    Args:
        input_dim: Width of the sequence-embedding input.
        aux_dim: Width of the auxiliary-feature input.
        rf_dim: Width of the one-class model feature input.
        hidden_dim: Width of every hidden projection.
    """

    def __init__(self, input_dim, aux_dim, rf_dim, hidden_dim):
        super().__init__()

        # One projection per input branch.
        self.fc1_seq = nn.Linear(input_dim, hidden_dim)
        self.fc1_aux = nn.Linear(aux_dim, hidden_dim)
        self.fc1_rf = nn.Linear(rf_dim, hidden_dim)

        # Fusion layers operating on the three concatenated projections.
        self.fc2 = nn.Linear(hidden_dim * 3, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, x_seq, x_aux=None, x_rf=None):
        """Return bioactivity probabilities of shape ``(batch, 1)``.

        Args:
            x_seq: Tensor of shape ``(batch, input_dim)``.
            x_aux: Optional tensor of shape ``(batch, aux_dim)``. When omitted,
                an all-zero tensor is used.
            x_rf: Optional tensor of shape ``(batch, rf_dim)``. When omitted,
                an all-zero tensor is used.
        """
        batch_size = x_seq.shape[0]

        # The signature advertises both side inputs as optional, so a missing
        # one is replaced by zeros (same dtype/device as x_seq) instead of
        # letting nn.Linear fail on None.
        if x_aux is None:
            x_aux = x_seq.new_zeros((batch_size, self.fc1_aux.in_features))
        if x_rf is None:
            x_rf = x_seq.new_zeros((batch_size, self.fc1_rf.in_features))

        # Independent projection of each input branch.
        x_seq = torch.relu(self.fc1_seq(x_seq))
        x_aux = torch.relu(self.fc1_aux(x_aux))
        x_rf = torch.relu(self.fc1_rf(x_rf))

        # Fuse along the feature axis: (batch, 3 * hidden_dim).
        x_combined = torch.cat((x_seq, x_aux, x_rf), dim=1)

        # Fully connected layers for the final prediction.
        x_combined = torch.relu(self.fc2(x_combined))
        output = torch.sigmoid(self.output_layer(x_combined))
        return output

# Build the N1-NN classifier. Branch widths are read from the data prepared
# earlier so the network always matches the feature shapes.
n1nn_model = N1NN_BioactivePeptideClassifier(
    input_dim=sequence_embeddings_avg.shape[1],  # width of the averaged ProtBERT embedding
    aux_dim=auxiliary_data.shape[1],             # number of auxiliary features per peptide
    rf_dim=1,                                    # the one-class model contributes a single feature
    hidden_dim=64,
)



In [None]:
# Step 5: Train N1-NN Model with ProtBERT Embeddings, Auxiliary Data, and Isolation Forest Features
# ----------------------------------------
criterion = nn.BCELoss()  # Binary cross-entropy for bioactivity prediction
optimizer = torch.optim.Adam(n1nn_model.parameters(), lr=0.001)

# The inputs do not change between epochs, so build them once outside the loop.
inputs_tensor = torch.tensor(sequence_embeddings_avg, dtype=torch.float32)

# Turn the Isolation Forest output (+1 inlier / -1 outlier) into a 0/1 column
# feature of shape (batch_size, 1). This is the same encoding that
# predict_bioactivity applies at inference time.
rf_features = torch.tensor((rf_predictions == 1).astype(np.float32)).unsqueeze(1)

# Example training labels: only bioactive peptides are available, so every
# label is 1. Sized from the batch so it stays in step with the peptide list.
# Replace with your actual labels (if available).
# NOTE: with positives only, the network can minimise the loss by always
# predicting 1; real negatives are needed for a meaningful classifier.
labels = torch.ones(inputs_tensor.shape[0], dtype=torch.float32)

# Training loop
for epoch in range(10):  # Increase epochs for more training
    optimizer.zero_grad()

    # Forward pass through N1-NN with embeddings, auxiliary data, and RF features
    outputs = n1nn_model(inputs_tensor, auxiliary_data, rf_features)

    # squeeze(1) drops only the trailing singleton axis; a bare squeeze() would
    # also drop the batch axis for a batch of one and break the loss shape check.
    loss = criterion(outputs.squeeze(1), labels)

    # Backpropagation and optimization
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


In [None]:

# Step 6: Inference and Prediction using the Isolation Forest and N1-NN
# ----------------------------------------
def predict_bioactivity(peptide_sequence, threshold=0.5):
    """Predict the bioactivity of a single peptide.

    The peptide is embedded with ProtBERT, scored by the fitted Isolation
    Forest, and both are fed (with all-zero auxiliary features) to the N1-NN
    classifier.

    Args:
        peptide_sequence: Amino-acid sequence as a string of one-letter codes.
        threshold: Probability at or above which the peptide is labelled
            "Bioactive"; below it the label is "Non-bioactive".

    Returns:
        A string of the form ``"<label> (Probability: 0.1234)"``.

    Raises:
        ValueError: If ``peptide_sequence`` is empty.
    """
    if not peptide_sequence:
        raise ValueError("peptide_sequence must be a non-empty string")

    # ProtBERT expects residues separated by single spaces.
    formatted_peptide = " ".join(peptide_sequence)
    inputs = tokenizer([formatted_peptide], return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Average token embedding for the sequence (a single sequence has no padding).
    embedding_avg = outputs.last_hidden_state.mean(dim=1).numpy()

    # Isolation Forest verdict: +1 inlier, -1 outlier.
    rf_pred = one_class_rf.predict(embedding_avg)

    # Encode the verdict as a 0/1 feature for the N1-NN. Shape: (1, 1)
    rf_feature = torch.tensor((rf_pred == 1).astype(np.float32)).unsqueeze(1)

    embedding_tensor = torch.tensor(embedding_avg, dtype=torch.float32)

    # No auxiliary information is available at inference; use zeros. Shape: (1, aux_dim)
    auxiliary_tensor = torch.zeros((1, auxiliary_data.shape[1]), dtype=torch.float32)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        bioactivity_prob = n1nn_model(embedding_tensor, auxiliary_tensor, rf_feature).item()

    # Label according to the predicted probability rather than unconditionally.
    label = "Bioactive" if bioactivity_prob >= threshold else "Non-bioactive"
    return f"{label} (Probability: {bioactivity_prob:.4f})"

# Example prediction on a new peptide sequence
new_peptide = "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGQKAVKDY"
bioactivity_prediction = predict_bioactivity(new_peptide)
print(f"Prediction for new peptide: {bioactivity_prediction}")
