## ProtBERT Model

Reference (DOI): 10.1109/TPAMI.2021.3095381

Code Repo: https://huggingface.co/Rostlab/prot_bert

Required installations:

!pip install transformers

!pip install torch

In [None]:
from transformers import BertModel, BertTokenizer
import re
import pandas as pd
import torch

In [None]:


# Fetch the pretrained ProtBERT tokenizer and encoder.
# do_lower_case=False asks the tokenizer not to lower-case the residue letters.
model_name = "Rostlab/prot_bert"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = BertModel.from_pretrained(model_name)

# Read the input table of protein sequences
csv_file = 'ExampleData.csv'  # Replace with your CSV file path
sequences_df = pd.read_csv(csv_file)

# The protein sequences are taken from the 'Sequence' column of the CSV
sequences = sequences_df['Sequence'].tolist()

# Preprocessing: the residue codes U, Z, O and B are rewritten to 'X'
processed_sequences = [
    re.sub(r'[UZOB]', 'X', raw_sequence) for raw_sequence in sequences
]

# Build one embedding vector per sequence; gradients are not needed for inference
embeddings = []
with torch.no_grad():
    for sequence in processed_sequences:
        # Residues are separated by single spaces before tokenization
        encoded_input = tokenizer(" ".join(sequence), return_tensors='pt')
        output = model(**encoded_input)
        # Average the last hidden state over dim 1 (presumably the token axis of a
        # (batch, tokens, hidden) tensor), then drop the size-1 dimensions
        pooled = output.last_hidden_state.mean(dim=1)
        embeddings.append(pooled.squeeze().numpy())

# One row per sequence, one column per embedding dimension
embedding_df = pd.DataFrame(embeddings)

# Persist the embeddings without the row index
embedding_df.to_csv('protein_embeddings.csv', index=False)

print('Protein embeddings saved to protein_embeddings.csv.')


In [None]:
# Reload the table stored in protein_embeddings.csv from disk
encoded_df = pd.read_csv("protein_embeddings.csv")

In [None]:
# Bare expression: in a notebook this renders the reloaded DataFrame as the cell output
encoded_df

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Example comparison of two embeddings: index 0 is treated as the wild-type
# and index 2 as a mutant (both indices are illustrative choices)
wild_type_embedding, mutant_embedding = embeddings[0], embeddings[2]

# cosine_similarity takes 2-D inputs, so each vector is wrapped in a list;
# the result is a 1x1 matrix holding the single score
similarity = cosine_similarity([wild_type_embedding], [mutant_embedding])
print(f"Cosine similarity between wild-type and mutant: {similarity[0][0]}")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:


# Rank every sequence by cosine similarity to the wild-type, which is taken
# to be the first sequence in `embeddings`
wild_type_embedding = embeddings[0]

# With only the wild-type present there is nothing to compare against: the
# masked array below would be all-NaN and np.nanargmax/np.nanargmin would fail
# with an opaque "All-NaN slice encountered" error, so fail early and clearly.
if len(embeddings) < 2:
    raise ValueError(
        "At least two sequences are required to compare against the wild-type."
    )

# Row 0 of the result holds the wild-type's similarity to every sequence
similarity_scores = cosine_similarity([wild_type_embedding], embeddings)[0]

# Exclude the wild-type's self-similarity by setting index 0 to NaN on a copy,
# so the original scores stay intact for the lookups below
masked_similarity_scores = np.copy(similarity_scores)
masked_similarity_scores[0] = np.nan

# NaN-aware arg-max/arg-min skip the masked entry
most_similar_index = np.nanargmax(masked_similarity_scores)
least_similar_index = np.nanargmin(masked_similarity_scores)

# Read the actual similarity values from the unmasked scores
most_similar_value = similarity_scores[most_similar_index]
least_similar_value = similarity_scores[least_similar_index]

print(f"Most similar sequence to the wild-type: Index {most_similar_index} with similarity {most_similar_value}")
print(f"Least similar sequence to the wild-type: Index {least_similar_index} with similarity {least_similar_value}")
