# In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the preprocessed dataset from the working directory
df = pd.read_csv("preprocessed_data3.csv")

# Abort early when the label column is absent
if 'Severity' not in df:
    raise ValueError("The dataset must contain a 'Severity' column.")

# Separate the two sequence columns (features) from the label column (target)
X, y = df[['AminoBefore', 'AminoAfter']], df['Severity']

# Fetch the pretrained ProtBert tokenizer first, then the encoder, both from
# the same checkpoint name
tokenizer, model = (
    loader.from_pretrained("Rostlab/prot_bert")
    for loader in (AutoTokenizer, AutoModel)
)

def get_embeddings(sequence):
    """Return the mean-pooled ProtBert embedding of a protein sequence.

    The Rostlab/prot_bert model card requires residues to be separated by
    single spaces (e.g. "M K T A Y"). A BERT tokenizer splits on whitespace
    first, so an unspaced string such as "MKTAY" is treated as one word
    rather than five residues and generally ends up as a single unknown
    token. The input is therefore re-spaced before tokenisation; input that
    is already spaced, or that is a single residue, is unaffected.

    Args:
        sequence: Amino-acid sequence as a string (spaced or unspaced), or a
            list of such strings.

    Returns:
        A numpy array holding ``last_hidden_state`` averaged over the token
        axis and then squeezed. For a single string this drops the leading
        batch axis of size one.

    Notes:
        * The average runs over every position the tokenizer emits; no
          attention mask is applied, so any special or padding tokens are
          included in the mean.
        * ``max_length=218`` counts tokens, so with one token per residue
          longer sequences are truncated.
        * NOTE(review): the model card also suggests mapping the rare
          residues U, Z, O and B to X before tokenising. That is not done
          here to avoid changing results for existing data; confirm whether
          it is wanted.
    """
    def _space_residues(text):
        # Drop whatever whitespace is present, then put exactly one space
        # between consecutive residues.
        return " ".join("".join(text.split()))

    if isinstance(sequence, str):
        prepared = _space_residues(sequence)
    else:
        prepared = [_space_residues(item) for item in sequence]

    encoded_input = tokenizer(prepared, return_tensors='pt', padding=True, truncation=True, max_length=218)
    # Inference only: skip gradient tracking
    with torch.no_grad():
        outputs = model(**encoded_input)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def generate_embeddings(sequences):
    """Embed every sequence in order and stack the results into one array.

    A progress line is printed before every 100th sequence (indices 0, 100,
    200, ...) is embedded.

    Args:
        sequences: Iterable of sequences, each passed unchanged to
            ``get_embeddings``.

    Returns:
        ``np.array`` built from the list of per-sequence embeddings, in input
        order. An empty input yields an empty array.
    """
    def _embed_with_progress():
        # Lazily walk the input so the progress message for an item is
        # printed immediately before that item is embedded.
        for position, item in enumerate(sequences):
            if not position % 100:
                print(f"Embedding sequence {position}: {item}")
            yield get_embeddings(item)

    return np.array(list(_embed_with_progress()))

# Convert both sequence columns to plain lists of strings.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan", which would then be embedded like a sequence — confirm the CSV has
# no missing entries in these columns.
X_sequences_before = X['AminoBefore'].astype(str).tolist()
X_sequences_after = X['AminoAfter'].astype(str).tolist()

# Running the encoder is by far the most expensive step, so embed every
# distinct sequence string only once and look the rows up afterwards.
# dict.fromkeys de-duplicates while preserving first-seen order.
# This assumes get_embeddings returns the same vector for the same string
# (plain inference with gradients disabled).
print("Generating embeddings for sequences...")
unique_sequences = list(dict.fromkeys(X_sequences_before + X_sequences_after))
unique_embeddings = generate_embeddings(unique_sequences)

# Map each sequence string to its row in unique_embeddings
row_of_sequence = {seq: row for row, seq in enumerate(unique_sequences)}

# Gather one embedding row per dataset row, in the original row order
original_embeddings = unique_embeddings[[row_of_sequence[seq] for seq in X_sequences_before]]
mutated_embeddings = unique_embeddings[[row_of_sequence[seq] for seq in X_sequences_after]]

# Feature used downstream: element-wise change caused by the mutation
diff_embeddings = mutated_embeddings - original_embeddings

# Initialize MLP classifier with one hidden layer of 100 units.
# hidden_layer_sizes is written as a real 1-tuple: "(100)" is just the
# integer 100 in Python, which happened to be accepted but hid the intent.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
)

# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds
# reproducible between runs
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of the classifier on each held-out fold
cv_scores = cross_val_score(mlp_classifier, diff_embeddings, y, cv=skf, scoring='accuracy')

# Print per-fold scores and their mean ± standard deviation
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Split BEFORE fitting so the reported test metrics come from rows the
# evaluated model has never seen. (Previously the classifier was fitted on
# the full data and then scored on a subset of it, so the "test" accuracy
# was really a training-set score.) Same test_size and seed as before, so
# the held-out rows are unchanged.
X_train, X_test, y_train, y_test = train_test_split(diff_embeddings, y, test_size=0.2, random_state=42)

# Evaluate with a separate estimator that shares mlp_classifier's
# hyper-parameters but is trained on the training split only
holdout_classifier = MLPClassifier(**mlp_classifier.get_params())
holdout_classifier.fit(X_train, y_train)

# Predict on the held-out test data
y_pred_mlp = holdout_classifier.predict(X_test)

# Evaluate MLP classifier
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
mlp_classification_report = classification_report(y_test, y_pred_mlp)

print(f"MLP Accuracy on Test Data: {mlp_accuracy:.4f}")
print("MLP Classification Report on Test Data:")
print(mlp_classification_report)

# Finally fit the main classifier on the entire data, so that after this
# block mlp_classifier is the full-data model, as it was before
mlp_classifier.fit(diff_embeddings, y)
