In [1]:
from tape.datasets import RemoteHomologyDataset
import numpy as np
import pandas as pd

In [2]:
data_path = 'D:\\University\\Semester_8\\CO549 Computational Bioengineering\\Project\\data_dir'

train_data = RemoteHomologyDataset(data_path=data_path, split='train')
valid_data = RemoteHomologyDataset(data_path=data_path, split='valid')
test_data_fold = RemoteHomologyDataset(data_path=data_path, split='test_fold_holdout')
test_data_family = RemoteHomologyDataset(data_path=data_path, split='test_family_holdout')
test_data_superfamily = RemoteHomologyDataset(data_path=data_path, split='test_superfamily_holdout')


In [3]:
from prot2vec import get_protvec_embedding
from trigrams import get_bag_of_trigrams_reduced
from alphared import reduce_alphabet

In [4]:
X_trigrams = []
X_prot2vec = []
y = []

# Reduce the alphabet of the sequences in the dataset
for i in range(len(train_data)):
    item = train_data.data[i]
    seq = item['primary']
    label = item['fold_label']
    reduced_seq = reduce_alphabet(seq,7)  # Reduce the alphabet of the sequence

    X_trigrams.append(get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=7))
    X_prot2vec.append(get_protvec_embedding(reduced_seq))
    y.append(label)

X_trigrams = np.array(X_trigrams)
X_prot2vec = np.array(X_prot2vec)
y = np.array(y)


In [5]:

X_trigrams_valid = []
X_prot2vec_valid = []
y_valid = []

for i in range(len(valid_data)):
    item = valid_data.data[i]
    seq = item['primary']
    label = item['fold_label']
    
    reduced_seq = reduce_alphabet(seq, 7)  # Reduce the alphabet of the sequence
    X_trigrams_valid.append(get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=7))
    X_prot2vec_valid.append(get_protvec_embedding(reduced_seq))
    y_valid.append(label)

X_trigrams_valid = np.array(X_trigrams_valid)
X_prot2vec_valid = np.array(X_prot2vec_valid)
y_valid = np.array(y_valid)


In [6]:
X_trigrams_test = []
X_prot2vec_test = []
y_test = []

for i in range(len(test_data_fold)):
    sample = test_data_fold.data[i]
    seq = sample['primary']
    label = sample['fold_label']
    reduced_seq = reduce_alphabet(seq, 7)  # Reduce the alphabet of the sequence
    
    X_trigrams_test.append(get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=7))
    X_prot2vec_test.append(get_protvec_embedding(reduced_seq))
    y_test.append(label)

X_trigrams_test = np.array(X_trigrams_test)
X_prot2vec_test = np.array(X_prot2vec_test)
y_test = np.array(y_test)


In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score


In [8]:
clf_tri = SVC(kernel='rbf', C=1.0)
clf_tri.fit(X_trigrams, y)

# Evaluate on validation set
y_pred_valid = clf_tri.predict(X_trigrams_valid)
acc_valid = accuracy_score(y_valid, y_pred_valid)
f1_valid = f1_score(y_valid, y_pred_valid, average='macro')
print("Bag-of-trigrams - Validation Accuracy:", acc_valid*100, "%")
print("Bag-of-trigrams - Validation Macro F1:", f1_valid*100, "%")

# Evaluate on test set
y_pred_test = clf_tri.predict(X_trigrams_test)
acc_test = accuracy_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test, average='macro')
print("Bag-of-trigrams - Test Accuracy:", acc_test*100, "%")
print("Bag-of-trigrams - Test Macro F1:", f1_test*100, "%")


Bag-of-trigrams - Validation Accuracy: 8.423913043478262 %
Bag-of-trigrams - Validation Macro F1: 0.9814702243681707 %
Bag-of-trigrams - Test Accuracy: 12.256267409470752 %
Bag-of-trigrams - Test Macro F1: 1.9685077877218098 %


In [9]:
clf_prot = SVC(kernel='rbf', C=1.0)
clf_prot.fit(X_prot2vec, y)

# Evaluate on validation set
y_pred_valid = clf_prot.predict(X_prot2vec_valid)
acc_valid = accuracy_score(y_valid, y_pred_valid)
f1_valid = f1_score(y_valid, y_pred_valid, average='macro')
print("Prot2Vec - Validation Accuracy:", acc_valid)
print("Prot2Vec - Validation Macro F1:", f1_valid)

# Evaluate on test set
y_pred_test = clf_prot.predict(X_prot2vec_test)
acc_test = accuracy_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test, average='macro')
print("Prot2Vec - Test Accuracy:", acc_test)
print("Prot2Vec - Test Macro F1:", f1_test)


Prot2Vec - Validation Accuracy: 0.020380434782608696
Prot2Vec - Validation Macro F1: 0.00012522488301908844
Prot2Vec - Test Accuracy: 0.027855153203342618
Prot2Vec - Test Macro F1: 0.00039853339709867687


In [10]:
# Extract features
X_trigrams_test_family = []
X_prot2vec_test_family = []
y_test_family = []

for i in range(len(test_data_family)):
    sample = test_data_family.data[i]
    seq = sample['primary']
    label = sample['fold_label']

    reduced_seq = reduce_alphabet(seq, 7)  # Reduce the alphabet of the sequence
    X_trigrams_test_family.append(get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=7))
    X_prot2vec_test_family.append(get_protvec_embedding(seq))
    y_test_family.append(label)

X_trigrams_test_family = np.array(X_trigrams_test_family)
X_prot2vec_test_family = np.array(X_prot2vec_test_family)
y_test_family = np.array(y_test_family)



In [11]:
# Family level classification

clf_tri_family = SVC(kernel='rbf', C=1.0)
clf_tri_family.fit(X_trigrams, y)

y_pred_test_family = clf_tri_family.predict(X_trigrams_test_family)

acc_test_family = accuracy_score(y_test_family, y_pred_test_family)
f1_test_family = f1_score(y_test_family, y_pred_test_family, average='macro')

print("Bag-of-trigrams - Family Holdout Accuracy:", acc_test_family * 100, "%")
print("Bag-of-trigrams - Family Holdout Macro F1:", f1_test_family * 100, "%")

clf_prot_family = SVC(kernel='rbf', C=1.0)

clf_prot_family.fit(X_prot2vec, y)
y_pred_test_family = clf_prot_family.predict(X_prot2vec_test_family)
acc_test_family = accuracy_score(y_test_family, y_pred_test_family)
f1_test_family = f1_score(y_test_family, y_pred_test_family, average='macro')
print("Prot2Vec - Family Holdout Accuracy:", acc_test_family * 100, "%")
print("Prot2Vec - Family Holdout Macro F1:", f1_test_family * 100, "%")

Bag-of-trigrams - Family Holdout Accuracy: 49.056603773584904 %
Bag-of-trigrams - Family Holdout Macro F1: 25.155237917195233 %
Prot2Vec - Family Holdout Accuracy: 14.229559748427672 %
Prot2Vec - Family Holdout Macro F1: 0.131819952879829 %


In [12]:
# Extract features
X_trigrams_test_superfamily = []
X_prot2vec_test_superfamily = []
y_test_superfamily = []

for i in range(len(test_data_superfamily)):
    sample = test_data_superfamily.data[i]
    seq = sample['primary']
    label = sample['fold_label']

    reduced_seq = reduce_alphabet(seq, 7)  # Reduce the alphabet of the sequence
    X_trigrams_test_superfamily.append(get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=7))
    X_prot2vec_test_superfamily.append(get_protvec_embedding(seq))
    y_test_superfamily.append(label)

X_trigrams_test_superfamily = np.array(X_trigrams_test_superfamily)
X_prot2vec_test_superfamily = np.array(X_prot2vec_test_superfamily)
y_test_superfamily = np.array(y_test_superfamily)



In [13]:

# 3. Train same model on normal train split
clf_tri_superfamily = SVC(kernel='rbf', C=1.0)
clf_tri_superfamily.fit(X_trigrams, y)

# 4. Evaluate on superfamily holdout
y_pred_test_superfamily = clf_tri_superfamily.predict(X_trigrams_test_superfamily)

acc_test_superfamily = accuracy_score(y_test_superfamily, y_pred_test_superfamily)
f1_test_superfamily = f1_score(y_test_superfamily, y_pred_test_superfamily, average='macro')

print("Bag-of-trigrams - Superfamily Holdout Accuracy:", acc_test_superfamily * 100, "%")
print("Bag-of-trigrams - Superfamily Holdout Macro F1:", f1_test_superfamily * 100, "%")

clf_prot_superfamily = SVC(kernel='rbf', C=1.0)
clf_prot_superfamily.fit(X_prot2vec, y)
y_pred_test_superfamily = clf_prot_superfamily.predict(X_prot2vec_test_superfamily)
acc_test_superfamily = accuracy_score(y_test_superfamily, y_pred_test_superfamily)
f1_test_superfamily = f1_score(y_test_superfamily, y_pred_test_superfamily, average='macro')

print("Prot2Vec - Superfamily Holdout Accuracy:", acc_test_superfamily * 100, "%")
print("Prot2Vec - Superfamily Holdout Macro F1:", f1_test_superfamily * 100, "%")

Bag-of-trigrams - Superfamily Holdout Accuracy: 8.851674641148326 %
Bag-of-trigrams - Superfamily Holdout Macro F1: 0.8062545576169745 %
Prot2Vec - Superfamily Holdout Accuracy: 1.5948963317384368 %
Prot2Vec - Superfamily Holdout Macro F1: 0.008673252556441191 %


In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# ========== 1. Scale features ==========

scaler_trigrams = StandardScaler()
X_trigrams_scaled = scaler_trigrams.fit_transform(X_trigrams)
X_trigrams_valid_scaled = scaler_trigrams.transform(X_trigrams_valid)
X_trigrams_test_fold_scaled = scaler_trigrams.transform(X_trigrams_test)
X_trigrams_test_family_scaled = scaler_trigrams.transform(X_trigrams_test_family)
X_trigrams_test_superfamily_scaled = scaler_trigrams.transform(X_trigrams_test_superfamily)

# ========== 2. Define MLP model ==========

mlp_trigrams = MLPClassifier(hidden_layer_sizes=(512, 256, 128),
                             activation='relu',
                             solver='adam',
                             max_iter=300,
                             random_state=42,
                             early_stopping=True)

# ========== 3. Train on fold-level (original train set) ==========

mlp_trigrams.fit(X_trigrams_scaled, y)

# ========== 4. Evaluate fold-level validation ==========

y_pred_valid_tri = mlp_trigrams.predict(X_trigrams_valid_scaled)
acc_valid_tri = accuracy_score(y_valid, y_pred_valid_tri)
f1_valid_tri = f1_score(y_valid, y_pred_valid_tri, average='macro')

print("\n[MLP - Bag-of-trigrams]")
print("Fold Validation Accuracy:", acc_valid_tri * 100, "%")
print("Fold Validation Macro F1:", f1_valid_tri * 100, "%")

# ========== 5. Test fold-level holdout ==========

y_pred_test_fold_tri = mlp_trigrams.predict(X_trigrams_test_fold_scaled)
acc_test_fold_tri = accuracy_score(y_test, y_pred_test_fold_tri)
f1_test_fold_tri = f1_score(y_test, y_pred_test_fold_tri, average='macro')

print("Fold Test Accuracy:", acc_test_fold_tri * 100, "%")
print("Fold Test Macro F1:", f1_test_fold_tri * 100, "%")

# ========== 6. Test family-level holdout ==========

y_pred_test_family_tri = mlp_trigrams.predict(X_trigrams_test_family_scaled)
acc_test_family_tri = accuracy_score(y_test_family, y_pred_test_family_tri)
f1_test_family_tri = f1_score(y_test_family, y_pred_test_family_tri, average='macro')

print("\nFamily Test Accuracy:", acc_test_family_tri * 100, "%")
print("Family Test Macro F1:", f1_test_family_tri * 100, "%")

# ========== 7. Test superfamily-level holdout ==========

y_pred_test_superfamily_tri = mlp_trigrams.predict(X_trigrams_test_superfamily_scaled)
acc_test_superfamily_tri = accuracy_score(y_test_superfamily, y_pred_test_superfamily_tri)
f1_test_superfamily_tri = f1_score(y_test_superfamily, y_pred_test_superfamily_tri, average='macro')

print("\nSuperfamily Test Accuracy:", acc_test_superfamily_tri * 100, "%")
print("Superfamily Test Macro F1:", f1_test_superfamily_tri * 100, "%")


[MLP - Bag-of-trigrams]
Fold Validation Accuracy: 8.28804347826087 %
Fold Validation Macro F1: 2.6155704901919563 %
Fold Test Accuracy: 8.913649025069638 %
Fold Test Macro F1: 1.8501340899933771 %

Family Test Accuracy: 50.70754716981132 %
Family Test Macro F1: 27.86396633653463 %

Superfamily Test Accuracy: 8.054226475279107 %
Superfamily Test Macro F1: 2.2992274782393487 %
