In [7]:
import os

import numpy as np
import pandas as pd
from tape.datasets import FluorescenceDataset

In [8]:
# Initialize datasets
# The data directory defaults to the original author's local folder. Set the
# TAPE_DATA_DIR environment variable to point at another location so the
# notebook can run on a different machine without editing this cell.
data_path = os.environ.get(
    'TAPE_DATA_DIR',
    'D:\\University\\Semester_8\\CO549 Computational Bioengineering\\Project\\data_dir',
)

# Load the three splits of the fluorescence dataset from the same directory.
train_data = FluorescenceDataset(data_path=data_path, split='train')
valid_data = FluorescenceDataset(data_path=data_path, split='valid')
test_data = FluorescenceDataset(data_path=data_path, split='test')


In [9]:
from trigrams import get_bag_of_trigrams_reduced
from prot2vec import get_protvec_embedding
from alphared import reduce_alphabet

In [10]:
# Size of the reduced amino-acid alphabet used by both the alphabet reduction
# and the trigram featurizer; kept in one place so the two cannot drift apart.
ALPHABET_SIZE = 10


def _featurize_split(dataset, alphabet_size=ALPHABET_SIZE):
    """Build the feature matrices and regression targets for one dataset split.

    Each record is read from ``dataset.data[i]``; its ``'primary'`` sequence is
    passed through ``reduce_alphabet`` and the reduced sequence is then fed to
    both ``get_bag_of_trigrams_reduced`` and ``get_protvec_embedding``. The
    target is the first element of the record's ``'log_fluorescence'`` entry,
    converted to ``float``.

    Args:
        dataset: Object supporting ``len()`` and exposing an indexable ``data``
            attribute whose items have ``'primary'`` and ``'log_fluorescence'``
            keys.
        alphabet_size: Value forwarded as ``alphabet_size`` to
            ``reduce_alphabet`` and ``get_bag_of_trigrams_reduced``.

    Returns:
        A tuple ``(X_trigrams, X_prot2vec, y)`` of numpy arrays, one entry per
        record, in dataset order.
    """
    trigram_rows, protvec_rows, labels = [], [], []
    for i in range(len(dataset)):
        sample = dataset.data[i]
        reduced_seq = reduce_alphabet(sample['primary'], alphabet_size=alphabet_size)

        trigram_rows.append(
            get_bag_of_trigrams_reduced(reduced_seq, alphabet_size=alphabet_size)
        )
        # NOTE(review): the ProtVec embedding receives the reduced-alphabet
        # sequence, not the original one. Confirm that prot2vec's vocabulary
        # is built on the same reduced alphabet; the near-zero Prot2Vec scores
        # recorded further down in this notebook may be a symptom of a mismatch.
        protvec_rows.append(get_protvec_embedding(reduced_seq))
        labels.append(float(sample['log_fluorescence'][0]))

    return np.array(trigram_rows), np.array(protvec_rows), np.array(labels)


# Featurize every split with the same pipeline.
X_trigrams_train, X_prot2vec_train, y_train = _featurize_split(train_data)
X_trigrams_valid, X_prot2vec_valid, y_valid = _featurize_split(valid_data)
X_trigrams_test, X_prot2vec_test, y_test = _featurize_split(test_data)


In [11]:
from sklearn.neural_network import MLPRegressor
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error


In [12]:
# Fit an MLP regressor on the bag-of-trigrams features.

mlp_trigrams_fluoro = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    max_iter=300,
    random_state=42,
)
mlp_trigrams_fluoro.fit(X_trigrams_train, y_train)

# Validation split: compute every metric first, then report.
y_pred_valid_tri = mlp_trigrams_fluoro.predict(X_trigrams_valid)
spearman_valid_tri, _ = spearmanr(y_valid, y_pred_valid_tri)
mse_valid_tri = mean_squared_error(y_valid, y_pred_valid_tri)
r2_valid_tri = mlp_trigrams_fluoro.score(X_trigrams_valid, y_valid)
print("Bag-of-trigrams - Validation Spearman Rho:", spearman_valid_tri)
print("Bag-of-trigrams - Validation MSE:", mse_valid_tri)
print("Bag-of-trigrams - Validation R^2:", r2_valid_tri)

# Test split: same metrics, same reporting order.
y_pred_test_tri = mlp_trigrams_fluoro.predict(X_trigrams_test)
spearman_test_tri, _ = spearmanr(y_test, y_pred_test_tri)
mse_test_tri = mean_squared_error(y_test, y_pred_test_tri)
r2_test_tri = mlp_trigrams_fluoro.score(X_trigrams_test, y_test)
print("Bag-of-trigrams - Test Spearman Rho:", spearman_test_tri)
print("Bag-of-trigrams - Test MSE:", mse_test_tri)
print("Bag-of-trigrams - Test R^2:", r2_test_tri)

Bag-of-trigrams - Validation Spearman Rho: 0.6184977441389015
Bag-of-trigrams - Validation MSE: 0.2571913533746534
Bag-of-trigrams - Validation R^2: 0.6317591238385343
Bag-of-trigrams - Test Spearman Rho: 0.5698658204936233
Bag-of-trigrams - Test MSE: 0.9146218744089136
Bag-of-trigrams - Test R^2: 0.036285816258769255


In [13]:
# Fit an MLP regressor on the Prot2Vec embeddings.

mlp_prot2vec_fluoro = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    max_iter=300,
    random_state=42,
)
mlp_prot2vec_fluoro.fit(X_prot2vec_train, y_train)

# Validation split: compute Spearman rho, R^2 and MSE, then report them.
y_pred_valid_pv = mlp_prot2vec_fluoro.predict(X_prot2vec_valid)
spearman_valid_pv, _ = spearmanr(y_valid, y_pred_valid_pv)
r2_valid_pv = mlp_prot2vec_fluoro.score(X_prot2vec_valid, y_valid)
mse_valid_pv = mean_squared_error(y_valid, y_pred_valid_pv)
print("Prot2Vec - Validation Spearman Rho:", spearman_valid_pv)
print("Prot2Vec - Validation R^2:", r2_valid_pv)
print("Prot2Vec - Validation MSE:", mse_valid_pv)

# Test split: same metrics, same reporting order.
y_pred_test_pv = mlp_prot2vec_fluoro.predict(X_prot2vec_test)
spearman_test_pv, _ = spearmanr(y_test, y_pred_test_pv)
r2_test_pv = mlp_prot2vec_fluoro.score(X_prot2vec_test, y_test)
mse_test_pv = mean_squared_error(y_test, y_pred_test_pv)
print("Prot2Vec - Test Spearman Rho:", spearman_test_pv)
print("Prot2Vec - Test R^2:", r2_test_pv)
print("Prot2Vec - Test MSE:", mse_test_pv)


Prot2Vec - Validation Spearman Rho: 0.007356088894145153
Prot2Vec - Validation R^2: -0.000515454707174845
Prot2Vec - Validation MSE: 0.6987923952135178
Prot2Vec - Test Spearman Rho: 0.0058234229358271866
Prot2Vec - Test R^2: -1.296961163474239
Prot2Vec - Test MSE: 2.179952272390122
