In [1]:
import os

import numpy as np
import pandas as pd
from tape.datasets import FluorescenceDataset

In [2]:
# Initialize datasets
# Directory handed to FluorescenceDataset as `data_path`. The default is the
# original author's local folder; set the TAPE_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_path = os.environ.get(
    'TAPE_DATA_DIR',
    'D:\\University\\Semester_8\\CO549 Computational Bioengineering\\Project\\data_dir',
)

# One dataset object per split; the split names are passed through verbatim.
train_data = FluorescenceDataset(data_path=data_path, split='train')
valid_data = FluorescenceDataset(data_path=data_path, split='valid')
test_data = FluorescenceDataset(data_path=data_path, split='test')


In [3]:
from trigrams import get_bag_of_trigrams
from prot2vec import get_protvec_embedding

In [4]:
def _featurize_split(dataset):
    """Build both feature matrices and the label vector for one dataset split.

    Each record is read from ``dataset.data[idx]``; its ``'primary'`` entry is
    passed to ``get_bag_of_trigrams`` and ``get_protvec_embedding``, and the
    first element of its ``'log_fluorescence'`` entry is used as the label.

    Args:
        dataset: Object supporting ``len()`` and exposing an indexable
            ``data`` attribute (here a ``FluorescenceDataset``).

    Returns:
        Tuple ``(X_trigrams, X_prot2vec, y)`` of numpy arrays, one row/entry
        per record, in dataset order.
    """
    trigram_rows, protvec_rows, labels = [], [], []
    for idx in range(len(dataset)):
        # NOTE(review): records are read through `.data` rather than
        # `dataset[idx]` -- presumably to get the raw record; confirm.
        sample = dataset.data[idx]
        seq = sample['primary']
        label = float(sample['log_fluorescence'][0])

        trigram_rows.append(get_bag_of_trigrams(seq))
        protvec_rows.append(get_protvec_embedding(seq))
        labels.append(label)

    # Convert to numpy arrays
    return np.array(trigram_rows), np.array(protvec_rows), np.array(labels)


# Training data
X_trigrams_train, X_prot2vec_train, y_train = _featurize_split(train_data)

# Validation data
X_trigrams_valid, X_prot2vec_valid, y_valid = _featurize_split(valid_data)

# Test data
X_trigrams_test, X_prot2vec_test, y_test = _featurize_split(test_data)


In [5]:
from sklearn.neural_network import MLPRegressor
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error


In [6]:
# Train MLP on Bag-of-trigrams

mlp_trigrams_fluoro = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', max_iter=300, random_state=42)
mlp_trigrams_fluoro.fit(X_trigrams_train, y_train)


def _report_trigram_mlp(split_name, features, targets):
    """Print Spearman rho, MSE and R-squared of the trigram MLP on one split.

    Args:
        split_name: Text inserted into each printed line (e.g. "Validation").
        features: Bag-of-trigrams feature matrix for the split.
        targets: True labels for the split.

    Returns:
        Tuple ``(predictions, rho)``: the model's predictions on ``features``
        and the Spearman correlation between ``targets`` and those predictions.
    """
    predictions = mlp_trigrams_fluoro.predict(features)
    # Take the first element of the result rather than unpacking the second
    # into `_`: assigning `_` shadows IPython's last-output variable.
    rho = spearmanr(targets, predictions)[0]
    prefix = "Bag-of-trigrams - " + split_name
    print(prefix + " Spearman Rho:", rho)
    print(prefix + " Mean Squared Error:", mean_squared_error(targets, predictions))
    print(prefix + " R-squared:", mlp_trigrams_fluoro.score(features, targets))
    return predictions, rho


# Validation evaluation
y_pred_valid_tri, spearman_valid_tri = _report_trigram_mlp("Validation", X_trigrams_valid, y_valid)

# Test evaluation
y_pred_test_tri, spearman_test_tri = _report_trigram_mlp("Test", X_trigrams_test, y_test)


Bag-of-trigrams - Validation Spearman Rho: 0.8013430546494278
Bag-of-trigrams - Validation Mean Squared Error: 0.0833773926399537
Bag-of-trigrams - Validation R-squared: 0.8806220982356675
Bag-of-trigrams - Test Spearman Rho: 0.6725112674631322
Bag-of-trigrams - Test Mean Squared Error: 0.23994056407782402
Bag-of-trigrams - Test R-squared: 0.7471806313334581


In [7]:
# Train MLP on Prot2Vec

mlp_prot2vec_fluoro = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', max_iter=300, random_state=42)
mlp_prot2vec_fluoro.fit(X_prot2vec_train, y_train)


def _report_prot2vec_mlp(split_name, features, targets):
    """Print Spearman rho, R^2 and MSE of the Prot2Vec MLP on one split.

    Args:
        split_name: Text inserted into each printed line (e.g. "Validation").
        features: Prot2Vec feature matrix for the split.
        targets: True labels for the split.

    Returns:
        Tuple ``(predictions, rho)``: the model's predictions on ``features``
        and the Spearman correlation between ``targets`` and those predictions.
    """
    predictions = mlp_prot2vec_fluoro.predict(features)
    # Take the first element of the result rather than unpacking the second
    # into `_`: assigning `_` shadows IPython's last-output variable.
    rho = spearmanr(targets, predictions)[0]
    prefix = "Prot2Vec - " + split_name
    print(prefix + " Spearman Rho:", rho)
    print(prefix + " R^2:", mlp_prot2vec_fluoro.score(features, targets))
    print(prefix + " MSE:", mean_squared_error(targets, predictions))
    return predictions, rho


# Validation evaluation (Spearman Rho, R^2, MSE)
y_pred_valid_pv, spearman_valid_pv = _report_prot2vec_mlp("Validation", X_prot2vec_valid, y_valid)

# Test evaluation
y_pred_test_pv, spearman_test_pv = _report_prot2vec_mlp("Test", X_prot2vec_test, y_test)


Prot2Vec - Validation Spearman Rho: 0.36894732002516906
Prot2Vec - Validation R^2: 0.2967956432758345
Prot2Vec - Validation MSE: 0.4911406959762344
Prot2Vec - Test Spearman Rho: 0.3926707200861207
Prot2Vec - Test R^2: -0.41318588449757065
Prot2Vec - Test MSE: 1.3411971561419367


In [8]:
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

# Train Random Forest on Prot2Vec
rf_prot2vec_fluoro = RandomForestRegressor(n_estimators=100, random_state=42)
rf_prot2vec_fluoro.fit(X_prot2vec_train, y_train)


def _report_prot2vec_rf(split_name, features, targets):
    """Print Spearman rho, R^2 and MSE of the Prot2Vec random forest on one split.

    Args:
        split_name: Text inserted into each printed line (e.g. "Validation").
        features: Prot2Vec feature matrix for the split.
        targets: True labels for the split.

    Returns:
        Tuple ``(predictions, rho)``: the model's predictions on ``features``
        and the Spearman correlation between ``targets`` and those predictions.
    """
    predictions = rf_prot2vec_fluoro.predict(features)
    # Take the first element of the result rather than unpacking the second
    # into `_`: assigning `_` shadows IPython's last-output variable.
    rho = spearmanr(targets, predictions)[0]
    prefix = "Prot2Vec (Random Forest) - " + split_name
    print(prefix + " Spearman Rho:", rho)
    print(prefix + " R^2:", rf_prot2vec_fluoro.score(features, targets))
    print(prefix + " MSE:", mean_squared_error(targets, predictions))
    return predictions, rho


# Validation evaluation (Spearman Rho, R^2, MSE)
y_pred_valid_rf_pv, spearman_valid_rf_pv = _report_prot2vec_rf("Validation", X_prot2vec_valid, y_valid)

# Test evaluation
y_pred_test_rf_pv, spearman_test_rf_pv = _report_prot2vec_rf("Test", X_prot2vec_test, y_test)


Prot2Vec (Random Forest) - Validation Spearman Rho: 0.49002097469947853
Prot2Vec (Random Forest) - Validation R^2: 0.34390794687018034
Prot2Vec (Random Forest) - Validation MSE: 0.4582359374162033
Prot2Vec (Random Forest) - Test Spearman Rho: 0.45469886914066443
Prot2Vec (Random Forest) - Test R^2: -0.08694312198494658
Prot2Vec (Random Forest) - Test MSE: 1.0315734399035137
