In [1]:
# Load the PRIDICT, DeepPrime and conventional ML models and collect their per-fold predictions
import os
import sys
import numpy as np
import pandas as pd
import pickle
import torch
import skorch

from models.deepprime import predict_deep_prime
from models.pridict import predict_pridict
from models.conventional_ml_models import MLP
from collections import defaultdict

# Silence everything routed through ``warnings.warn`` by replacing that
# function with a stub that ignores its arguments.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


warnings.warn = warn

# Dataset identifier shared by the data files and the trained-model file names.
dataf = 'pd-hek293t-pe2'

data_dp = f'dp-{dataf}.csv'
data_pd = f'pd-{dataf}.csv'

# Choose the device once so every torch-backed model agrees and the notebook
# still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Predictions of the two deep models, indexed by fold further below; the
# second return value of each call is not used in this notebook.
prediction_dp, _ = predict_deep_prime(data_dp, num_features=24, dropout=0)
prediction_pd, _ = predict_pridict(data_pd, num_features=24, device=device, dropout=0)

# Per-fold predictions of the conventional models, keyed by 0-based fold index.
prediction_mlp = defaultdict(list)
prediction_rf = defaultdict(list)
prediction_xgb = defaultdict(list)
prediction_ridge = defaultdict(list)
prediction_lasso = defaultdict(list)

model_dir = 'models/trained-models/conventional-ml'

data = pd.read_csv(f'models/data/conventional-ml/ml-{dataf}.csv')
for i in range(5):
    # The 'fold' column is matched against the 0-based index, while the saved
    # model files are numbered from 1.
    fold = i + 1
    # load the test data: the first 24 columns are used as features and the
    # second-to-last column as the target
    fold_data = data[data['fold'] == i]
    X_test = fold_data.iloc[:, :24].values
    y_test = fold_data.iloc[:, -2].values
    # load the models
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # come from a trusted source.
    with open(f'{model_dir}/random_forest-{dataf}-fold-{fold}.pkl', 'rb') as f:
        rf = pickle.load(f)
    with open(f'{model_dir}/xgboost-{dataf}-fold-{fold}.pkl', 'rb') as f:
        xgb = pickle.load(f)
    with open(f'{model_dir}/ridge-{dataf}-fold-{fold}.pkl', 'rb') as f:
        ridge = pickle.load(f)
    with open(f'{model_dir}/lasso-{dataf}-fold-{fold}.pkl', 'rb') as f:
        lasso = pickle.load(f)
    # The MLP is stored as skorch parameters, so the network is rebuilt and
    # initialised before the weights are loaded. It uses the same device
    # selection as PRIDICT instead of a hard-coded 'cuda'.
    mlp_model = skorch.NeuralNetRegressor(
            module=MLP,
            device=device,
            module__hidden_layer_sizes = (64, 64,),
        )
    mlp_model.initialize()
    mlp_model.load_params(f_params=f'{model_dir}/mlp-{dataf}-fold-{fold}.pkl')

    prediction_rf[i] = rf.predict(X_test)
    prediction_xgb[i] = xgb.predict(X_test)
    prediction_ridge[i] = ridge.predict(X_test)
    prediction_lasso[i] = lasso.predict(X_test)
    # The MLP is fed a float32 tensor rather than the raw numpy array.
    X_test = torch.tensor(X_test, dtype=torch.float32)
    prediction_mlp[i] = mlp_model.predict(X_test)

In [3]:
# Join the per-fold outputs (folds 0..4, in that order) into one array per model.
def _join_folds(per_fold, flatten):
    """Concatenate the entries of ``per_fold`` for folds 0..4, optionally flattening each one first."""
    parts = []
    for k in range(5):
        chunk = per_fold[k]
        parts.append(chunk.flatten() if flatten else chunk)
    return np.concatenate(parts)


# The PRIDICT, DeepPrime and MLP outputs are flattened before joining; the
# remaining models' outputs are concatenated as they are.
prediction_pd_list = _join_folds(prediction_pd, flatten=True)
prediction_dp_list = _join_folds(prediction_dp, flatten=True)
prediction_rf_list = _join_folds(prediction_rf, flatten=False)
prediction_xgb_list = _join_folds(prediction_xgb, flatten=False)
prediction_ridge_list = _join_folds(prediction_ridge, flatten=False)
prediction_lasso_list = _join_folds(prediction_lasso, flatten=False)
prediction_mlp_list = _join_folds(prediction_mlp, flatten=True)

print('Concatenated')

# calculate the squared error from the predictions
# The prediction arrays are concatenated fold by fold (fold 0, 1, ..., 4), so
# the targets are gathered in that same order. Taking the target column in
# file order only lines up when the CSV happens to be sorted by fold.
# NOTE(review): assumes the DeepPrime/PRIDICT outputs follow the same row
# order within each fold as this CSV - confirm against their data files.
y_test = np.concatenate([data[data['fold'] == fold].iloc[:, -2].values for fold in range(5)])

# Per-sample squared errors (the `mse_` prefix is kept because later code
# refers to these names; no mean is taken here).
mse_dp = (y_test - prediction_dp_list)**2
mse_pd = (y_test - prediction_pd_list)**2
mse_rf = (y_test - prediction_rf_list)**2
mse_xgb = (y_test - prediction_xgb_list)**2
mse_ridge = (y_test - prediction_ridge_list)**2
mse_lasso = (y_test - prediction_lasso_list)**2
mse_mlp = (y_test - prediction_mlp_list)**2

# normalize the squared error so that each model's errors sum to 1, making
# the error profiles comparable across models
mse_dp = mse_dp / np.sum(mse_dp)
mse_pd = mse_pd / np.sum(mse_pd)
mse_rf = mse_rf / np.sum(mse_rf)
mse_xgb = mse_xgb / np.sum(mse_xgb)
mse_ridge = mse_ridge / np.sum(mse_ridge)
mse_lasso = mse_lasso / np.sum(mse_lasso)
mse_mlp = mse_mlp / np.sum(mse_mlp)

# plot the normalized squared error of every model on one set of axes
import matplotlib.pyplot as plt

# (values, legend label) pairs, drawn in this order
for _curve, _label in (
    (mse_dp, 'DeepPrime'),
    (mse_pd, 'Pridict'),
    (mse_rf, 'Random Forest'),
    (mse_xgb, 'XGBoost'),
    (mse_ridge, 'Ridge'),
    (mse_lasso, 'Lasso'),
    (mse_mlp, 'MLP'),
):
    plt.plot(_curve, label=_label)
plt.legend()

# calculate the pearson correlation coefficient between each pair of error
# vectors and plot the result as a heatmap
from scipy.stats import pearsonr

# Display name -> normalized squared-error vector; the insertion order fixes
# the row/column order of the matrix and the tick labels.
mse_dict = {
    'DeepPrime': mse_dp,
    'Pridict': mse_pd,
    'Random Forest': mse_rf,
    'XGBoost': mse_xgb,
    'Ridge': mse_ridge,
    'Lasso': mse_lasso,
    'MLP': mse_mlp,
}

# Size the matrix from the number of models: a hard-coded 5x5 array is too
# small for the seven models above and raises IndexError at index 5.
n_models = len(mse_dict)
correlation_matrix = np.zeros((n_models, n_models))
for i, mse in enumerate(mse_dict.values()):
    for j, mse2 in enumerate(mse_dict.values()):
        # pearsonr returns (coefficient, p-value); only the coefficient is kept
        correlation_matrix[i, j] = pearsonr(mse, mse2)[0]

plt.matshow(correlation_matrix, cmap='coolwarm')
plt.colorbar()
plt.xticks(range(n_models), list(mse_dict.keys()), rotation=45)
plt.yticks(range(n_models), list(mse_dict.keys()))
# show true values on the heatmap
for i in range(n_models):
    for j in range(n_models):
        plt.text(j, i, f'{correlation_matrix[i, j]:.2f}', ha='center', va='center', color='black')

Concatenated
