In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#@title Imports
import torch
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import ndcg_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, SVC
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import f1_score
from scipy.stats import sem
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

# Performs one round of 5-fold cross validation
def tfcv(X, y, MODEL):
  y = np.array(y)
  acc, f1, pre, rec, auroc = [], [], [], [], []
  kf = StratifiedKFold(n_splits=5, shuffle=True)
  for train_index, test_index in kf.split(X, y):
    # Get the train/val data for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # PCA was removed based off reveiwer's comment
    # pca = PCA(n_components=50)
    # X_train = pca.fit_transform(X_train)
    # X_test = pca.transform(X_test)

    scaler=StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)

    MODEL.fit(X_train, y_train)
    y_pred = MODEL.predict(X_test)

    # Compute metrics for current fold
    fold_acc = accuracy_score(y_test, y_pred)
    fold_f1 = f1_score(y_test, y_pred)
    fold_pre = precision_score(y_test, y_pred)
    fold_rec = recall_score(y_test, y_pred)
    fold_roc = roc_auc_score(y_test, y_pred)
    acc.append(fold_acc)
    f1.append(fold_f1)
    pre.append(fold_pre)
    rec.append(fold_rec)
    auroc.append(fold_roc)

  return np.mean(np.array(acc)), np.mean(np.array(f1)), np.mean(np.array(pre)), np.mean(np.array(rec)), np.mean(np.array(auroc))

# Optimize a hyperparameter
def optimize(X, y, MODEL, PARAM, param_values):
    top_value = param_values[0]
    top_score = 0
    for i in tqdm(param_values):
        currentscore, _, _, _, _ = tfcv(X, y, MODEL(**{PARAM: i}, random_state=1))
        if currentscore > top_score:
            top_score = currentscore
            top_value = i

    final_acc, final_f1, final_pre, final_rec, final_auroc = [], [], [], [], []
    for i in range(5):
        ac, f1, pre, rec, auroc = tfcv(X, y, MODEL(**{PARAM: top_value}, random_state=1))
        final_acc.append(ac)
        final_f1.append(f1)
        final_pre.append(pre)
        final_rec.append(rec)
        final_auroc.append(auroc)
    final_acc = np.array(final_acc)
    final_f1 = np.array(final_f1)
    final_pre = np.array(final_pre)
    final_rec = np.array(final_rec)
    final_auroc = np.array(final_auroc)
    print(f'Accuracy: {np.mean(final_acc)}, +/- {sem(final_acc)}')
    print(f'F1: {np.mean(final_f1)}, +/- {sem(final_f1)}')
    print(f'Precision: {np.mean(final_pre)}, +/- {sem(final_pre)}')
    print(f'Recall: {np.mean(final_rec)}, +/- {sem(final_rec)}')
    print(f'Auroc: {np.mean(final_auroc)}, +/- {sem(final_auroc)}')
    print(f'top_value: {top_value}')
    return np.mean(final_acc), sem(final_acc)

# Load sequences from csv
df = pd.read_csv('../Data/LazBF_sequences.csv')
LazBF_sequences = df['sequences'].tolist()
LazBF_labels = np.array(df['labels'].tolist())

df = pd.read_csv('../Data/LazBF_sample.csv')
LazBF_sample = df['sequences'].tolist()
LazBF_sample_labels = np.array(df['labels'].tolist())

df = pd.read_csv('../Data/LazDEF_sequences.csv')
LazDEF_sequences = df['sequences'].tolist()
LazDEF_labels = np.array(df['labels'].tolist())

df = pd.read_csv('../Data/LazDEF_sample.csv')
LazDEF_sample = df['sequences'].tolist()
LazDEF_sample_labels = np.array(df['labels'].tolist())

# Load Embs
lazbf_mlm_none = np.load("../Embeddings/LazBF_mlm_none.npy")
lazdef_mlm_none = np.load("../Embeddings/LazDEF_mlm_none.npy")

lazbf_mlm_pa = np.load("../Embeddings/LazBF_mlm_PA.npy")
lazdef_mlm_pa = np.load("../Embeddings/LazDEF_mlm_PA.npy")

lazbf_mlm_lazbf = np.load("../Embeddings/LazBF_mlm_LazBF.npy")
lazdef_mlm_lazbf = np.load("../Embeddings/LazDEF_mlm_LazBF.npy")

lazbf_mlm_lazdef = np.load("../Embeddings/LazBF_mlm_LazDEF.npy")
# lazbf_mlm_lazdefLR04 = np.load("../Embeddings/LazBF_mlm_LazDEF_lr0.npy") # LR 3e-4 embeddings
# lazbf_mlm_lazdefLR05 = np.load("../Embeddings/LazBF_mlm_LazDEF_lr1.npy") # LR 3e-5 embeddings

lazdef_mlm_lazdef = np.load("../Embeddings/LazDEF_mlm_LazDEF.npy")
# lazdef_mlm_lazdefLR04 = np.load("../Embeddings/LazDEF_mlm_LazDEF_lr0.npy") # LR 3e-4 embeddings
# lazdef_mlm_lazdefLR05 = np.load("../Embeddings/LazDEF_mlm_LazDEF_lr1.npy") # LR 3e-5 embeddings

lazbf_mlm_lazbcdef = np.load("../Embeddings/LazBF_mlm_LazBCDEF.npy")
lazdef_mlm_lazbcdef = np.load("../Embeddings/LazDEF_mlm_LazBCDEF.npy")

# Hyperparameter grid
model_list = [LogisticRegression, RandomForestClassifier, AdaBoostClassifier, SVC, MLPClassifier]
param_list = ['C', 'n_estimators', 'n_estimators', 'C', 'hidden_layer_sizes']
value_list = [
   [1e-2, 1e-1, 1, 5], # LR
  #  [5, 25, 50, 80], # KNN was removed due to poor performance
   [5, 25, 50, 100], # RF
   [5, 25, 50, 100], # Ada
   [1e-2, 1e-1, 1, 5], # SVC
   [50, 100, 200, 500], # MLP
]

def balanced_sample_np(seqs, labels, N, seed):
    labels = np.array(labels)
    indices_0 = np.where(labels == 0)[0]
    indices_1 = np.where(labels == 1)[0]
    min_count = min(len(indices_0), len(indices_1), N // 2)
    sampled_indices_0 = np.random.choice(indices_0, min_count, replace=False)
    sampled_indices_1 = np.random.choice(indices_1, min_count, replace=False)
    sampled_indices = np.concatenate((sampled_indices_0, sampled_indices_1))
    np.random.seed(seed)
    random.seed(seed)
    np.random.shuffle(sampled_indices)
    return sampled_indices

In [19]:
#@title LazDEF LowN
np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 200, 42)

y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazdef prediction
  print(f"Low-N, LazDEF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_none[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"Low-N, LazDEF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_pa[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"Low-N, LazDEF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbf[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"Low-N, LazDEF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbcdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

Low-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  3.99it/s]


Accuracy: 0.7389999999999999, +/- 0.004
F1: 0.7399777184318039, +/- 0.005525625059016988
Precision: 0.7436258564725383, +/- 0.005393753509868707
Recall: 0.746, +/- 0.010295630140986995
Auroc: 0.7390000000000001, +/- 0.00400000000000001
top_value: 0.01
Low-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  3.99it/s]


Accuracy: 0.7620000000000001, +/- 0.009565563234854496
F1: 0.7589466363821733, +/- 0.010120926688566956
Precision: 0.7688851047249218, +/- 0.011237464139382787
Recall: 0.758, +/- 0.011135528725660053
Auroc: 0.7619999999999999, +/- 0.009565563234854503
top_value: 0.1
Low-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.02it/s]


Accuracy: 0.804, +/- 0.0074833147735478955
F1: 0.7979537289732513, +/- 0.008310188435131404
Precision: 0.8164555803897573, +/- 0.011893685160067339
Recall: 0.79, +/- 0.008944271909999154
Auroc: 0.804, +/- 0.007483314773547894
top_value: 0.01
Low-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  3.57it/s]


Accuracy: 0.9490000000000001, +/- 0.0033166247903553808
F1: 0.9493812676343648, +/- 0.003566291075739647
Precision: 0.9412455411954672, +/- 0.0032558048240550717
Recall: 0.96, +/- 0.005477225575051646
Auroc: 0.9490000000000001, +/- 0.0033166247903553808
top_value: 0.01
Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  3.74it/s]


Accuracy: 0.774, +/- 0.006964194138592072
F1: 0.773093876484807, +/- 0.008620177446833108
Precision: 0.773687240460292, +/- 0.004359385480511035
Recall: 0.7779999999999999, +/- 0.014628738838327781
Auroc: 0.774, +/- 0.006964194138592072
top_value: 0.1
Low-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


Accuracy: 0.7110000000000001, +/- 0.007648529270389171
F1: 0.7107279444562187, +/- 0.007316894079230398
Precision: 0.7097711984564993, +/- 0.009416010443162696
Recall: 0.716, +/- 0.005099019513592783
Auroc: 0.7110000000000001, +/- 0.007648529270389171
top_value: 100
Low-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.05it/s]


Accuracy: 0.7110000000000001, +/- 0.0053385391260156526
F1: 0.7030307381791102, +/- 0.006735620018662061
Precision: 0.7302158305716657, +/- 0.007550401740084453
Recall: 0.6900000000000001, +/- 0.00836660026534075
Auroc: 0.7110000000000001, +/- 0.005338539126015659
top_value: 100
Low-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


Accuracy: 0.7740000000000001, +/- 0.0053385391260156595
F1: 0.7751162720322632, +/- 0.004823072549529066
Precision: 0.771089226308906, +/- 0.005963251848460718
Recall: 0.7860000000000001, +/- 0.0040000000000000036
Auroc: 0.7740000000000001, +/- 0.0053385391260156595
top_value: 100
Low-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Accuracy: 0.9440000000000002, +/- 0.0018708286933869513
F1: 0.944657970901336, +/- 0.001744326586657569
Precision: 0.9338246076323878, +/- 0.00021250830894575608
Recall: 0.958, +/- 0.0037416573867739447
Auroc: 0.944, +/- 0.0018708286933869483
top_value: 25
Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:05<00:00,  1.33s/it]


Accuracy: 0.71, +/- 0.00790569415042097
F1: 0.7068145502220793, +/- 0.0070511123370003105
Precision: 0.7195338405032846, +/- 0.01109333824406247
Recall: 0.7020000000000001, +/- 0.007348469228349553
Auroc: 0.71, +/- 0.007905694150420972
top_value: 25
Low-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:30<00:00,  7.70s/it]


Accuracy: 0.717, +/- 0.009165151389911676
F1: 0.7172064763470768, +/- 0.011225429858893953
Precision: 0.7203310851365772, +/- 0.007075806820974103
Recall: 0.724, +/- 0.017204650534085254
Auroc: 0.717, +/- 0.009165151389911691
top_value: 100
Low-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:30<00:00,  7.72s/it]


Accuracy: 0.7229999999999999, +/- 0.016093476939431105
F1: 0.7224597158685019, +/- 0.016378802564009184
Precision: 0.7195621791663116, +/- 0.016538747650922992
Recall: 0.736, +/- 0.018867962264113195
Auroc: 0.7229999999999999, +/- 0.016093476939431084
top_value: 25
Low-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:31<00:00,  7.88s/it]


Accuracy: 0.7230000000000001, +/- 0.007176350047203669
F1: 0.7294041174812924, +/- 0.006930461986071933
Precision: 0.7210979775901298, +/- 0.010408059432581985
Recall: 0.748, +/- 0.00860232526704262
Auroc: 0.7230000000000001, +/- 0.0071763500472036765
top_value: 25
Low-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:30<00:00,  7.67s/it]


Accuracy: 0.9260000000000002, +/- 0.004301162633521304
F1: 0.9261518392096055, +/- 0.004423921664625363
Precision: 0.9225430242272348, +/- 0.0035919774180353872
Recall: 0.932, +/- 0.008602325267042622
Auroc: 0.9259999999999999, +/- 0.004301162633521347
top_value: 25
Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:30<00:00,  7.65s/it]


Accuracy: 0.701, +/- 0.016232683080747937
F1: 0.694965039728416, +/- 0.01719815223103731
Precision: 0.7063777619697124, +/- 0.017624163625963165
Recall: 0.692, +/- 0.019078784028338906
Auroc: 0.701, +/- 0.016232683080747958
top_value: 50
Low-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  5.79it/s]


Accuracy: 0.718, +/- 0.009823441352194252
F1: 0.7180276403177558, +/- 0.011386244639184705
Precision: 0.7186635921354186, +/- 0.010649238614028158
Recall: 0.722, +/- 0.014966629547095767
Auroc: 0.718, +/- 0.009823441352194238
top_value: 1
Low-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  6.16it/s]


Accuracy: 0.7060000000000001, +/- 0.010049875621120889
F1: 0.7067596100290923, +/- 0.00955815290371746
Precision: 0.7051904845218347, +/- 0.011265550337894963
Recall: 0.716, +/- 0.010295630140987007
Auroc: 0.7060000000000001, +/- 0.010049875621120889
top_value: 5
Low-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  5.97it/s]


Accuracy: 0.783, +/- 0.005147815070493482
F1: 0.7892079274241407, +/- 0.005016274612634112
Precision: 0.7697630510179709, +/- 0.00484125259310091
Recall: 0.8160000000000001, +/- 0.005099019513592761
Auroc: 0.783, +/- 0.005147815070493501
top_value: 1
Low-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  7.07it/s]


Accuracy: 0.95, +/- 0.0
F1: 0.9510646978281692, +/- 0.0001480759398167369
Precision: 0.934620781202017, +/- 0.0007247733105255653
Recall: 0.97, +/- 0.0
Auroc: 0.95, +/- 0.0
top_value: 0.1
Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  5.87it/s]


Accuracy: 0.743, +/- 0.007681145747868606
F1: 0.7428208473849793, +/- 0.008861826747191446
Precision: 0.7526797631851206, +/- 0.00629419091704611
Recall: 0.744, +/- 0.012489995996796812
Auroc: 0.743, +/- 0.007681145747868612
top_value: 5
Low-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:11<00:00,  2.82s/it]


Accuracy: 0.743, +/- 0.008000000000000004
F1: 0.7475450481281705, +/- 0.008493823687993137
Precision: 0.738954197637264, +/- 0.005700862238524244
Recall: 0.7619999999999999, +/- 0.011575836902790213
Auroc: 0.743, +/- 0.007999999999999986
top_value: 500
Low-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.44s/it]


Accuracy: 0.752, +/- 0.005612486080160898
F1: 0.7531431617370656, +/- 0.0064378897229050765
Precision: 0.7537851587039365, +/- 0.005216790005606379
Recall: 0.76, +/- 0.01140175425099139
Auroc: 0.752, +/- 0.005612486080160901
top_value: 500
Low-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:11<00:00,  2.84s/it]


Accuracy: 0.768, +/- 0.01044030650891052
F1: 0.7688714249689262, +/- 0.010865894264330716
Precision: 0.7688978554656034, +/- 0.013221810624342833
Recall: 0.776, +/- 0.013999999999999969
Auroc: 0.7680000000000001, +/- 0.010440306508910546
top_value: 100
Low-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:10<00:00,  2.59s/it]


Accuracy: 0.932, +/- 0.001224744871391563
F1: 0.9330629084269685, +/- 0.0012169405888758443
Precision: 0.9261151319229123, +/- 0.0014899901603156864
Recall: 0.944, +/- 0.002449489742783171
Auroc: 0.932, +/- 0.001224744871391563
top_value: 50
Low-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:16<00:00,  4.02s/it]


Accuracy: 0.7609999999999999, +/- 0.005099019513592807
F1: 0.7564071663580874, +/- 0.006652824441700825
Precision: 0.7765156813991112, +/- 0.0011884981917575335
Recall: 0.744, +/- 0.012083045973594582
Auroc: 0.7609999999999999, +/- 0.005099019513592796
top_value: 100
[[0.7389999999999999, 0.7110000000000001, 0.717, 0.718, 0.743], [0.7620000000000001, 0.7110000000000001, 0.7229999999999999, 0.7060000000000001, 0.752], [0.804, 0.7740000000000001, 0.7230000000000001, 0.783, 0.768], [0.9490000000000001, 0.9440000000000002, 0.9260000000000002, 0.95, 0.932], [0.774, 0.71, 0.701, 0.743, 0.7609999999999999]]
[[0.004, 0.007648529270389171, 0.009165151389911676, 0.009823441352194252, 0.008000000000000004], [0.009565563234854496, 0.0053385391260156526, 0.016093476939431105, 0.010049875621120889, 0.005612486080160898], [0.0074833147735478955, 0.0053385391260156595, 0.007176350047203669, 0.005147815070493482, 0.01044030650891052], [0.0033166247903553808, 0.0018708286933869513, 0.004301162633521304,

In [6]:
#@title LazDEF MedN
np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 500, 42)

y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazdef prediction
  print(f"Med-N, LazDEF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_none[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"Med-N, LazDEF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_pa[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"Med-N, LazDEF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbf[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"Med-N, LazDEF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbcdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

Med-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.33it/s]


Accuracy: 0.8228, +/- 0.0033823069050575674
F1: 0.8238244519536856, +/- 0.004224457694936874
Precision: 0.8201354701399846, +/- 0.0014735687910653488
Recall: 0.8304, +/- 0.008634813257969162
Auroc: 0.8228, +/- 0.00338230690505754
top_value: 0.1
Med-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.46it/s]


Accuracy: 0.82, +/- 0.0016733200530681391
F1: 0.8196772886984729, +/- 0.0015859515351754206
Precision: 0.8213516621812031, +/- 0.003977422784700294
Recall: 0.8224, +/- 0.005455272678794333
Auroc: 0.82, +/- 0.0016733200530681391
top_value: 0.1
Med-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.60it/s]


Accuracy: 0.8583999999999999, +/- 0.004874423042781561
F1: 0.8606006914619341, +/- 0.0049864278655176045
Precision: 0.8491680635771784, +/- 0.004732486183391774
Recall: 0.8744, +/- 0.006273754856543241
Auroc: 0.8583999999999999, +/- 0.004874423042781561
top_value: 0.01
Med-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.89it/s]


Accuracy: 0.9515999999999998, +/- 0.0023151673805580455
F1: 0.9516916854749005, +/- 0.0024015179781918168
Precision: 0.9504339420986122, +/- 0.0024105458135728047
Recall: 0.9544, +/- 0.0029933259094191526
Auroc: 0.9516, +/- 0.0023151673805580663
top_value: 0.01
Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.88it/s]


Accuracy: 0.844, +/- 0.007293833011524208
F1: 0.8449391369035263, +/- 0.007351088783665696
Precision: 0.8385217739528148, +/- 0.006739847725091841
Recall: 0.8528, +/- 0.008708616422830932
Auroc: 0.844, +/- 0.007293833011524228
top_value: 0.1
Med-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:10<00:00,  2.63s/it]


Accuracy: 0.7512000000000001, +/- 0.003382306905057547
F1: 0.7480877515461618, +/- 0.0034433631395550776
Precision: 0.758177227128668, +/- 0.004214294702872652
Recall: 0.7416, +/- 0.005741080037762917
Auroc: 0.7512000000000001, +/- 0.0033823069050575552
top_value: 100
Med-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.49s/it]


Accuracy: 0.77, +/- 0.004816637831516922
F1: 0.7649046896282904, +/- 0.005467413136378752
Precision: 0.7833873748781477, +/- 0.006304452918249785
Recall: 0.7512, +/- 0.0073102667529988405
Auroc: 0.77, +/- 0.00481663783151691
top_value: 50
Med-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:10<00:00,  2.65s/it]


Accuracy: 0.8012, +/- 0.0026532998322843456
F1: 0.8055064819235108, +/- 0.00298431067268399
Precision: 0.7894658482667463, +/- 0.0018838064379615108
Recall: 0.8248000000000001, +/- 0.00557135531087363
Auroc: 0.8012, +/- 0.0026532998322843196
top_value: 100
Med-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


Accuracy: 0.9559999999999998, +/- 0.0008944271909999167
F1: 0.9563821345843143, +/- 0.0008644693140523081
Precision: 0.9494399930912536, +/- 0.0010352056259707145
Recall: 0.9640000000000001, +/- 0.0012649110640673528
Auroc: 0.9560000000000001, +/- 0.0008944271909998918
top_value: 100
Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.38s/it]


Accuracy: 0.7668000000000001, +/- 0.006248199740725329
F1: 0.7665294692594042, +/- 0.006154908501303382
Precision: 0.767933678533593, +/- 0.008527509146026756
Recall: 0.768, +/- 0.005932958789676524
Auroc: 0.7668000000000001, +/- 0.006248199740725329
top_value: 100
Med-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.06s/it]


Accuracy: 0.7348, +/- 0.008064738061462374
F1: 0.733360732252393, +/- 0.00914341608525617
Precision: 0.7383210072777056, +/- 0.008048967168900054
Recall: 0.7319999999999999, +/- 0.01232882800593796
Auroc: 0.7348000000000001, +/- 0.008064738061462375
top_value: 50
Med-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:18<00:00, 19.59s/it]


Accuracy: 0.7671999999999999, +/- 0.007059745037889142
F1: 0.768306099534122, +/- 0.006761863458684522
Precision: 0.7659354497363564, +/- 0.007014241425616104
Recall: 0.7736000000000001, +/- 0.006997142273814367
Auroc: 0.7672000000000001, +/- 0.00705974503788911
top_value: 100
Med-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.05s/it]


Accuracy: 0.7992000000000001, +/- 0.006183849933496142
F1: 0.8006983385260618, +/- 0.005548910846117608
Precision: 0.7946515095605843, +/- 0.00804845629228366
Recall: 0.8088, +/- 0.004079215610874238
Auroc: 0.7991999999999999, +/- 0.006183849933496143
top_value: 100
Med-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:19<00:00, 19.87s/it]


Accuracy: 0.954, +/- 0.002097617696340305
F1: 0.9541411266654503, +/- 0.0021839303936063074
Precision: 0.9512380945621617, +/- 0.0027286603530782974
Recall: 0.9576, +/- 0.004308131845707595
Auroc: 0.954, +/- 0.002097617696340305
top_value: 100
Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:19<00:00, 19.80s/it]


Accuracy: 0.7555999999999999, +/- 0.0047916594202843865
F1: 0.7580062564474408, +/- 0.0053450417248107635
Precision: 0.7504004165825785, +/- 0.004110908110452678
Recall: 0.7687999999999999, +/- 0.007939773296511667
Auroc: 0.7555999999999999, +/- 0.0047916594202843865
top_value: 100
Med-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:03<00:00,  1.24it/s]


Accuracy: 0.7836000000000001, +/- 0.003600000000000003
F1: 0.7871467299340537, +/- 0.003762025425439143
Precision: 0.7756812045289125, +/- 0.003429194776861904
Recall: 0.8008, +/- 0.00480000000000001
Auroc: 0.7836000000000001, +/- 0.003600000000000003
top_value: 1
Med-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.64it/s]


Accuracy: 0.7863999999999999, +/- 0.0027856776554368514
F1: 0.7885662254432733, +/- 0.0030344045015438933
Precision: 0.7846840275375282, +/- 0.0061069628823090085
Recall: 0.7968, +/- 0.004630334761116096
Auroc: 0.7863999999999999, +/- 0.0027856776554368514
top_value: 1
Med-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.69it/s]


Accuracy: 0.8284, +/- 0.0036000000000000042
F1: 0.8310041457838535, +/- 0.0036006270743420285
Precision: 0.8173383144027504, +/- 0.005368631900472438
Recall: 0.8488, +/- 0.00542586398650022
Auroc: 0.8284, +/- 0.0035999999999999747
top_value: 1
Med-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.99it/s]


Accuracy: 0.9560000000000001, +/- 8.599750569898515e-17
F1: 0.9565752611527515, +/- 9.086284434226312e-05
Precision: 0.9460087981652908, +/- 0.00045601411890192246
Recall: 0.968, +/- 0.0
Auroc: 0.9559999999999998, +/- 4.965068306494546e-17
top_value: 0.1
Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.66it/s]


Accuracy: 0.8144, +/- 0.004578209256903825
F1: 0.8173454282610388, +/- 0.004362197630067833
Precision: 0.806874998969471, +/- 0.004766628730238804
Recall: 0.8304, +/- 0.004664761515876244
Auroc: 0.8144, +/- 0.004578209256903863
top_value: 5
Med-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:27<00:00,  6.95s/it]


Accuracy: 0.7868, +/- 0.006651315659326361
F1: 0.7886874216947789, +/- 0.0065146906664493645
Precision: 0.7823578994527683, +/- 0.007162024971184331
Recall: 0.7984, +/- 0.008158431221748447
Auroc: 0.7868, +/- 0.0066513156593263475
top_value: 500
Med-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:27<00:00,  6.92s/it]


Accuracy: 0.7964, +/- 0.005564171097297435
F1: 0.7961320908398637, +/- 0.006017978546437588
Precision: 0.8006639107632306, +/- 0.004178575774139999
Recall: 0.7951999999999999, +/- 0.008616263691415197
Auroc: 0.7963999999999999, +/- 0.005564171097297401
top_value: 50
Med-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:28<00:00,  7.03s/it]


Accuracy: 0.8535999999999999, +/- 0.003544009029333893
F1: 0.8561159297776193, +/- 0.00323554598900907
Precision: 0.8438812908489147, +/- 0.005144270100644205
Recall: 0.8712, +/- 0.0023323807579381183
Auroc: 0.8535999999999999, +/- 0.003544009029333836
top_value: 100
Med-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:20<00:00,  5.04s/it]


Accuracy: 0.9508000000000001, +/- 0.002059126028197403
F1: 0.9508332801218282, +/- 0.0019213427384850536
Precision: 0.9502893687815748, +/- 0.00393847569908117
Recall: 0.9527999999999999, +/- 0.0007999999999999784
Auroc: 0.9508000000000001, +/- 0.002059126028197394
top_value: 50
Med-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:40<00:00, 10.12s/it]


Accuracy: 0.82, +/- 0.004898979485566342
F1: 0.8219681984498457, +/- 0.005480876217996082
Precision: 0.8123568021273035, +/- 0.0034937695184307036
Recall: 0.8336, +/- 0.0077562877718661315
Auroc: 0.8200000000000001, +/- 0.00489897948556637
top_value: 50
[[0.8228, 0.7512000000000001, 0.7348, 0.7836000000000001, 0.7868], [0.82, 0.77, 0.7671999999999999, 0.7863999999999999, 0.7964], [0.8583999999999999, 0.8012, 0.7992000000000001, 0.8284, 0.8535999999999999], [0.9515999999999998, 0.9559999999999998, 0.954, 0.9560000000000001, 0.9508000000000001], [0.844, 0.7668000000000001, 0.7555999999999999, 0.8144, 0.82]]
[[0.0033823069050575674, 0.003382306905057547, 0.008064738061462374, 0.003600000000000003, 0.006651315659326361], [0.0016733200530681391, 0.004816637831516922, 0.007059745037889142, 0.0027856776554368514, 0.005564171097297435], [0.004874423042781561, 0.0026532998322843456, 0.006183849933496142, 0.0036000000000000042, 0.003544009029333893], [0.0023151673805580455, 0.0008944271909999167

In [7]:
#@title LazDEF HighN
np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 1000, 42)

y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazdef prediction
  print(f"High-N, LazDEF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_none[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"High-N, LazDEF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_pa[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"High-N, LazDEF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbf[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"High-N, LazDEF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"High-N, LazDEF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazdef_mlm_lazbcdef[idxs], LazDEF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

High-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Accuracy: 0.8674, +/- 0.0030099833886584673
F1: 0.8683821037038282, +/- 0.0031279488524124984
Precision: 0.8638400485664544, +/- 0.0027200696694857935
Recall: 0.874, +/- 0.003794733192202085
Auroc: 0.8674, +/- 0.0030099833886584673
top_value: 0.1
High-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:03<00:00,  1.04it/s]


Accuracy: 0.8630000000000001, +/- 0.0015491933384829716
F1: 0.8625889424806121, +/- 0.0016328261401224876
Precision: 0.8661081595956599, +/- 0.002239418998564068
Recall: 0.8603999999999999, +/- 0.003187475490101856
Auroc: 0.8630000000000001, +/- 0.0015491933384829716
top_value: 5
High-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.58it/s]


Accuracy: 0.8984, +/- 0.002135415650406234
F1: 0.8997423292141432, +/- 0.0019973482424223044
Precision: 0.8894098700909356, +/- 0.0031586596062096596
Recall: 0.9107999999999998, +/- 0.0018547236990991446
Auroc: 0.8984, +/- 0.002135415650406234
top_value: 1
High-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.45it/s]


Accuracy: 0.9686, +/- 0.0006782329983125471
F1: 0.9686000928108032, +/- 0.0006906471367077056
Precision: 0.9682911488988198, +/- 0.0012836513066793068
Recall: 0.9692000000000001, +/- 0.0013564659966250482
Auroc: 0.9686, +/- 0.0006782329983125471
top_value: 0.01
High-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


Accuracy: 0.8876000000000002, +/- 0.0033256578296631655
F1: 0.8885485335652008, +/- 0.0033947391411545285
Precision: 0.8814863556694432, +/- 0.0030012668617170795
Recall: 0.8964000000000001, +/- 0.004534313619501851
Auroc: 0.8876000000000002, +/- 0.0033256578296631655
top_value: 0.1
High-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.98s/it]


Accuracy: 0.7807999999999999, +/- 0.0036110940170535418
F1: 0.7764967117189239, +/- 0.0036967166653670666
Precision: 0.7918133920579777, +/- 0.0038662800459269312
Recall: 0.7624000000000001, +/- 0.0037094473981982724
Auroc: 0.7807999999999999, +/- 0.0036110940170535413
top_value: 100
High-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.80s/it]


Accuracy: 0.7936, +/- 0.004202380277890136
F1: 0.7874688823678329, +/- 0.004340886404302301
Precision: 0.811475476483416, +/- 0.004684251421093997
Recall: 0.766, +/- 0.004427188724235735
Auroc: 0.7936, +/- 0.004202380277890106
top_value: 50
High-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.76s/it]


Accuracy: 0.8081999999999999, +/- 0.00293938769133976
F1: 0.807548699316774, +/- 0.0025916737050808232
Precision: 0.8116240742065102, +/- 0.00415597817944437
Recall: 0.8051999999999999, +/- 0.002416609194718886
Auroc: 0.8082, +/- 0.002939387691339819
top_value: 50
High-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:16<00:00,  4.23s/it]


Accuracy: 0.9673999999999999, +/- 0.0005099019513592942
F1: 0.9674439133332011, +/- 0.00047821444416308763
Precision: 0.967198817042736, +/- 0.0008886741273385014
Recall: 0.968, +/- 0.0
Auroc: 0.9673999999999999, +/- 0.0005099019513592942
top_value: 100
High-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:22<00:00,  5.61s/it]


Accuracy: 0.7811999999999999, +/- 0.0028705400188814676
F1: 0.7803609719977922, +/- 0.003248248234473873
Precision: 0.783100687942746, +/- 0.002186951011999653
Recall: 0.7783999999999999, +/- 0.00462168800331653
Auroc: 0.7812, +/- 0.002870540018881442
top_value: 100
High-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:45<00:00, 41.48s/it]


Accuracy: 0.7767999999999999, +/- 0.006865857557508741
F1: 0.7775150778500619, +/- 0.0070428277305649096
Precision: 0.7768085206840617, +/- 0.007042122864593422
Recall: 0.78, +/- 0.007949842765740735
Auroc: 0.7767999999999999, +/- 0.006865857557508741
top_value: 100
High-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:46<00:00, 41.55s/it]


Accuracy: 0.7806000000000001, +/- 0.003458323293158104
F1: 0.7823204288272401, +/- 0.0035009705432363708
Precision: 0.777697782125936, +/- 0.0041322363089752725
Recall: 0.7888, +/- 0.0056071383075504575
Auroc: 0.7806000000000001, +/- 0.0034583232931581
top_value: 100
High-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:46<00:00, 41.60s/it]


Accuracy: 0.8395999999999999, +/- 0.004707440918375944
F1: 0.8418229064238603, +/- 0.004698104774892009
Precision: 0.8307320318468878, +/- 0.00648231203029491
Recall: 0.8547999999999998, +/- 0.00826075057122533
Auroc: 0.8395999999999999, +/- 0.004707440918375944
top_value: 100
High-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:46<00:00, 41.75s/it]


Accuracy: 0.9634, +/- 0.001568438714135817
F1: 0.963303378854308, +/- 0.0015764156533456548
Precision: 0.9651381861016214, +/- 0.0023426631018089983
Recall: 0.962, +/- 0.0016733200530681658
Auroc: 0.9634, +/- 0.001568438714135817
top_value: 25
High-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:46<00:00, 41.65s/it]


Accuracy: 0.7928000000000001, +/- 0.006522269543648137
F1: 0.7940057832781646, +/- 0.007362579982449268
Precision: 0.7882340564528585, +/- 0.0041539239276135505
Recall: 0.8008000000000001, +/- 0.011128342194594838
Auroc: 0.7928, +/- 0.006522269543648144
top_value: 100
High-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Accuracy: 0.8156000000000001, +/- 0.0029257477676655416
F1: 0.8191963680017018, +/- 0.0033869949687400634
Precision: 0.8041270961654877, +/- 0.0011852262326566157
Recall: 0.8360000000000001, +/- 0.005932958789676482
Auroc: 0.8155999999999999, +/- 0.002925747767665547
top_value: 1
High-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Accuracy: 0.8154, +/- 0.002638181191654581
F1: 0.8157429374330454, +/- 0.0025413000771111094
Precision: 0.8154875596301778, +/- 0.0038913414284277095
Recall: 0.8171999999999999, +/- 0.003382306905057532
Auroc: 0.8154, +/- 0.0026381811916545827
top_value: 5
High-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.19s/it]


Accuracy: 0.8692, +/- 0.002059126028197393
F1: 0.8719321830644582, +/- 0.0020813680373721576
Precision: 0.8557768044801006, +/- 0.0027897245874007686
Recall: 0.8896, +/- 0.002638181191654586
Auroc: 0.8692, +/- 0.0020591260281974254
top_value: 5
High-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:05<00:00,  1.32s/it]


Accuracy: 0.9681999999999998, +/- 0.00037416573867739446
F1: 0.968140654349264, +/- 0.000382231195963612
Precision: 0.9693820642420583, +/- 0.00040807272048703087
Recall: 0.9672000000000001, +/- 0.0004898979485566361
Auroc: 0.9681999999999998, +/- 0.00037416573867739446
top_value: 1
High-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.03s/it]


Accuracy: 0.8326, +/- 0.0017204650534085254
F1: 0.8355223745498017, +/- 0.0018042539079252892
Precision: 0.8226319335858658, +/- 0.0018954736282500847
Recall: 0.8496, +/- 0.0031240998703626756
Auroc: 0.8326, +/- 0.0017204650534085356
top_value: 5
High-N, LazDEF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [01:00<00:00, 15.02s/it]


Accuracy: 0.8124, +/- 0.001536229149573728
F1: 0.813035639013529, +/- 0.001434023228979691
Precision: 0.8102221639257365, +/- 0.002341828194106894
Recall: 0.8168, +/- 0.0013564659966250417
Auroc: 0.8124, +/- 0.0015362291495737092
top_value: 100
High-N, LazDEF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:59<00:00, 14.85s/it]


Accuracy: 0.836, +/- 0.0028106938645110296
F1: 0.836674447857483, +/- 0.002925896790018023
Precision: 0.8324032508979305, +/- 0.0026181848363812563
Recall: 0.842, +/- 0.0037416573867739447
Auroc: 0.836, +/- 0.0028106938645110417
top_value: 200
High-N, LazDEF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:57<00:00, 14.28s/it]


Accuracy: 0.8821999999999999, +/- 0.0035128336140500785
F1: 0.8841173586879236, +/- 0.0035384634801531125
Precision: 0.8698892979033378, +/- 0.0031098122535295624
Recall: 0.8996000000000001, +/- 0.005075431016179813
Auroc: 0.8821999999999999, +/- 0.0035128336140500655
top_value: 100
High-N, LazDEF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:35<00:00,  8.86s/it]


Accuracy: 0.9642, +/- 0.0010677078252031371
F1: 0.9641790036658147, +/- 0.0010439184046972404
Precision: 0.9644957351091954, +/- 0.001345391987038673
Recall: 0.9644, +/- 0.000748331477354771
Auroc: 0.9642, +/- 0.0010677078252031278
top_value: 50
High-N, LazDEF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.22s/it]


Accuracy: 0.85, +/- 0.0017888543819998114
F1: 0.8508347946528699, +/- 0.0021987509652480622
Precision: 0.845059364858969, +/- 0.0017483973386930212
Recall: 0.8580000000000002, +/- 0.0051380930314660535
Auroc: 0.85, +/- 0.00178885438199983
top_value: 50
[[0.8674, 0.7807999999999999, 0.7767999999999999, 0.8156000000000001, 0.8124], [0.8630000000000001, 0.7936, 0.7806000000000001, 0.8154, 0.836], [0.8984, 0.8081999999999999, 0.8395999999999999, 0.8692, 0.8821999999999999], [0.9686, 0.9673999999999999, 0.9634, 0.9681999999999998, 0.9642], [0.8876000000000002, 0.7811999999999999, 0.7928000000000001, 0.8326, 0.85]]
[[0.0030099833886584673, 0.0036110940170535418, 0.006865857557508741, 0.0029257477676655416, 0.001536229149573728], [0.0015491933384829716, 0.004202380277890136, 0.003458323293158104, 0.002638181191654581, 0.0028106938645110296], [0.002135415650406234, 0.00293938769133976, 0.004707440918375944, 0.002059126028197393, 0.0035128336140500785], [0.0006782329983125471, 0.000509901951359

In [16]:
#@title LazBF LowN

np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 200, 42)
y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazbf prediction
  print(f"Low-N, LazBF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_none[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"Low-N, LazBF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_pa[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"Low-N, LazBF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbf[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"Low-N, LazBF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"Low-N, LazBF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbcdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

Low-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.62it/s]


Accuracy: 0.868, +/- 0.0025495097567963944
F1: 0.8670443279304969, +/- 0.003446128622598829
Precision: 0.8709614546324735, +/- 0.0034416746576243383
Recall: 0.868, +/- 0.005830951894845304
Auroc: 0.868, +/- 0.0025495097567963944
top_value: 0.1
Low-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.63it/s]


Accuracy: 0.877, +/- 0.005830951894845287
F1: 0.8785867750830583, +/- 0.005697179544604169
Precision: 0.8728210493295162, +/- 0.005779407601436075
Recall: 0.89, +/- 0.007071067811865473
Auroc: 0.877, +/- 0.005830951894845287
top_value: 0.01
Low-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:00<00:00,  5.60it/s]


Accuracy: 0.9789999999999999, +/- 0.0010000000000000009
F1: 0.9793597784329492, +/- 0.0009999410398611218
Precision: 0.9715064935064935, +/- 0.00021748669588018133
Recall: 0.9880000000000001, +/- 0.0019999999999999797
Auroc: 0.9789999999999999, +/- 0.0010000000000000009
top_value: 0.01
Low-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:00<00:00,  4.84it/s]


Accuracy: 0.8880000000000001, +/- 0.0043588989435406605
F1: 0.8846921459617778, +/- 0.0042167688550813895
Precision: 0.9039103935653943, +/- 0.005049244988643448
Recall: 0.876, +/- 0.0024494897427831982
Auroc: 0.8879999999999999, +/- 0.004358898943540665
top_value: 0.01
Low-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:00<00:00,  4.21it/s]


Accuracy: 0.8629999999999999, +/- 0.007176350047203658
F1: 0.8658243698564922, +/- 0.007230850332972614
Precision: 0.8538835989842853, +/- 0.007657905510559336
Recall: 0.8799999999999999, +/- 0.007745966692414842
Auroc: 0.8629999999999999, +/- 0.007176350047203631
top_value: 5
Low-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


Accuracy: 0.86, +/- 0.005700877125495719
F1: 0.8585768817823032, +/- 0.005810703539850774
Precision: 0.8712044367292717, +/- 0.00884671286933755
Recall: 0.85, +/- 0.006324555320336721
Auroc: 0.86, +/- 0.005700877125495709
top_value: 100
Low-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


Accuracy: 0.867, +/- 0.00644204936336258
F1: 0.8659342065031531, +/- 0.006843183001109756
Precision: 0.8745999111939347, +/- 0.003742337166881564
Recall: 0.866, +/- 0.011661903789690627
Auroc: 0.867, +/- 0.006442049363362557
top_value: 100
Low-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:02<00:00,  1.69it/s]


Accuracy: 0.977, +/- 0.0012247448713916173
F1: 0.9773526311087286, +/- 0.0012769710805249092
Precision: 0.9717575757575758, +/- 0.0002585107987330409
Recall: 0.984, +/- 0.0024494897427831527
Auroc: 0.977, +/- 0.0012247448713916173
top_value: 50
Low-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


Accuracy: 0.8880000000000001, +/- 0.008154753215150053
F1: 0.8875112924166635, +/- 0.008384356991637863
Precision: 0.8968473557297759, +/- 0.009455595157047246
Recall: 0.884, +/- 0.011661903789690599
Auroc: 0.8880000000000001, +/- 0.008154753215150043
top_value: 50
Low-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:05<00:00,  1.37s/it]


Accuracy: 0.8260000000000002, +/- 0.004301162633521319
F1: 0.8312087829521058, +/- 0.004326145382037029
Precision: 0.8191512273565712, +/- 0.00609117970722491
Recall: 0.85, +/- 0.005477225575051626
Auroc: 0.826, +/- 0.004301162633521347
top_value: 100
Low-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:32<00:00,  8.07s/it]


Accuracy: 0.842, +/- 0.010074720839804983
F1: 0.8411465650455412, +/- 0.010395206582028992
Precision: 0.8447360787460397, +/- 0.010238557518801539
Recall: 0.842, +/- 0.01157583690279023
Auroc: 0.842, +/- 0.010074720839804983
top_value: 100
Low-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:31<00:00,  7.99s/it]


Accuracy: 0.8320000000000001, +/- 0.010793516572461468
F1: 0.8299022755223012, +/- 0.011147815061907184
Precision: 0.8421461318179585, +/- 0.011172993690532105
Recall: 0.8239999999999998, +/- 0.015033296378372916
Auroc: 0.8320000000000001, +/- 0.010793516572461468
top_value: 25
Low-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:20<00:00,  5.13s/it]


Accuracy: 0.977, +/- 0.00339116499156265
F1: 0.9769897595263449, +/- 0.003437977751788613
Precision: 0.9751934381408066, +/- 0.0038345395300867723
Recall: 0.9799999999999999, +/- 0.004472135954999583
Auroc: 0.977, +/- 0.00339116499156265
top_value: 50
Low-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:32<00:00,  8.08s/it]


Accuracy: 0.8619999999999999, +/- 0.008455767262643877
F1: 0.8625645131269563, +/- 0.008999008236859752
Precision: 0.8591565470992716, +/- 0.0068893272930334905
Recall: 0.874, +/- 0.011661903789690628
Auroc: 0.8619999999999999, +/- 0.008455767262643877
top_value: 50
Low-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [00:31<00:00,  7.91s/it]


Accuracy: 0.8160000000000001, +/- 0.006204836822995421
F1: 0.8194419508791231, +/- 0.0076726367913506855
Precision: 0.8042340026883572, +/- 0.0036376207160101004
Recall: 0.842, +/- 0.014628738838327776
Auroc: 0.8160000000000001, +/- 0.006204836822995421
top_value: 25
Low-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  6.17it/s]


Accuracy: 0.8639999999999999, +/- 0.005787918451395127
F1: 0.8640750486912232, +/- 0.006179497447713249
Precision: 0.8701733361925312, +/- 0.007917939243377213
Recall: 0.8639999999999999, +/- 0.007483314773547874
Auroc: 0.8639999999999999, +/- 0.0057879184513951265
top_value: 1
Low-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  6.11it/s]


Accuracy: 0.8610000000000001, +/- 0.012589678312014184
F1: 0.864858613059327, +/- 0.01202082282946016
Precision: 0.8500219961409892, +/- 0.010342425382819063
Recall: 0.884, +/- 0.013638181696985828
Auroc: 0.8610000000000001, +/- 0.012589678312014184
top_value: 1
Low-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  6.10it/s]


Accuracy: 0.9800000000000001, +/- 0.0
F1: 0.9803820244795853, +/- 5.639964592608179e-05
Precision: 0.9715670995670995, +/- 0.0002655906779251846
Recall: 0.99, +/- 0.0
Auroc: 0.9800000000000001, +/- 0.0
top_value: 1
Low-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  4.49it/s]


Accuracy: 0.89, +/- 0.002738612787525823
F1: 0.8881966131170538, +/- 0.0025390073470060994
Precision: 0.9071141486204078, +/- 0.00581004452843599
Recall: 0.876, +/- 0.0024494897427832165
Auroc: 0.89, +/- 0.002738612787525823
top_value: 0.01
Low-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:00<00:00,  4.49it/s]


Accuracy: 0.8629999999999999, +/- 0.006633249580710811
F1: 0.8657207805385021, +/- 0.0064234864883035495
Precision: 0.8540190232345839, +/- 0.006900847899873063
Recall: 0.884, +/- 0.006782329983125237
Auroc: 0.8629999999999999, +/- 0.00663324958071083
top_value: 5
Low-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:06<00:00,  1.70s/it]


Accuracy: 0.8550000000000001, +/- 0.0038729833462073987
F1: 0.8558974774097322, +/- 0.004300674721658778
Precision: 0.8578119783687693, +/- 0.006786580798220392
Recall: 0.8619999999999999, +/- 0.008000000000000002
Auroc: 0.8550000000000001, +/- 0.0038729833462073987
top_value: 100
Low-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


Accuracy: 0.8450000000000001, +/- 0.006519202405202654
F1: 0.8454836126571481, +/- 0.005922052408551657
Precision: 0.842619339624993, +/- 0.012016379758353346
Recall: 0.8540000000000001, +/- 0.006782329983125272
Auroc: 0.845, +/- 0.006519202405202628
top_value: 500
Low-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:08<00:00,  2.12s/it]


Accuracy: 0.978, +/- 0.0012247448713916173
F1: 0.9783587331258466, +/- 0.0013920575120741714
Precision: 0.9721765480895916, +/- 0.00046242098424746376
Recall: 0.9860000000000001, +/- 0.0024494897427831527
Auroc: 0.978, +/- 0.0012247448713916173
top_value: 100
Low-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.32s/it]


Accuracy: 0.8859999999999999, +/- 0.004301162633521326
F1: 0.8868662225051753, +/- 0.004462867216688891
Precision: 0.8863641465323386, +/- 0.004254557901996115
Recall: 0.8939999999999999, +/- 0.008124038404635967
Auroc: 0.8859999999999999, +/- 0.004301162633521326
top_value: 500
Low-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:12<00:00,  3.05s/it]


Accuracy: 0.865, +/- 0.004472135954999583
F1: 0.8639673769883043, +/- 0.0037988564765372556
Precision: 0.8752785713411638, +/- 0.008077162863184929
Recall: 0.8619999999999999, +/- 0.0037416573867739447
Auroc: 0.865, +/- 0.004472135954999558
top_value: 100
[[0.868, 0.86, 0.842, 0.8639999999999999, 0.8550000000000001], [0.877, 0.867, 0.8320000000000001, 0.8610000000000001, 0.8450000000000001], [0.9789999999999999, 0.977, 0.977, 0.9800000000000001, 0.978], [0.8880000000000001, 0.8880000000000001, 0.8619999999999999, 0.89, 0.8859999999999999], [0.8629999999999999, 0.8260000000000002, 0.8160000000000001, 0.8629999999999999, 0.865]]
[[0.0025495097567963944, 0.005700877125495719, 0.010074720839804983, 0.005787918451395127, 0.0038729833462073987], [0.005830951894845287, 0.00644204936336258, 0.010793516572461468, 0.012589678312014184, 0.006519202405202654], [0.0010000000000000009, 0.0012247448713916173, 0.00339116499156265, 0.0, 0.0012247448713916173], [0.0043588989435406605, 0.0081547532151500

In [17]:
#@title LazBF MedN

np.random.seed(1)
random.seed(1)
from sklearn.utils import check_random_state
random_state = check_random_state(1)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 500, 1)
y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazbf prediction
  print(f"Med-N, LazBF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_none[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"Med-N, LazBF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_pa[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"Med-N, LazBF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbf[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"Med-N, LazBF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"Med-N, LazBF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbcdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

Med-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.49it/s]


Accuracy: 0.884, +/- 0.0033466401061363346
F1: 0.8849959757565282, +/- 0.0032578484526496897
Precision: 0.8796273411379811, +/- 0.0030360490960744265
Recall: 0.8927999999999999, +/- 0.0040792156108742155
Auroc: 0.884, +/- 0.0033466401061363247
top_value: 0.1
Med-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.74it/s]


Accuracy: 0.8907999999999999, +/- 0.00313687742827163
F1: 0.8922697865484401, +/- 0.0035990666951387445
Precision: 0.8819978518271281, +/- 0.0018636179793203123
Recall: 0.9048, +/- 0.006374950980203711
Auroc: 0.8907999999999999, +/- 0.00313687742827163
top_value: 0.01
Med-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  3.38it/s]


Accuracy: 0.9551999999999999, +/- 0.0013564659966250401
F1: 0.9552642932258364, +/- 0.0013606774950423303
Precision: 0.9539362042810373, +/- 0.0016600017114114351
Recall: 0.9575999999999999, +/- 0.0009797958971132991
Auroc: 0.9551999999999999, +/- 0.0013564659966250284
top_value: 1
Med-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Accuracy: 0.8832000000000001, +/- 0.0036660605559646585
F1: 0.8842314132696523, +/- 0.0037864142111892238
Precision: 0.8771916823221353, +/- 0.0037690998318461793
Recall: 0.8928, +/- 0.005713142742834288
Auroc: 0.8832000000000001, +/- 0.0036660605559646585
top_value: 0.1
Med-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.32it/s]


Accuracy: 0.8876000000000002, +/- 0.002039607805437097
F1: 0.8893826998808503, +/- 0.0019864886321973004
Precision: 0.8760370808781313, +/- 0.0023594899406426157
Recall: 0.9040000000000001, +/- 0.003577708763999679
Auroc: 0.8876000000000002, +/- 0.002039607805437073
top_value: 0.01
Med-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:10<00:00,  2.56s/it]


Accuracy: 0.8535999999999999, +/- 0.0024819347291981904
F1: 0.8527863025309497, +/- 0.0029774204419236805
Precision: 0.8580160730247858, +/- 0.0034290992919895997
Recall: 0.8488000000000001, +/- 0.00674091981854109
Auroc: 0.8535999999999999, +/- 0.0024819347291981904
top_value: 50
Med-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.43s/it]


Accuracy: 0.8564, +/- 0.0023151673805580607
F1: 0.8575511710900322, +/- 0.0021446428441950407
Precision: 0.853738219726327, +/- 0.0037765851218080747
Recall: 0.8632, +/- 0.003440930106817041
Auroc: 0.8564, +/- 0.002315167380558034
top_value: 50
Med-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:07<00:00,  1.89s/it]


Accuracy: 0.9655999999999999, +/- 0.000748331477354789
F1: 0.9658796581352093, +/- 0.0007703525203696139
Precision: 0.9605133223709554, +/- 0.0007617147220489028
Recall: 0.9719999999999999, +/- 0.0012649110640673528
Auroc: 0.9655999999999999, +/- 0.000748331477354789
top_value: 50
Med-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:10<00:00,  2.60s/it]


Accuracy: 0.8824, +/- 0.0041665333311999364
F1: 0.8836652553571355, +/- 0.003964220637398365
Precision: 0.8774115226030503, +/- 0.005737905749924157
Recall: 0.8912000000000001, +/- 0.0026532998322843153
Auroc: 0.8824, +/- 0.0041665333311999364
top_value: 100
Med-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:09<00:00,  2.47s/it]


Accuracy: 0.8392, +/- 0.004498888751680801
F1: 0.837717158822532, +/- 0.0052434428276063195
Precision: 0.8448703431127026, +/- 0.0035815321383673086
Recall: 0.8328, +/- 0.007525955088890723
Auroc: 0.8392, +/- 0.0044988887516808
top_value: 100
Med-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:21<00:00, 20.35s/it]


Accuracy: 0.8619999999999999, +/- 0.002756809750418063
F1: 0.8620716135123182, +/- 0.002817428671271872
Precision: 0.8643211879829333, +/- 0.003086436627236086
Recall: 0.8624, +/- 0.0032496153618543815
Auroc: 0.8619999999999999, +/- 0.002756809750418063
top_value: 100
Med-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.06s/it]


Accuracy: 0.8628, +/- 0.005535341001239233
F1: 0.863584603673968, +/- 0.00565389745644853
Precision: 0.8576024716760466, +/- 0.006276923189741422
Recall: 0.8728000000000001, +/- 0.008333066662399824
Auroc: 0.8628, +/- 0.005535341001239213
top_value: 100
Med-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.19s/it]


Accuracy: 0.9576, +/- 0.0016
F1: 0.9578300810606161, +/- 0.0015051828253753454
Precision: 0.9540225985971349, +/- 0.003406821383652608
Recall: 0.9623999999999999, +/- 0.0027129319932501193
Auroc: 0.9575999999999999, +/- 0.0015999999999999983
top_value: 100
Med-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:20<00:00, 20.07s/it]


Accuracy: 0.8695999999999999, +/- 0.0036551333764994545
F1: 0.8702180076157735, +/- 0.0035860442674137526
Precision: 0.8672723920959896, +/- 0.004099073899139173
Recall: 0.876, +/- 0.004560701700396556
Auroc: 0.8695999999999999, +/- 0.0036551333764994545
top_value: 100
Med-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [01:28<00:00, 22.22s/it]


Accuracy: 0.8448, +/- 0.003136877428271627
F1: 0.8437221249914284, +/- 0.0028638177760561276
Precision: 0.8495370070787216, +/- 0.005425261209900335
Recall: 0.8400000000000001, +/- 0.005366563145999504
Auroc: 0.8447999999999999, +/- 0.0031368774282716175
top_value: 50
Med-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


Accuracy: 0.8752000000000001, +/- 0.003136877428271616
F1: 0.8759507359424388, +/- 0.003188755163288221
Precision: 0.8708451571879587, +/- 0.0019240684989765373
Recall: 0.8831999999999999, +/- 0.004799999999999999
Auroc: 0.8752000000000001, +/- 0.0031368774282716
top_value: 1
Med-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.72it/s]


Accuracy: 0.8868, +/- 0.004029888335922005
F1: 0.8898033041552802, +/- 0.004006274426115725
Precision: 0.8691435985188514, +/- 0.004077278382578509
Recall: 0.9136, +/- 0.0065238025721200245
Auroc: 0.8868, +/- 0.004029888335921991
top_value: 1
Med-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:01<00:00,  2.57it/s]


Accuracy: 0.9648, +/- 0.0004898979485566133
F1: 0.965166943467619, +/- 0.0004994511329496562
Precision: 0.9571751017968871, +/- 0.00014568687223160808
Recall: 0.9735999999999999, +/- 0.0009797958971132993
Auroc: 0.9648, +/- 0.0004898979485566133
top_value: 1
Med-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Accuracy: 0.8964000000000001, +/- 0.0019390719429665175
F1: 0.8977168070770597, +/- 0.0018693398531100325
Precision: 0.8854950720133532, +/- 0.0029073463751919677
Recall: 0.9112, +/- 0.0026532998322843153
Auroc: 0.8964000000000001, +/- 0.0019390719429665515
top_value: 5
Med-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:02<00:00,  1.83it/s]


Accuracy: 0.8608, +/- 0.002059126028197386
F1: 0.8618050184954628, +/- 0.0022474766091611555
Precision: 0.8585536148701858, +/- 0.0018841601190155248
Recall: 0.868, +/- 0.004381780460041333
Auroc: 0.8608, +/- 0.0020591260281973933
top_value: 5
Med-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:21<00:00,  5.45s/it]


Accuracy: 0.8756, +/- 0.004019950248448349
F1: 0.8777579060859644, +/- 0.003931234747984012
Precision: 0.8650736649240842, +/- 0.005029022193642942
Recall: 0.8928, +/- 0.004270831300812503
Auroc: 0.8756, +/- 0.004019950248448327
top_value: 50
Med-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.83s/it]


Accuracy: 0.8968, +/- 0.003979949748426483
F1: 0.8972898347410837, +/- 0.004236228095260224
Precision: 0.8918302914173657, +/- 0.003648754558135808
Recall: 0.9048, +/- 0.005425863986500227
Auroc: 0.8968, +/- 0.003979949748426481
top_value: 50
Med-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.86s/it]


Accuracy: 0.9564, +/- 0.0017204650534085254
F1: 0.9565348441945604, +/- 0.0017881323893264565
Precision: 0.9530534438761118, +/- 0.0013158536232715213
Recall: 0.9608000000000001, +/- 0.002332380757938145
Auroc: 0.9564, +/- 0.0017204650534085202
top_value: 50
Med-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:20<00:00,  5.17s/it]


Accuracy: 0.89, +/- 0.001549193338482968
F1: 0.8915861807690056, +/- 0.0013091236650251226
Precision: 0.8797544761937022, +/- 0.002715647043208811
Recall: 0.9056, +/- 0.003249615361854403
Auroc: 0.89, +/- 0.001549193338482968
top_value: 200
Med-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:32<00:00,  8.25s/it]


Accuracy: 0.8879999999999999, +/- 0.0014142135623731041
F1: 0.8898682366094087, +/- 0.0015469879169736978
Precision: 0.8768251141137651, +/- 0.0015809451750058356
Recall: 0.9047999999999998, +/- 0.0026532998322842923
Auroc: 0.8879999999999999, +/- 0.0014142135623731041
top_value: 500
[[0.884, 0.8535999999999999, 0.8619999999999999, 0.8752000000000001, 0.8756], [0.8907999999999999, 0.8564, 0.8628, 0.8868, 0.8968], [0.9551999999999999, 0.9655999999999999, 0.9576, 0.9648, 0.9564], [0.8832000000000001, 0.8824, 0.8695999999999999, 0.8964000000000001, 0.89], [0.8876000000000002, 0.8392, 0.8448, 0.8608, 0.8879999999999999]]
[[0.0033466401061363346, 0.0024819347291981904, 0.002756809750418063, 0.003136877428271616, 0.004019950248448349], [0.00313687742827163, 0.0023151673805580607, 0.005535341001239233, 0.004029888335922005, 0.003979949748426483], [0.0013564659966250401, 0.000748331477354789, 0.0016, 0.0004898979485566133, 0.0017204650534085254], [0.0036660605559646585, 0.0041665333311999364, 

In [18]:
#@title LazBF HighN

np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 1000, 42)
y_values = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]
y_errors = [
    [0, 0, 0, 0, 0], # vanilla-esm
    [0, 0, 0, 0, 0], # peptide-esm
    [0, 0, 0, 0, 0], # LazBF-esm
    [0, 0, 0, 0, 0], # LazDEF-esm
    [0, 0, 0, 0, 0], # LazBCDEF-esm
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazbf prediction
  print(f"High-N, LazBF-task, Vanilla-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_none[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"High-N, LazBF-task, Peptide-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_pa[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"High-N, LazBF-task, LazBF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbf[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  print(f"High-N, LazBF-task, LazDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[3][i] = m
  y_errors[3][i] = e

  print(f"High-N, LazBF-task, LazBCDEF-ESM-Embeddings {model}")
  m, e = optimize(lazbf_mlm_lazbcdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[4][i] = m
  y_errors[4][i] = e

  i += 1
print(y_values)
print(y_errors)

High-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Accuracy: 0.9122, +/- 0.002853068523537426
F1: 0.9123142770697037, +/- 0.0030689397755547985
Precision: 0.9098615319591129, +/- 0.0014054768242513641
Recall: 0.9155999999999999, +/- 0.005564171097297396
Auroc: 0.9122, +/- 0.00285306852353744
top_value: 0.1
High-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.75it/s]


Accuracy: 0.9064, +/- 0.0019899748742132667
F1: 0.9066489715071766, +/- 0.0018201095750795325
Precision: 0.9061593685051095, +/- 0.003440587782922836
Recall: 0.908, +/- 0.0016733200530681658
Auroc: 0.9064, +/- 0.0019899748742132667
top_value: 1
High-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:01<00:00,  2.27it/s]


Accuracy: 0.9654, +/- 0.002111871208194274
F1: 0.9653943623920351, +/- 0.002126087084726765
Precision: 0.9673850338354286, +/- 0.0021987795893423545
Recall: 0.9639999999999999, +/- 0.002097617696340326
Auroc: 0.9654, +/- 0.002111871208194272
top_value: 0.1
High-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.63it/s]


Accuracy: 0.9074, +/- 0.001208304597359473
F1: 0.9077900121300738, +/- 0.0013011435442604417
Precision: 0.9037055911897139, +/- 0.0018864415893699762
Recall: 0.9124000000000001, +/- 0.0025612496949731574
Auroc: 0.9074, +/- 0.001208304597359473
top_value: 0.1
High-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.37it/s]


Accuracy: 0.9044000000000001, +/- 0.002561249694973126
F1: 0.9041666340437067, +/- 0.0028121703412102125
Precision: 0.9075929459289321, +/- 0.0006484531951696516
Recall: 0.9016000000000002, +/- 0.00563560112144215
Auroc: 0.9044000000000001, +/- 0.0025612496949731383
top_value: 0.1
High-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:22<00:00,  5.58s/it]


Accuracy: 0.8734, +/- 0.0013266499161422004
F1: 0.8715873520673956, +/- 0.0013940257692083957
Precision: 0.8843511555315953, +/- 0.0021724351921858383
Recall: 0.8604, +/- 0.003187475490101829
Auroc: 0.8734, +/- 0.0013266499161422004
top_value: 50
High-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:22<00:00,  5.51s/it]


Accuracy: 0.8797999999999998, +/- 0.0023108440016582576
F1: 0.878879985106012, +/- 0.002331126213282192
Precision: 0.8859154166841204, +/- 0.0021260419272556446
Recall: 0.8732000000000001, +/- 0.0028705400188814515
Auroc: 0.8797999999999998, +/- 0.0023108440016582576
top_value: 50
High-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:14<00:00,  3.52s/it]


Accuracy: 0.9704, +/- 0.0011224972160321864
F1: 0.9705548691794791, +/- 0.001110512221227998
Precision: 0.9669705029705599, +/- 0.001972955809201192
Recall: 0.9743999999999999, +/- 0.0007483314773548334
Auroc: 0.9704, +/- 0.0011224972160321864
top_value: 25
High-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:23<00:00,  5.86s/it]


Accuracy: 0.8912000000000001, +/- 0.0019595917942265683
F1: 0.8900609345691572, +/- 0.002021005152150445
Precision: 0.9002545279900097, +/- 0.0018040236552195506
Recall: 0.8808, +/- 0.0032619012860600257
Auroc: 0.8912000000000001, +/- 0.0019595917942265523
top_value: 100
High-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:22<00:00,  5.73s/it]


Accuracy: 0.8468, +/- 0.003455430508634187
F1: 0.8470402486898566, +/- 0.0030283207946557
Precision: 0.847100038358237, +/- 0.004658269961399754
Recall: 0.8479999999999999, +/- 0.002190890230020651
Auroc: 0.8468, +/- 0.0034554305086341637
top_value: 25
High-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:51<00:00, 42.99s/it]


Accuracy: 0.8699999999999999, +/- 0.0036055512754640186
F1: 0.8706805515385441, +/- 0.0037756637454408955
Precision: 0.8663513658758222, +/- 0.0033184948797659332
Recall: 0.8756, +/- 0.005706137047074845
Auroc: 0.8699999999999999, +/- 0.0036055512754639657
top_value: 100
High-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:51<00:00, 42.96s/it]


Accuracy: 0.875, +/- 0.004110960958218865
F1: 0.8750983252040235, +/- 0.004348290282427606
Precision: 0.8756736166735, +/- 0.0033389742881956286
Recall: 0.876, +/- 0.005830951894845278
Auroc: 0.875, +/- 0.004110960958218857
top_value: 50
High-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:46<00:00, 41.52s/it]


Accuracy: 0.9676, +/- 0.0005099019513592789
F1: 0.967612821707011, +/- 0.000497621121328262
Precision: 0.9688085595690387, +/- 0.0010856326001536189
Recall: 0.9667999999999999, +/- 0.0007999999999999813
Auroc: 0.9676, +/- 0.0005099019513592789
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:49<00:00, 42.32s/it]


Accuracy: 0.8866000000000002, +/- 0.0024207436873820622
F1: 0.8864818693265741, +/- 0.0026470540396581177
Precision: 0.8871913687454509, +/- 0.0021031916058054556
Recall: 0.8868, +/- 0.005499090833947031
Auroc: 0.8866000000000002, +/- 0.0024207436873820622
top_value: 100
High-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:48<00:00, 42.06s/it]


Accuracy: 0.8736, +/- 0.004081666326391734
F1: 0.8736057440108895, +/- 0.004153315372891776
Precision: 0.8735831505734302, +/- 0.005110964268921264
Recall: 0.8744, +/- 0.004707440918375928
Auroc: 0.8736, +/- 0.00408166632639172
top_value: 100
High-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.06s/it]


Accuracy: 0.893, +/- 0.002345207879911743
F1: 0.8927784503453324, +/- 0.0025800404986656834
Precision: 0.894810986793091, +/- 0.002067722800367217
Recall: 0.8916000000000001, +/- 0.004749736834815181
Auroc: 0.893, +/- 0.0023452078799117
top_value: 5
High-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.00s/it]


Accuracy: 0.9011999999999999, +/- 0.0012409673645990794
F1: 0.9012195483339172, +/- 0.0012826947063947697
Precision: 0.9006970607892677, +/- 0.00112180749939622
Recall: 0.9024000000000001, +/- 0.002315167380558035
Auroc: 0.9011999999999999, +/- 0.0012409673645990794
top_value: 1
High-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:04<00:00,  1.20s/it]


Accuracy: 0.9709999999999999, +/- 0.0012649110640673528
F1: 0.9712693097513277, +/- 0.0011745418125654588
Precision: 0.9666651624726106, +/- 0.002037320238145024
Recall: 0.9763999999999999, +/- 0.0004000000000000225
Auroc: 0.9709999999999999, +/- 0.0012649110640673615
top_value: 0.1
High-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:07<00:00,  1.75s/it]


Accuracy: 0.9182, +/- 0.0010198039027185426
F1: 0.9181575725913991, +/- 0.0009527831236561507
Precision: 0.9187239903021064, +/- 0.002542693036240734
Recall: 0.9179999999999999, +/- 0.0017888543819998394
Auroc: 0.9182, +/- 0.0010198039027185337
top_value: 5
High-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:06<00:00,  1.65s/it]


Accuracy: 0.8984, +/- 0.001435270009440735
F1: 0.8985136230357575, +/- 0.0014580604477654273
Precision: 0.8981537781529312, +/- 0.002368271839302182
Recall: 0.8995999999999998, +/- 0.0033105890714493786
Auroc: 0.8984, +/- 0.0014352700094407333
top_value: 5
High-N, LazBF-task, Vanilla-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:44<00:00, 11.18s/it]


Accuracy: 0.9048, +/- 0.0025573423705088838
F1: 0.904615221167837, +/- 0.002709385724785166
Precision: 0.9062688116799233, +/- 0.0019339033281525143
Recall: 0.9036, +/- 0.0040693979898751625
Auroc: 0.9047999999999998, +/- 0.0025573423705088877
top_value: 500
High-N, LazBF-task, Peptide-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:43<00:00, 10.87s/it]


Accuracy: 0.9072000000000001, +/- 0.0013564659966250564
F1: 0.907159714567069, +/- 0.001525085780441562
Precision: 0.9087478937689699, +/- 0.001823973319454379
Recall: 0.9064, +/- 0.0038157568056677977
Auroc: 0.9072000000000001, +/- 0.001356465996625022
top_value: 200
High-N, LazBF-task, LazBF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:38<00:00,  9.73s/it]


Accuracy: 0.9663999999999999, +/- 0.0010295630140987257
F1: 0.9660908517066185, +/- 0.0010402420988185354
Precision: 0.9743469625385739, +/- 0.0010189198441896367
Recall: 0.9583999999999999, +/- 0.0015999999999999944
Auroc: 0.9663999999999999, +/- 0.0010295630140987062
top_value: 50
High-N, LazBF-task, LazDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:36<00:00,  9.24s/it]


Accuracy: 0.9168000000000001, +/- 0.0024979991993593683
F1: 0.9172148394930002, +/- 0.002539230169658574
Precision: 0.9134149008242508, +/- 0.0023217390870711244
Recall: 0.9212, +/- 0.003382306905057566
Auroc: 0.9168000000000001, +/- 0.0024979991993593644
top_value: 100
High-N, LazBF-task, LazBCDEF-ESM-Embeddings <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [01:02<00:00, 15.62s/it]


Accuracy: 0.9038, +/- 0.0010198039027185535
F1: 0.9041152231421051, +/- 0.0010469190100800125
Precision: 0.9020250096067233, +/- 0.0022340621568664423
Recall: 0.9067999999999999, +/- 0.0030724582991474587
Auroc: 0.9038, +/- 0.0010198039027185513
top_value: 50
[[0.9122, 0.8734, 0.8699999999999999, 0.893, 0.9048], [0.9064, 0.8797999999999998, 0.875, 0.9011999999999999, 0.9072000000000001], [0.9654, 0.9704, 0.9676, 0.9709999999999999, 0.9663999999999999], [0.9074, 0.8912000000000001, 0.8866000000000002, 0.9182, 0.9168000000000001], [0.9044000000000001, 0.8468, 0.8736, 0.8984, 0.9038]]
[[0.002853068523537426, 0.0013266499161422004, 0.0036055512754640186, 0.002345207879911743, 0.0025573423705088838], [0.0019899748742132667, 0.0023108440016582576, 0.004110960958218865, 0.0012409673645990794, 0.0013564659966250564], [0.002111871208194274, 0.0011224972160321864, 0.0005099019513592789, 0.0012649110640673528, 0.0010295630140987257], [0.001208304597359473, 0.0019595917942265683, 0.002420743687382

---

In [13]:
#@title Comparing differnt learning rates on LazDEF task

np.random.seed(42)
random.seed(42)
from sklearn.utils import check_random_state
random_state = check_random_state(42)

idxs = balanced_sample_np(lazbf_mlm_none, LazBF_sample_labels, 1000, 42)
y_values = [
    [0, 0, 0, 0, 0], # lr=1e-4
    [0, 0, 0, 0, 0], # lr=1e-5
    [0, 0, 0, 0, 0], # lr=1e-6
]
y_errors = [
    [0, 0, 0, 0, 0], # lr=1e-4
    [0, 0, 0, 0, 0], # lr=1e-5
    [0, 0, 0, 0, 0], # lr=1e-6
]

i = 0
for model, param, grid in zip(model_list, param_list, value_list):

  # Lazbf prediction
  print(f"High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 {model}")
  m, e = optimize(lazbf_mlm_lazdefLR04[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[0][i] = m
  y_errors[0][i] = e

  print(f"High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 {model}")
  m, e = optimize(lazbf_mlm_lazdefLR05[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[1][i] = m
  y_errors[1][i] = e

  print(f"High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 {model}")
  m, e = optimize(lazbf_mlm_lazdef[idxs], LazBF_sample_labels[idxs], model, param, grid)
  y_values[2][i] = m
  y_errors[2][i] = e

  i += 1
print(y_values)
print(y_errors)

High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


Accuracy: 0.9065999999999999, +/- 0.0029765752132274704
F1: 0.9068797030368939, +/- 0.0031288426420821615
Precision: 0.9019473057258909, +/- 0.0032247339505695146
Recall: 0.9128000000000001, +/- 0.003878143885933043
Auroc: 0.9065999999999999, +/- 0.0029765752132274543
top_value: 5
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:02<00:00,  1.67it/s]


Accuracy: 0.9109999999999999, +/- 0.0014142135623730885
F1: 0.9112659027815552, +/- 0.0014564150811609447
Precision: 0.9106168058372741, +/- 0.0019769143294044098
Recall: 0.9132, +/- 0.0026532998322843287
Auroc: 0.9110000000000001, +/- 0.0014142135623730961
top_value: 1
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 <class 'sklearn.linear_model._logistic.LogisticRegression'>


100%|██████████| 4/4 [00:04<00:00,  1.12s/it]


Accuracy: 0.9092, +/- 0.001319090595827293
F1: 0.9093837722653337, +/- 0.0014230978861112828
Precision: 0.9100973306204582, +/- 0.0022295460775350827
Recall: 0.9099999999999999, +/- 0.003346640106136328
Auroc: 0.9092, +/- 0.0013190905958273334
top_value: 0.01
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:24<00:00,  6.16s/it]


Accuracy: 0.844, +/- 0.003209361307176273
F1: 0.8440019434536914, +/- 0.0032935772445183
Precision: 0.8451479956588079, +/- 0.003179130612015777
Recall: 0.8440000000000001, +/- 0.004000000000000006
Auroc: 0.844, +/- 0.0032093613071762545
top_value: 50
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:22<00:00,  5.71s/it]


Accuracy: 0.8710000000000001, +/- 0.0014832396974191527
F1: 0.8705229547398847, +/- 0.0012450623716875927
Precision: 0.8757613729686403, +/- 0.0030857448741636157
Recall: 0.8664, +/- 0.0011661903789690628
Auroc: 0.8710000000000001, +/- 0.0014832396974190963
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 <class 'sklearn.ensemble._forest.RandomForestClassifier'>


100%|██████████| 4/4 [00:21<00:00,  5.49s/it]


Accuracy: 0.8942, +/- 0.0017720045146669308
F1: 0.8927739424138732, +/- 0.001935604653117629
Precision: 0.9042190287349421, +/- 0.002440862083893026
Recall: 0.8824, +/- 0.0042142615011410995
Auroc: 0.8942, +/- 0.001772004514666919
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:48<00:00, 42.05s/it]


Accuracy: 0.8558, +/- 0.0023323807579380936
F1: 0.8556531371339556, +/- 0.002643715180166496
Precision: 0.8572086693861092, +/- 0.003972217884944395
Recall: 0.8555999999999999, +/- 0.006112282715974429
Auroc: 0.8558, +/- 0.0023323807579381226
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:49<00:00, 42.33s/it]


Accuracy: 0.8802, +/- 0.004091454509095786
F1: 0.8812671830200637, +/- 0.0040105741232800045
Precision: 0.8743019914167105, +/- 0.0036986242215721163
Recall: 0.8888, +/- 0.004409081537009723
Auroc: 0.8802, +/- 0.004091454509095786
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>


100%|██████████| 4/4 [02:49<00:00, 42.29s/it]


Accuracy: 0.8888000000000001, +/- 0.002782085548648707
F1: 0.8894756156170279, +/- 0.0028555896610663644
Precision: 0.8863421186141295, +/- 0.0027020705575364528
Recall: 0.8939999999999999, +/- 0.004335896677735743
Auroc: 0.8888, +/- 0.0027820855486487043
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:08<00:00,  2.22s/it]


Accuracy: 0.8872, +/- 0.0011575836902790245
F1: 0.8872773220463396, +/- 0.0012155296417921736
Precision: 0.8856353118260932, +/- 0.0015992406166289565
Recall: 0.89, +/- 0.0029664793948382903
Auroc: 0.8872, +/- 0.0011575836902790323
top_value: 5
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:07<00:00,  1.88s/it]


Accuracy: 0.8974000000000002, +/- 0.0020149441679610017
F1: 0.8984540822446778, +/- 0.0020442112931866306
Precision: 0.8893912158147602, +/- 0.0028664789471534407
Recall: 0.9088, +/- 0.0033823069050575518
Auroc: 0.8974, +/- 0.0020149441679610095
top_value: 5
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 <class 'sklearn.svm._classes.SVC'>


100%|██████████| 4/4 [00:07<00:00,  1.80s/it]


Accuracy: 0.9164, +/- 0.0038026306683663306
F1: 0.916456073906204, +/- 0.004018944908985036
Precision: 0.916098024812982, +/- 0.0025227473297119835
Recall: 0.9179999999999999, +/- 0.006164414002968956
Auroc: 0.9164, +/- 0.0038026306683663236
top_value: 5
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-4 <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:52<00:00, 13.23s/it]


Accuracy: 0.8974, +/- 0.001939071942966517
F1: 0.8982368600109268, +/- 0.0019017257632056685
Precision: 0.8916082645181372, +/- 0.0028671193010212487
Recall: 0.9056000000000001, +/- 0.002785677655436822
Auroc: 0.8973999999999999, +/- 0.0019390719429665082
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-5 <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:52<00:00, 13.09s/it]


Accuracy: 0.9078000000000002, +/- 0.000800000000000016
F1: 0.9085550254932281, +/- 0.00075957322345353
Precision: 0.9017223779339242, +/- 0.0014887848094103288
Recall: 0.9164, +/- 0.0013266499161421208
Auroc: 0.9078000000000002, +/- 0.000800000000000016
top_value: 100
High-N, LazBF-task, LazDEF-ESM-Embeddings LR=1e-6 <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>


100%|██████████| 4/4 [00:41<00:00, 10.35s/it]


Accuracy: 0.914, +/- 0.0031144823004794846
F1: 0.9145204274222815, +/- 0.0029779064149716957
Precision: 0.9096999381778458, +/- 0.005356944345437027
Recall: 0.9200000000000002, +/- 0.0016733200530681528
Auroc: 0.914, +/- 0.003114482300479499
top_value: 50
[[0.9065999999999999, 0.844, 0.8558, 0.8872, 0.8974], [0.9109999999999999, 0.8710000000000001, 0.8802, 0.8974000000000002, 0.9078000000000002], [0.9092, 0.8942, 0.8888000000000001, 0.9164, 0.914]]
[[0.0029765752132274704, 0.003209361307176273, 0.0023323807579380936, 0.0011575836902790245, 0.001939071942966517], [0.0014142135623730885, 0.0014832396974191527, 0.004091454509095786, 0.0020149441679610017, 0.000800000000000016], [0.001319090595827293, 0.0017720045146669308, 0.002782085548648707, 0.0038026306683663306, 0.0031144823004794846]]


In [25]:
#@title Zero-shot predictions with downstream models

# Lown N - LazBF -> LazDEF
def zero_shot_for_downstream_model(Xtr, ytr, X_test, y_test, train_size, MODEL):
  idxs = balanced_sample_np(Xtr, ytr, train_size, 1)
  X_train = Xtr[idxs]
  y_train = ytr[idxs]
  scaler=StandardScaler()
  X_train=scaler.fit_transform(X_train)
  X_test=scaler.transform(X_test)

  MODEL.fit(X_train, y_train)
  y_pred = MODEL.predict(X_test)
  print(accuracy_score(y_test, y_pred))

zero_shot_for_downstream_model(lazbf_mlm_none, LazBF_sample_labels, lazdef_mlm_none, LazDEF_sample_labels, 200, SVC(C=0.1))
zero_shot_for_downstream_model(lazbf_mlm_none, LazBF_sample_labels, lazdef_mlm_none, LazDEF_sample_labels, 500, RandomForestClassifier(n_estimators=50))
zero_shot_for_downstream_model(lazbf_mlm_none, LazBF_sample_labels, lazdef_mlm_none, LazDEF_sample_labels, 1000, SVC(C=1))

zero_shot_for_downstream_model(lazdef_mlm_none, LazDEF_sample_labels, lazbf_mlm_none, LazBF_sample_labels, 200, SVC(C=0.1))
zero_shot_for_downstream_model(lazdef_mlm_none, LazDEF_sample_labels, lazbf_mlm_none, LazBF_sample_labels, 500, SVC(C=0.1))
zero_shot_for_downstream_model(lazdef_mlm_none, LazDEF_sample_labels, lazbf_mlm_none, LazBF_sample_labels, 1000, LogisticRegression(C=1))

0.54218
0.58338
0.54668
0.70158
0.7214
0.70548


In [None]:
!pip install transformers[torch] evaluate datasets
!pip install -U kaleido
!pip install captum