In [65]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))
from sfm.data.prot_data.dataset import DownstreamLMDBDataset
from sfm.data.prot_data.vocalubary import Alphabet
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import lmdb
import scipy
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.multioutput import MultiOutputClassifier
# from sklearnex import patch_sklearn
# patch_sklearn()
# from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor

In [2]:
class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

from itertools import product

class ProteinSequenceFingerprint:
    idx_to_tok = {v: k for k, v in Alphabet().tok_to_idx.items()}

    def reverse2str(self, tokens):
        aaseq = []
        for i in tokens:
            if i in [self.vocab.unk_idx, self.vocab.padding_idx, self.vocab.cls_idx, self.vocab.mask_idx, self.vocab.eos_idx,]:
                continue
            aaseq.append(self.idx_to_tok[i])
        return "".join(aaseq)

class ProteinkmerHistogram(ProteinSequenceFingerprint):
    def __init__(self, vocab, k):
        self.vocab = vocab
        self.k = k
        # only upper case letters, 25 tokens in total
        self.standard_toks = [i for i in vocab.standard_toks if i.isupper()]
        self.kmer2idx = {"".join(i): idx for idx, i in enumerate(product(self.standard_toks, repeat=k))}

    def __call__(self, tokens):
        seq = self.reverse2str(tokens)
        kmer_count = np.zeros(len(self.kmer2idx))
        for i in range(len(seq) - self.k + 1):
            kmer = seq[i:i+self.k]
            if kmer in self.kmer2idx:
                kmer_count[self.kmer2idx[kmer]] += 1
        return kmer_count

[[32m2024-01-05 04:28:59.426[0m][[36mINFO[0m]: Alphabet: {'<cls>': 0, '<pad>': 1, '<eos>': 2, '<unk>': 3, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16, 'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 24, 'B': 25, 'U': 26, 'Z': 27, 'O': 28, '.': 29, '-': 30, '<mask>': 31}


In [66]:
args = Namespace()
args.max_length = 2048
args.data_basepath = "/mnta/yaosen/data/bfm_benchmark"
args.task_name = "EnzymeCommission"
dataset_dict = DownstreamLMDBDataset.load_dataset(args)
trainset = dataset_dict["train"]
valset = dataset_dict["valid"]
# others are test sets
testset_dict = {
    k: v for k, v in dataset_dict.items() if k not in ["train", "valid"]
}

[[32m2024-01-05 05:30:47.452[0m][[36mINFO[0m]: Load EnzymeCommission train dataset from /mnta/yaosen/data/bfm_benchmark/EnzymeCommission/EnzymeCommission_train.lmdb
[[32m2024-01-05 05:30:47.453[0m][[36mINFO[0m]: Set default args in DownstreamLMDBDataset
[[32m2024-01-05 05:30:47.453[0m][[36mINFO[0m]: Alphabet: {'<cls>': 0, '<pad>': 1, '<eos>': 2, '<unk>': 3, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16, 'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 24, 'B': 25, 'U': 26, 'Z': 27, 'O': 28, '.': 29, '-': 30, '<mask>': 31}
[[32m2024-01-05 05:30:47.521[0m][[36mINFO[0m]: Load EnzymeCommission valid dataset from /mnta/yaosen/data/bfm_benchmark/EnzymeCommission/EnzymeCommission_valid.lmdb
[[32m2024-01-05 05:30:47.521[0m][[36mINFO[0m]: Set default args in DownstreamLMDBDataset
[[32m2024-01-05 05:30:47.522[0m][[36mINFO[0m]: Alphabet: {'<cls>': 0, '<pad>': 1, '<eos>': 2, '<unk>': 3, 'L': 

In [79]:
def load_seq_target(dset):
    seq, Y = [], []
    for item in tqdm(dset):
        seq.append(item["aa"])
        if DownstreamLMDBDataset.TASKINFO[dset.task_name]["type"] == "multi_classification":
            n_class = len(DownstreamLMDBDataset.TASKINFO[dset.task_name]["classes"])
            y = np.zeros(n_class)
            y[item["target"].squeeze()] = 1
            Y.append(y)
        else:
            Y.append(item["target"])
    seq, Y = seq, np.array(Y).squeeze()
    return seq, Y

seq_train, Y_train = load_seq_target(trainset)
seq_val, Y_val = load_seq_target(valset)
seq_test, Y_test = load_seq_target(testset_dict["test"])

100%|██████████| 15550/15550 [00:14<00:00, 1094.32it/s]
100%|██████████| 1729/1729 [00:01<00:00, 1036.57it/s]
100%|██████████| 1919/1919 [00:01<00:00, 1143.23it/s]


In [81]:
fp = ProteinkmerHistogram(trainset.vocab, 3)
X_train = np.array(Parallel(n_jobs=16)(delayed(fp)(i) for i in tqdm(seq_train)))
X_val = np.array(Parallel(n_jobs=16)(delayed(fp)(i) for i in tqdm(seq_val)))
X_test = np.array(Parallel(n_jobs=16)(delayed(fp)(i) for i in tqdm(seq_test)))

  0%|          | 16/15550 [00:03<54:14,  4.77it/s]

[2024-01-05 05:33:39,904] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  1%|          | 96/15550 [00:06<08:03, 31.95it/s]

[2024-01-05 05:33:42,620] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,622] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,627] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,652] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,655] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,677] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,678] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,690] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-01-05 05:33:42,704] [INFO] [real_a

  1%|          | 192/15550 [00:06<02:29, 102.94it/s]

[2024-01-05 05:33:42,932] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  2%|▏         | 384/15550 [00:06<00:52, 289.57it/s]

[2024-01-05 05:33:43,191] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


100%|██████████| 15550/15550 [00:16<00:00, 948.34it/s] 
100%|██████████| 1729/1729 [00:01<00:00, 1607.10it/s]
100%|██████████| 1919/1919 [00:00<00:00, 2519.17it/s]


In [82]:
params = {'device': 'cuda', 'seed': 13, 'verbosity': 2, }# 'objective': 'binary:logistic'}
if DownstreamLMDBDataset.TASKINFO[args.task_name]['type'] in ['classification', 'binary']:
    print("classification")
    model = XGBClassifier(**params)
elif DownstreamLMDBDataset.TASKINFO[args.task_name]['type'] == 'regression':
    print("regression")
    model = XGBRegressor(**params)
elif DownstreamLMDBDataset.TASKINFO[args.task_name]['type'] == 'multi_classification':
    print("multilabel")
    # n_class = len(DownstreamLMDBDataset.TASKINFO[trainset.task_name]["classes"])
    xgb_estimator = XGBClassifier(objective='binary:logistic', **params)
    # create MultiOutputClassifier instance with XGBoost model inside
    model = MultiOutputClassifier(xgb_estimator)

multilabel


In [83]:
model.fit(X_train, Y_train, )

In [35]:
scipy.stats.spearmanr(model.predict(X_val), Y_val)

SignificanceResult(statistic=0.6817600413240273, pvalue=2.490754571123554e-72)

In [36]:
scipy.stats.spearmanr(model.predict(X_test), Y_test)

SignificanceResult(statistic=0.7040444128032343, pvalue=4.981385753838842e-79)

In [86]:
f1_max(torch.from_numpy(model.predict(X_test)).float(), torch.from_numpy(Y_test))

tensor(nan, dtype=torch.float64)

In [59]:
(model.predict(X_val)== Y_val).mean()

0.5307719672714336

In [60]:
(model.predict(X_test)== Y_test).mean()

0.4579877389109268

In [8]:
np.square(model.predict(X_val) - Y_val).mean()

NameError: name 'model' is not defined

In [61]:
import xgboost as xgb
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

# create sample dataset
X, y = make_multilabel_classification(n_samples=3000, n_features=45, n_classes=20, n_labels=1,
                                      allow_unlabeled=False, random_state=42)


In [62]:
X.shape

(3000, 45)

In [64]:
y[0]

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [85]:
import torch
def f1_max(pred, target):
    """
    F1 score with the optimal threshold.

    This function first enumerates all possible thresholds for deciding positive and negative
    samples, and then pick the threshold with the maximal F1 score.

    Parameters:
        pred (Tensor): predictions of shape :math:`(B, N)`
        target (Tensor): binary targets of shape :math:`(B, N)`
    """
    order = pred.argsort(descending=True, dim=1)
    target = target.gather(1, order)
    precision = target.cumsum(1) / torch.ones_like(target).cumsum(1)
    recall = target.cumsum(1) / (target.sum(1, keepdim=True) + 1e-10)
    is_start = torch.zeros_like(target).bool()
    is_start[:, 0] = 1
    is_start = torch.scatter(is_start, 1, order, is_start)

    all_order = pred.flatten().argsort(descending=True)
    order = (
        order
        + torch.arange(order.shape[0], device=order.device).unsqueeze(1)
        * order.shape[1]
    )
    order = order.flatten()
    inv_order = torch.zeros_like(order)
    inv_order[order] = torch.arange(order.shape[0], device=order.device)
    is_start = is_start.flatten()[all_order]
    all_order = inv_order[all_order]
    precision = precision.flatten()
    recall = recall.flatten()
    all_precision = precision[all_order] - torch.where(
        is_start, torch.zeros_like(precision), precision[all_order - 1]
    )
    all_precision = all_precision.cumsum(0) / is_start.cumsum(0)
    all_recall = recall[all_order] - torch.where(
        is_start, torch.zeros_like(recall), recall[all_order - 1]
    )
    all_recall = all_recall.cumsum(0) / pred.shape[0]
    all_f1 = 2 * all_precision * all_recall / (all_precision + all_recall + 1e-10)
    return all_f1.max()
