forked from https://www.kaggle.com/code/siddhvr/foldseek-blastp-ensemble

just to be noted:

this is the same notebook that was submitted to CAFA5, kept here for demonstration purposes (few changes have been made)
it uses a Foldseek/BLASTp model fine-tuned on the CAFA5 dataset (try training the model on CAFA6)
the submission uses the ensemble from the CAFA5 notebook (scroll all the way down to check)

Since running Foldseek on Kaggle gives MMseqs2, I used the Foldseek test-set submission directly. For the entire code, you can refer to this amazing notebook on Foldseek by RAMAN: https://www.kaggle.com/code/samusram/leveraging-foldseek

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/ncbi-blast-2.14.0+-x64-linux.tar.gz

In [None]:
# Unpack the downloaded BLAST+ archive into the current directory
# (z: gunzip, x: extract, v: list every file, p: keep permissions, f: archive file).
!tar zxvpf ncbi-blast-2.14.0+-x64-linux.tar.gz

In [None]:
# Copy every extracted BLAST+ executable into the conda bin directory
# (presumably so the tools are found on PATH when invoked by name -- confirm).
!cp /kaggle/working/ncbi-blast-2.14.0+/bin/* /opt/conda/bin

In [None]:
# Install ProFun straight from its GitHub repository.
# NOTE(review): no tag or commit is pinned, so this tracks the default branch.
!pip install git+https://github.com/SamusRam/ProFun.git

In [None]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm, trange
from Bio import SeqIO
import numpy as np

from profun.models import BlastMatching, BlastConfig
from profun.utils.project_info import ExperimentInfo

## Obtaining train data

In [None]:
# Load the training term annotations and the training sequences, then join
# them on EntryID so every annotation row also carries its sequence.
data_root = Path('/kaggle/input/cafa-6-protein-function-prediction/')
train_terms = pd.read_csv(data_root / "Train/train_terms.tsv", sep="\t")

# Read the FASTA file once, keeping only (identifier, sequence string) pairs.
# NOTE(review): if the FASTA headers use the UniProt `db|accession|name` form,
# record.id will not equal the EntryID values in train_terms and the inner
# merge below will come back empty -- verify against the actual file.
with open(data_root / "Train/train_sequences.fasta") as handle:
    fasta_records = [
        (record.id, str(record.seq)) for record in SeqIO.parse(handle, "fasta")
    ]
ids = [record_id for record_id, _ in fasta_records]
seqs = [sequence for _, sequence in fasta_records]

train_seqs_df = pd.DataFrame({'EntryID': ids, 'Seq': seqs})
# DataFrame.merge defaults to an inner join: entries missing from either
# table are dropped.
train_df_long = train_terms.merge(train_seqs_df, on='EntryID')

## Init model

In [None]:
# Describe the experiment and build the BLAST-matching model from its config.
experiment_info = ExperimentInfo(
    validation_schema='public_lb',
    model_type='blast',
    model_version='1nn',
)

# One class per distinct term present in the joined training table.
go_term_classes = list(train_df_long['term'].unique())

# Hyper-parameter optimisation is switched off, so its related options are None.
# NOTE(review): n_jobs=100 is probably more workers than the machine has cores,
# and model_version says '1nn' while n_neighbours is 5 -- confirm both are intended.
config = BlastConfig(
    experiment_info=experiment_info,
    id_col_name='EntryID',
    target_col_name='term',
    seq_col_name='Seq',
    class_names=go_term_classes,
    optimize_hyperparams=False,
    n_calls_hyperparams_opt=None,
    hyperparam_dimensions=None,
    per_class_optimization=None,
    class_weights=None,
    n_neighbours=5,
    e_threshold=0.02,
    n_jobs=100,
    pred_batch_size=10,
)

blast_model = BlastMatching(config)

## Train model

In [None]:
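# Fitting is skipped in this notebook: the BLAST predictions used below were
# obtained by training offline. Uncomment to train here instead.
#blast_model.fit(train_df_long)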

In [None]:
# BLAST predictions for the test set, obtained by training offline.
# The file has no header row, so columns get integer labels; column 0 is
# discarded, leaving columns 1, 2 and 3.
blast_predictions_path = '/kaggle/input/proteinet-best/blast_submission.tsv'
test_pred_df_blast = pd.read_csv(
    blast_predictions_path,
    sep='\t',
    header=None,
).drop(columns=[0])

# Existing submission to be ensembled with BLAST; also header-less, so the
# three column names are supplied explicitly.
public_submission_path = '/kaggle/input/protbert-ensemble/submission.tsv'
submission_best_public = pd.read_csv(
    public_submission_path,
    sep='\t',
    header=None,
    names=['Id', 'GO term', 'Confidence'],
)


In [None]:
# Outer-join the two prediction tables on (protein id, GO term) so that pairs
# predicted by only one of the sources are kept as well. Because the key
# columns are named differently on each side, both sets of key columns survive
# the merge: pairs found only in the BLAST table carry their keys in columns
# 1 and 2 and have NaN in 'Id' / 'GO term'.
submissions_merged = pd.merge(
    submission_best_public,
    test_pred_df_blast,
    how='outer',
    left_on=['Id', 'GO term'],
    right_on=[1, 2],
)

In [None]:
# The outer merge used differently named key columns, so pairs predicted only
# by BLAST have NaN in 'Id' / 'GO term' and keep their keys in columns 1 and 2.
# Copy those keys across before dropping the BLAST key columns; otherwise the
# BLAST-only rows would be written out with empty identifiers.
submissions_merged['Id'] = submissions_merged['Id'].fillna(submissions_merged[1])
submissions_merged['GO term'] = submissions_merged['GO term'].fillna(submissions_merged[2])
submissions_merged.drop([1, 2], axis=1, inplace=True)

# Take the existing submission's confidence where present and fall back to the
# BLAST score (column 3) otherwise. A vectorised fillna gives the same result
# as the former row-wise apply, is much faster on large tables, and still
# yields a proper column when the frame is empty.
submissions_merged['confidence_combined'] = submissions_merged['Confidence'].fillna(submissions_merged[3])


In [None]:
# Write the final header-less, tab-separated submission containing only the
# protein id, GO term and combined confidence, in that column order.
submissions_merged.to_csv(
    'submission.tsv',
    columns=['Id', 'GO term', 'confidence_combined'],
    sep='\t',
    header=False,
    index=False,
)
