In [3]:
import pandas as pd
import torch

from cafa_5.dataset import CAFA5Dataset, collate_data_dict
from cafa_5.model import CAFA5EmbeddingsFFN

# Select the compute device: CUDA whenever PyTorch reports a usable GPU,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Test-split dataset; every argument is a path to a file under ../kaggle/input.
cafa_5_test_set = CAFA5Dataset(
    # Competition files: target sequences (FASTA), IA.txt and the GO graph (OBO).
    prots_amino_acids_fasta_path="../kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta",
    go_codes_info_accr_weights_txt_path="../kaggle/input/cafa-5-protein-function-prediction/IA.txt",
    go_code_graph_obo_path="../kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo",
    # Precomputed embedding files (.npy), one per encoder named in the keyword.
    prots_t5_embeds_npy_path="../kaggle/input/t5embeds/test_embeds.npy",
    prots_protbert_embeds_npy_path="../kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy",
    prots_esm2_embeds_npy_path="../kaggle/input/4637427/test_embeds_esm2_t36_3B_UR50D.npy",
)

In [3]:
# Instantiate the feed-forward model; n_go_codes is taken from the number of
# GO codes exposed by the test dataset.
cafa_5_model = CAFA5EmbeddingsFFN(
    n_go_codes=len(cafa_5_test_set.go_codes),
    num_layers=4,
    hidden_size=2048,
    hidden_activation=torch.nn.ReLU(),
    dropout=0.1,
    batch_normalization=True,
    residual_connections=True,
)
cafa_5_model.to(device)
display(cafa_5_model)

# Total number of scalar elements across all parameter tensors of the model.
n_params = sum(tensor.numel() for tensor in cafa_5_model.parameters())
print("# of parameters:", n_params)

CAFA5EmbeddingsFFN(
  (ffn): FFN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (ffn): ModuleDict(
      (linear_0): Linear(in_features=4608, out_features=2048, bias=True)
      (batch_norm_0): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation_0): ReLU()
      (dropout_0): Dropout(p=0.1, inplace=False)
      (linear_1): Linear(in_features=2048, out_features=2048, bias=True)
      (batch_norm_1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation_1): ReLU()
      (dropout_1): Dropout(p=0.1, inplace=False)
      (linear_2): Linear(in_features=2048, out_features=2048, bias=True)
      (batch_norm_2): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation_2): ReLU()
      (dropout_2): Dropout(p=0.1, inplace=False)
      (linear_3): Linear(in_features=2048, out_features=2048, bias=True)
      (batch_norm_3): BatchNorm1d(2048

# of parameters: 110669040


In [4]:
# Restore the trained weights from the checkpoint's "state_dict" entry.
# map_location remaps tensors that were saved from a GPU run onto the active
# device, so the checkpoint also loads when `device` fell back to "cpu".
cafa_5_model.load_state_dict(
    torch.load(
        "../kaggle/weights/4_ffn/weights_61.pt",
        map_location=device,
    )["state_dict"]
)

# The model contains Dropout and BatchNorm layers, which behave differently in
# training mode. Switch to evaluation mode explicitly before inference; this is
# a no-op if predict_proba already does so internally.
cafa_5_model.eval()

# Run inference over the test set in batches of 32 with a progress bar.
cafa_5_model.predict_proba(
    cafa_5_test_set,
    batch_size=32,
    collate_fn=collate_data_dict,
    verbose=True,
)

100%|██████████| 4434/4434 [04:19<00:00, 17.11it/s]


In [65]:
# Predictions read from "submission.tsv" (first row is used as the header).
# Only the last column is relabelled so it stays distinguishable after the merge.
prot_t5_bert_esm2_submission_df = pd.read_csv("submission.tsv", sep="\t")
prot_t5_bert_esm2_submission_df.columns = [
    *prot_t5_bert_esm2_submission_df.columns[:-1],
    "Prediction (T5/BERT/ESM2)",
]

# Second submission, read without a header row: borrow the leading column
# names from the frame above and cap its scores at 1.
sub_total_submission_df = pd.read_csv(
    "../kaggle/input/best_submissions/submission.tsv",
    sep="\t",
    header=None,
)
sub_total_submission_df.columns = [
    *prot_t5_bert_esm2_submission_df.columns[:-1],
    "Prediction (Total)",
]
sub_total_submission_df["Prediction (Total)"] = (
    sub_total_submission_df["Prediction (Total)"].clip(upper=1)
)

# Outer-join on the (protein, GO term) key, treat a score missing from either
# side as 0, take the plain average, and keep the first row per key.
total_submission_df = (
    pd.merge(
        sub_total_submission_df,
        prot_t5_bert_esm2_submission_df,
        how="outer",
        on=["Protein Id", "GO Term Id"],
    )
    .assign(
        Prediction=lambda merged: (
            merged["Prediction (T5/BERT/ESM2)"].fillna(0)
            + merged["Prediction (Total)"].fillna(0)
        ) / 2.
    )
    .drop_duplicates(subset=["Protein Id", "GO Term Id"])
)

# Write the blended scores as a headerless, index-free TSV.
total_submission_df.loc[:, ["Protein Id", "GO Term Id", "Prediction"]].to_csv(
    "../kaggle/input/total-submission/submission2.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Predictions read from "submission.tsv" (first row is used as the header).
# Only the last column is relabelled so it stays distinguishable after the merge.
prot_t5_bert_esm2_submission_df = pd.read_csv("submission.tsv", sep="\t")
prot_t5_bert_esm2_submission_df.columns = [
    *prot_t5_bert_esm2_submission_df.columns[:-1],
    "Prediction (T5/BERT/ESM2)",
]

# Second submission, read without a header row: borrow the leading column
# names from the frame above and cap its scores at 1.
sub_total_submission_df = pd.read_csv(
    "../kaggle/input/best_submissions/submission.tsv",
    sep="\t",
    header=None,
)
sub_total_submission_df.columns = [
    *prot_t5_bert_esm2_submission_df.columns[:-1],
    "Prediction (Total)",
]
sub_total_submission_df["Prediction (Total)"] = (
    sub_total_submission_df["Prediction (Total)"].clip(upper=1)
)

# Outer-join on the (protein, GO term) key, treat a score missing from either
# side as 0, and combine with weights 0.4 (T5/BERT/ESM2) and 0.6 (Total).
# The first row per key is kept.
total_submission_df = (
    pd.merge(
        sub_total_submission_df,
        prot_t5_bert_esm2_submission_df,
        how="outer",
        on=["Protein Id", "GO Term Id"],
    )
    .assign(
        Prediction=lambda merged: (
            0.4*merged["Prediction (T5/BERT/ESM2)"].fillna(0)
            + 0.6*merged["Prediction (Total)"].fillna(0)
        )
    )
    .drop_duplicates(subset=["Protein Id", "GO Term Id"])
)

# Write the blended scores as a headerless, index-free TSV.
total_submission_df.loc[:, ["Protein Id", "GO Term Id", "Prediction"]].to_csv(
    "../kaggle/input/total-submission/submission3.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [1]:
# Raw (un-normalised) pair of weights.
w1, w2 = 0.38, 0.56

# Divide each weight by their total so the normalised pair sums to one.
z = w1 + w2
n_w1, n_w2 = w1 / z, w2 / z

In [2]:
# Show the two normalised weights as the cell's output.
(n_w1, n_w2)

(0.40425531914893614, 0.5957446808510638)