In [1]:
import torch
from cafa_5.dataset import CAFA5Dataset, collate_tok_mult_out_batch
from cafa_5.transform import CharTokenizer, MultiOutputBinarizer
from cafa_5.model import CAFA5LSTM

In [2]:
# Load data and fit transforms
cafa_5_train_data = CAFA5Dataset(
    "../kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta",
    "../kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv",
    "../kaggle/input/cafa-5-protein-function-prediction/IA.txt",
    CharTokenizer(max_size=4096),
    MultiOutputBinarizer()
)
cafa_5_train_data.fit()

In [3]:
# Initialize & train small LSTM
cafa_5_small_lstm = CAFA5LSTM(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": 1024,
    },
    lstm_kwargs = {
        "hidden_size": 1024,
        "num_layers": 1,
        "dropout": 0.1,
        "bidirectional": True
    },
    fcnn_kwargs = {
        "hidden_size": 1024,
        "num_layers": 0,
        "dropout": 0.1,
        "hidden_activation": torch.nn.ReLU()
    }
)
cafa_5_small_lstm.to("cuda")
display(cafa_5_small_lstm)

cafa_5_small_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs = {"lr": 1e0},
    epochs=3,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    verbose = True,
    validation_size=0.1,
)



CAFA5LSTM(
  (embedding): Embedding(27, 1024)
  (lstm): LSTM(1024, 1024, batch_first=True, dropout=0.1, bidirectional=True)
  (fcnn): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=2048, out_features=43248, bias=True)
      (1): Sigmoid()
    )
  )
)

- Epoch: 0, Mode: train, Loss: 0.302192, F-score : 0.169596:  18%|█▊        | 704/4001 [03:20<15:49,  3.47it/s]

In [None]:
# Initialize & train large LSTM
cafa_5_medium_lstm = CAFA5LSTM(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": 2048,
    },
    lstm_kwargs = {
        "hidden_size": 2048,
        "num_layers": 2,
        "dropout": 0.1,
        "bidirectional": True
    },
    fcnn_kwargs = {
        "hidden_size": 2048,
        "num_layers": 1,
        "dropout": 0.1,
        "hidden_activation": torch.nn.ReLU()
    }
)
cafa_5_medium_lstm.to("cuda")
display(cafa_5_medium_lstm)

cafa_5_medium_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs = {"lr": 1e0},
    epochs=3,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    verbose = True,
    validation_size=0.1,
)

In [None]:
# Initialize & train large LSTM
cafa_5_large_lstm = CAFA5LSTM(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": 4096,
    },
    lstm_kwargs = {
        "hidden_size": 4096,
        "num_layers": 3,
        "dropout": 0.1,
        "bidirectional": True
    },
    fcnn_kwargs = {
        "hidden_size": 4096,
        "num_layers": 2,
        "dropout": 0.1,
        "hidden_activation": torch.nn.ReLU()
    }
)
cafa_5_large_lstm.to("cuda")
display(cafa_5_large_lstm)

cafa_5_large_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs = {"lr": 1e0},
    epochs=3,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    verbose = True,
    validation_size=0.1,
)