In [6]:
import torch
from cafa_5.dataset import CAFA5Dataset, collate_tok_mult_out_batch
from cafa_5.transform import CharTokenizer, MultiOutputBinarizer
from cafa_5.model import CAFA5LSTM

In [7]:
# Build the CAFA-5 training dataset from the Kaggle input files, then fit it.
# The three paths are passed positionally in the order: sequences FASTA,
# terms TSV, IA text file.
train_sequences_path = "../kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
train_terms_path = "../kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv"
ia_weights_path = "../kaggle/input/cafa-5-protein-function-prediction/IA.txt"

# Transforms handed to the dataset as its 4th and 5th positional arguments.
# NOTE(review): presumably the tokenizer handles sequences and the binarizer
# handles GO-term targets -- confirm against CAFA5Dataset's signature.
sequence_tokenizer = CharTokenizer(max_size=4096, padding=True)
target_binarizer = MultiOutputBinarizer()

cafa_5_train_data = CAFA5Dataset(
    train_sequences_path,
    train_terms_path,
    ia_weights_path,
    sequence_tokenizer,
    target_binarizer,
)
# Fit the dataset's transforms before any model reads its vocabularies.
cafa_5_train_data.fit()

In [3]:
# Initialize & train small LSTM
# Single bidirectional LSTM layer followed by an output head with no hidden
# layers (fcnn "num_layers": 0).
cafa_5_small_lstm = CAFA5LSTM(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": 1024,
    },
    lstm_kwargs = {
        "hidden_size": 1024,
        "num_layers": 1,
        # torch.nn.LSTM only applies dropout between stacked layers, so with
        # a single layer a non-zero value has no effect and just triggers a
        # UserWarning. Keep it at 0.0 for the one-layer model.
        "dropout": 0.0,
        "bidirectional": True
    },
    fcnn_kwargs = {
        # NOTE(review): with "num_layers": 0 the hidden_size / dropout /
        # hidden_activation settings are presumably unused -- confirm in FCNN.
        "hidden_size": 1024,
        "num_layers": 0,
        "dropout": 0.1,
        "hidden_activation": torch.nn.ReLU()
    }
)
# Move the model to the GPU and show its layer structure.
cafa_5_small_lstm.to("cuda")
display(cafa_5_small_lstm)

# Optimizer settings handed to fit().
# NOTE(review): momentum/nesterov suggest an SGD-style optimizer, and lr=1.0
# is unusually large for that; in the recorded run the validation F-score did
# not change between epochs -- consider tuning the learning rate.
optimizer_kwargs = {
    "lr": 1e0,
    "momentum": 0.9,
    "nesterov": True
}
# Train for 2 epochs with batches of 32, holding out validation_size=0.1.
cafa_5_small_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs = optimizer_kwargs,
    epochs=2,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    verbose = True,
    validation_size=0.1,
)



CAFA5LSTM(
  (embedding): Embedding(27, 1024)
  (lstm): LSTM(1024, 1024, batch_first=True, dropout=0.1, bidirectional=True)
  (fcnn): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=2048, out_features=43248, bias=True)
      (1): Sigmoid()
    )
  )
)

- Epoch: 0, Mode: train, Loss: 0.01421, F-score : 0.231301: 100%|██████████| 4001/4001 [20:28<00:00,  3.26it/s] 
- Epoch: 0, Mode: validation, Loss: 0.004371, F-score : 0.214771: 100%|██████████| 445/445 [21:33<00:00,  2.91s/it]    
- Epoch: 1, Mode: train, Loss: 0.00423, F-score : 0.233661: 100%|██████████| 4001/4001 [20:17<00:00,  3.29it/s] 
- Epoch: 1, Mode: validation, Loss: 0.004162, F-score : 0.214771: 100%|██████████| 445/445 [21:22<00:00,  2.88s/it]    


In [4]:
# Medium LSTM: two stacked bidirectional LSTM layers and one hidden layer in
# the fully-connected head. Built, moved to the GPU, displayed, then trained.
cafa_5_medium_lstm = CAFA5LSTM(
    amino_acids_vocab=cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab=cafa_5_train_data.go_codes_ids,
    embedding_kwargs=dict(embedding_dim=1024),
    lstm_kwargs=dict(
        hidden_size=1024,
        num_layers=2,
        dropout=0.1,
        bidirectional=True,
    ),
    fcnn_kwargs=dict(
        hidden_size=1024,
        num_layers=1,
        dropout=0.1,
        hidden_activation=torch.nn.ReLU(),
    ),
)
cafa_5_medium_lstm.to("cuda")
display(cafa_5_medium_lstm)

# Optimizer settings forwarded to fit(); same values as the other LSTM cells.
optimizer_kwargs = dict(lr=1e0, momentum=0.9, nesterov=True)

# Train for 4 epochs with batches of 32, holding out validation_size=0.1.
cafa_5_medium_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs=optimizer_kwargs,
    epochs=4,
    batch_size=32,
    collate_fn=collate_tok_mult_out_batch,
    verbose=True,
    validation_size=0.1,
)

CAFA5LSTM(
  (embedding): Embedding(27, 1024)
  (lstm): LSTM(1024, 1024, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (fcnn): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=2048, out_features=1024, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=1024, out_features=43248, bias=True)
      (4): Sigmoid()
    )
  )
)

- Epoch: 0, Mode: train, Loss: 0.015615, F-score : 0.238702: 100%|██████████| 4001/4001 [32:57<00:00,  2.02it/s]  
- Epoch: 0, Mode: validation, Loss: 0.004056, F-score : 0.252228: 100%|██████████| 445/445 [34:37<00:00,  4.67s/it]    
- Epoch: 1, Mode: train, Loss: 0.004196, F-score : 0.239647:  95%|█████████▌| 3806/4001 [45:21<02:19,  1.40it/s]  


KeyboardInterrupt: 

In [3]:
# Large LSTM: three stacked bidirectional LSTM layers and two hidden layers in
# the fully-connected head. Built, moved to the GPU, displayed, then trained.
cafa_5_large_lstm = CAFA5LSTM(
    amino_acids_vocab=cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab=cafa_5_train_data.go_codes_ids,
    embedding_kwargs=dict(embedding_dim=1024),
    lstm_kwargs=dict(
        hidden_size=1024,
        num_layers=3,
        dropout=0.1,
        bidirectional=True,
    ),
    fcnn_kwargs=dict(
        hidden_size=1024,
        num_layers=2,
        dropout=0.1,
        hidden_activation=torch.nn.ReLU(),
    ),
)
cafa_5_large_lstm.to("cuda")
display(cafa_5_large_lstm)

# Optimizer settings forwarded to fit(); same values as the other LSTM cells.
optimizer_kwargs = dict(lr=1e0, momentum=0.9, nesterov=True)

# Train for 2 epochs with batches of 32, holding out validation_size=0.1.
cafa_5_large_lstm.fit(
    cafa_5_train_data,
    optimizer_kwargs=optimizer_kwargs,
    epochs=2,
    batch_size=32,
    collate_fn=collate_tok_mult_out_batch,
    verbose=True,
    validation_size=0.1,
)

CAFA5LSTM(
  (embedding): Embedding(27, 1024)
  (lstm): LSTM(1024, 1024, num_layers=3, batch_first=True, dropout=0.1, bidirectional=True)
  (fcnn): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=2048, out_features=1024, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=1024, out_features=1024, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.1, inplace=False)
      (6): Linear(in_features=1024, out_features=43248, bias=True)
      (7): Sigmoid()
    )
  )
)

- Epoch: 0, Mode: train, Loss: 0.026346, F-score : 0.232949:  77%|███████▋  | 3077/4001 [57:58<17:24,  1.13s/it]  


KeyboardInterrupt: 