In [7]:
import torch
from cafa_5.dataset import CAFA5Dataset, collate_tok_mult_out_batch
from cafa_5.transform import CharTokenizer, MultiOutputBinarizer
from cafa_5.model import CAFA5Transformer, StepLRScheduler

In [8]:
# Build the CAFA-5 training dataset from its three input files, then fit its transforms.
train_sequences_path = "../kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
train_terms_path = "../kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv"
ia_path = "../kaggle/input/cafa-5-protein-function-prediction/IA.txt"

# Sequence-side and label-side transforms handed to the dataset.
sequence_tokenizer = CharTokenizer(max_size=1024, padding=True)
terms_binarizer = MultiOutputBinarizer()

cafa_5_train_data = CAFA5Dataset(
    train_sequences_path,
    train_terms_path,
    ia_path,
    sequence_tokenizer,
    terms_binarizer,
)
cafa_5_train_data.fit()

In [10]:
# Sanity check: show the first entry of the loaded dataset's `prots_ids`.
cafa_5_train_data.prots_ids[0]

'P20536'

In [11]:
# Sanity check: print the first entry of `prots_seqs` (print shows the raw string, not its quoted repr).
print(cafa_5_train_data.prots_seqs[0])

MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLKQPLRNKRVCVCGIDPYPKDGTGVPFESPNFTKKSIKEIASSISRLTGVIDYKGYNLNIIDGVIPWNYYLSCKLGETKSHAIYWDKISKLLLQHITKHVSVLYCLGKTDFSNIRAKLESPVTTIVGYHPAARDRQFEKDRSFEIINVLLELDNKVPINWAQGFIY


In [4]:
# Experiment: transformer with 1 encoder layer, trained for 8 epochs.
#
# NOTE(review): `ClassWeightedBCELoss` is not imported by the import cell of this notebook;
# on a fresh kernel this cell raises NameError until the import is added — TODO confirm
# which module defines it.

# Resolve the device once so the model and the loss weights always live on the same
# device, and so the cell does not crash outright on a host without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the model width: used as the embedding dimension and passed
# to the LR scheduler as `d_model` (presumably these two must agree — TODO confirm).
d_model = 512

cafa_5_transformer = CAFA5Transformer(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": d_model,
    },
    transformer_kwargs = {
        "num_encoder_layers": 1
    },
    feed_forward_kwargs = {
        "num_layers": 0,
        "hidden_size": 2048,
        "hidden_activation": torch.nn.ReLU(),
        "dropout": 0.1
    }
)
cafa_5_transformer.to(device)
display(cafa_5_transformer)

# betas/eps are the torch.optim.Adam defaults; only amsgrad departs from the default.
# No "lr" is given here — presumably the scheduler drives the learning rate (TODO confirm).
optimizer_kwargs = {
    "betas": (0.9, 0.999),
    "eps": 1e-08,
    "amsgrad": True,
}

# Per-class loss weights come from the dataset and are created on the model's device.
loss_weights = torch.tensor(cafa_5_train_data.go_codes_info_accr_weights, device=device)

cafa_5_transformer.fit(
    cafa_5_train_data,
    epochs=8,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    loss_fn = ClassWeightedBCELoss(loss_weights),
    optimizer_type = torch.optim.Adam,
    optimizer_kwargs = optimizer_kwargs,
    lr_scheduler_type = StepLRScheduler,
    lr_scheduler_kwargs = {
        "d_model": d_model
    },
    verbose = True,
    validation_size = 0.1
)

CAFA5Transformer(
  (embedding): Embedding(28, 512)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (feed_forward): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=512, out_features=43248, 

  "input_ids": torch.nested.to_padded_tensor(torch.nested.nested_tensor(sequences_input_ids), 0).squeeze(),
- Epoch: 0, Mode: train, Loss: 0.011399 (0.007525),F-score : 0.240419 (0.310132): 100%|██████████| 4001/4001 [05:54<00:00, 11.29it/s]
- Epoch: 0, Mode: validation, Loss: 0.009455, F-score : 0.279684: 100%|██████████| 445/445 [06:13<00:00,  1.19it/s]  
- Epoch: 1, Mode: train, Loss: 0.009321 (0.007506),F-score : 0.243201 (0.250872): 100%|██████████| 4001/4001 [05:53<00:00, 11.31it/s]
- Epoch: 1, Mode: validation, Loss: 0.009394, F-score : 0.244667: 100%|██████████| 445/445 [06:12<00:00,  1.19it/s]  
- Epoch: 2, Mode: train, Loss: 0.009255 (0.007489),F-score : 0.245312 (0.267309): 100%|██████████| 4001/4001 [05:53<00:00, 11.33it/s]
- Epoch: 2, Mode: validation, Loss: 0.009344, F-score : 0.257933: 100%|██████████| 445/445 [06:11<00:00,  1.20it/s]  
- Epoch: 3, Mode: train, Loss: 0.009204 (0.007403),F-score : 0.245954 (0.256502): 100%|██████████| 4001/4001 [05:52<00:00, 11.35it/s]
- 

In [5]:
# Experiment: transformer with 2 encoder layers, trained for 16 epochs.
# Rebinds `cafa_5_transformer`, so any previously trained model under that name is dropped.
#
# NOTE(review): `ClassWeightedBCELoss` is not imported by the import cell of this notebook;
# on a fresh kernel this cell raises NameError until the import is added — TODO confirm
# which module defines it.

# Resolve the device once so the model and the loss weights always live on the same
# device, and so the cell does not crash outright on a host without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the model width: used as the embedding dimension and passed
# to the LR scheduler as `d_model` (presumably these two must agree — TODO confirm).
d_model = 512

cafa_5_transformer = CAFA5Transformer(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": d_model,
    },
    transformer_kwargs = {
        "num_encoder_layers": 2
    },
    feed_forward_kwargs = {
        "num_layers": 0,
        "hidden_size": 2048,
        "hidden_activation": torch.nn.ReLU(),
        "dropout": 0.1
    }
)
cafa_5_transformer.to(device)
display(cafa_5_transformer)

# betas/eps are the torch.optim.Adam defaults; only amsgrad departs from the default.
# No "lr" is given here — presumably the scheduler drives the learning rate (TODO confirm).
optimizer_kwargs = {
    "betas": (0.9, 0.999),
    "eps": 1e-08,
    "amsgrad": True,
}

# Per-class loss weights come from the dataset and are created on the model's device.
loss_weights = torch.tensor(cafa_5_train_data.go_codes_info_accr_weights, device=device)

cafa_5_transformer.fit(
    cafa_5_train_data,
    epochs=16,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    loss_fn = ClassWeightedBCELoss(loss_weights),
    optimizer_type = torch.optim.Adam,
    optimizer_kwargs = optimizer_kwargs,
    lr_scheduler_type = StepLRScheduler,
    lr_scheduler_kwargs = {
        "d_model": d_model
    },
    verbose = True,
    validation_size = 0.1
)

CAFA5Transformer(
  (embedding): Embedding(28, 512)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (feed_forward): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=512, out_features=4

- Epoch: 0, Mode: train, Loss: 0.011542 (0.007283),F-score : 0.240606 (0.237337): 100%|██████████| 4001/4001 [09:23<00:00,  7.10it/s]
- Epoch: 0, Mode: validation, Loss: 0.009188, F-score : 0.21535: 100%|██████████| 445/445 [09:43<00:00,  1.31s/it]   
- Epoch: 1, Mode: train, Loss: 0.00936 (0.007191),F-score : 0.24058 (0.232324): 100%|██████████| 4001/4001 [09:23<00:00,  7.10it/s]  
- Epoch: 1, Mode: validation, Loss: 0.009149, F-score : 0.220481: 100%|██████████| 445/445 [09:42<00:00,  1.31s/it]  
- Epoch: 2, Mode: train, Loss: 0.009315 (0.007084),F-score : 0.239609 (0.249423): 100%|██████████| 4001/4001 [09:23<00:00,  7.10it/s]
- Epoch: 2, Mode: validation, Loss: 0.009116, F-score : 0.218441: 100%|██████████| 445/445 [09:43<00:00,  1.31s/it]  
- Epoch: 3, Mode: train, Loss: 0.009273 (0.007153),F-score : 0.238978 (0.254998): 100%|██████████| 4001/4001 [09:24<00:00,  7.09it/s]
- Epoch: 3, Mode: validation, Loss: 0.009072, F-score : 0.230299: 100%|██████████| 445/445 [09:44<00:00,  1.31

In [6]:
# Experiment: transformer with 4 encoder layers, trained for 32 epochs.
# Rebinds `cafa_5_transformer`, so any previously trained model under that name is dropped.
#
# NOTE(review): `ClassWeightedBCELoss` is not imported by the import cell of this notebook;
# on a fresh kernel this cell raises NameError until the import is added — TODO confirm
# which module defines it.

# Resolve the device once so the model and the loss weights always live on the same
# device, and so the cell does not crash outright on a host without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the model width: used as the embedding dimension and passed
# to the LR scheduler as `d_model` (presumably these two must agree — TODO confirm).
d_model = 512

cafa_5_transformer = CAFA5Transformer(
    amino_acids_vocab = cafa_5_train_data.seq_transform.vocab,
    go_codes_vocab = cafa_5_train_data.go_codes_ids,
    embedding_kwargs = {
        "embedding_dim": d_model,
    },
    transformer_kwargs = {
        "num_encoder_layers": 4
    },
    feed_forward_kwargs = {
        "num_layers": 0,
        "hidden_size": 2048,
        "hidden_activation": torch.nn.ReLU(),
        "dropout": 0.1
    }
)
cafa_5_transformer.to(device)
display(cafa_5_transformer)

# betas/eps are the torch.optim.Adam defaults; only amsgrad departs from the default.
# No "lr" is given here — presumably the scheduler drives the learning rate (TODO confirm).
optimizer_kwargs = {
    "betas": (0.9, 0.999),
    "eps": 1e-08,
    "amsgrad": True,
}

# Per-class loss weights come from the dataset and are created on the model's device.
loss_weights = torch.tensor(cafa_5_train_data.go_codes_info_accr_weights, device=device)

cafa_5_transformer.fit(
    cafa_5_train_data,
    epochs=32,
    batch_size=32,
    collate_fn = collate_tok_mult_out_batch,
    loss_fn = ClassWeightedBCELoss(loss_weights),
    optimizer_type = torch.optim.Adam,
    optimizer_kwargs = optimizer_kwargs,
    lr_scheduler_type = StepLRScheduler,
    lr_scheduler_kwargs = {
        "d_model": d_model
    },
    verbose = True,
    validation_size = 0.1
)

CAFA5Transformer(
  (embedding): Embedding(28, 512)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (feed_forward): FCNN(
    (hidden_activation): ReLU()
    (output_activation): Sigmoid()
    (mlp): Sequential(
      (0): Linear(in_features=512, out_features=4

- Epoch: 0, Mode: train, Loss: 0.011675 (0.01087),F-score : 0.238192 (0.142972): 100%|██████████| 4001/4001 [16:27<00:00,  4.05it/s] 
- Epoch: 0, Mode: validation, Loss: 0.009363, F-score : 0.216896: 100%|██████████| 445/445 [16:48<00:00,  2.27s/it]   
- Epoch: 1, Mode: train, Loss: 0.009451 (0.010764),F-score : 0.236776 (0.143306): 100%|██████████| 4001/4001 [16:26<00:00,  4.06it/s]
- Epoch: 1, Mode: validation, Loss: 0.009326, F-score : 0.216896: 100%|██████████| 445/445 [16:48<00:00,  2.27s/it]   
- Epoch: 2, Mode: train, Loss: 0.009403 (0.010707),F-score : 0.23516 (0.143291): 100%|██████████| 4001/4001 [16:28<00:00,  4.05it/s] 
- Epoch: 2, Mode: validation, Loss: 0.009282, F-score : 0.216896: 100%|██████████| 445/445 [16:50<00:00,  2.27s/it]   
- Epoch: 3, Mode: train, Loss: 0.009365 (0.010655),F-score : 0.234279 (0.192826): 100%|██████████| 4001/4001 [16:28<00:00,  4.05it/s]
- Epoch: 3, Mode: validation, Loss: 0.009265, F-score : 0.255949: 100%|██████████| 445/445 [16:50<00:00,  2

KeyboardInterrupt: 