# Titulo de ejemplo

In [1]:
import pandas as pd

SEED = 0
FRAC_VAL = 0.2

train_df = pd.read_csv('./datasets/all_veltri_2.csv', index_col = 0).sample(frac=1, random_state=SEED)
df_pos = pd.read_csv('./datasets/veltri_dramp_cdhit_90_2.csv', index_col = 0)
df_neg = pd.read_csv('./datasets/non_amp_ampep_cdhit90_2.csv', index_col = 0)

val_df_pos = df_pos.sample(frac=FRAC_VAL, random_state=SEED)
val_df_neg = df_neg.sample(frac=FRAC_VAL, random_state=SEED)
test_df_pos = df_pos.drop(val_df_pos.index)
test_df_neg = df_neg.drop(val_df_neg.index)

val_df = pd.concat([val_df_pos, val_df_neg]).sample(frac=1, random_state=SEED)
test_df = pd.concat([test_df_pos, test_df_neg]).sample(frac=1, random_state=SEED)

print(f"Validation dataframe: {len(val_df_pos)} positives, {len(val_df_neg)} negatives ({len(val_df_pos)/len(val_df)}%)")
print(f"Test dataframe: {len(test_df_pos)} positives, {len(test_df_neg)} negatives ({len(test_df_pos)/len(test_df)}%)")

Validation dataframe: 413 positives, 382 negatives (0.519496855345912%)
Test dataframe: 1652 positives, 1526 negatives (0.5198237885462555%)


# Funciones random go BRRRR

In [1]:
import torch
import torch.nn as nn

class CNNBiLSTM(nn.Module):
    def __init__(self, num_preds):
        r"""
            Compone una red convolucional con una red Long Short-Term Memory
            
            Usa un kernel fijo de 100
        """
        super().__init__()
        self.conv = torch.nn.Conv1d(
            in_channels = num_preds, 
            out_channels = 64,
            kernel_size = 10
        )
        self.lstm = torch.nn.LSTM(
            input_size = 64,
            hidden_size = 64,
            bidirectional = True
        )
        
        
    def forward(
        self,
        aa_preds: torch.Tensor,
    ) -> torch.Tensor:
    
        output = self.conv(aa_preds)
        output = output.transpose(0,1)
        output = self.lstm(output)
        
        return output

In [2]:
import torch
import torch.nn as nn
from typing import Optional
from transformers import BertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from MultiGPUModels import MultiGPUBertModel

class MultiGPUBertForPeptideAAClassification(torch.nn.Module):
    def __init__(self, bert_model_for_class, biochem_global_cols, biochem_aa_cols = list(range(133))):
        super().__init__()
                
        self.num_labels = bert_model_for_class.num_labels

        self.bert = MultiGPUBertModel(bert_model_for_class.bert)
        self.bert_dropout = nn.Dropout(0.01).to("cuda:0")

        self.biochem_aa_cols = biochem_aa_cols
        self.cnn_rnn = CNNBiLSTM(len(biochem_aa_cols)).to("cuda:1")
        self.cnn_rnn_dropout = nn.Dropout(0.01).to("cuda:1")

        self.biochem_global_cols = biochem_global_cols
        
        self.classifier = nn.Linear(
            bert_model_for_class.config.hidden_size + len(self.biochem_global_cols) + len(self.biochem_aa_cols), 
            self.num_labels
        ).to("cuda:0")
        
    def forward(
            self,
            dataloader_item = None,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            biochem_global_info: torch.Tensor = None,
            biochem_aa_info: torch.Tensor = None,
        ) -> SequenceClassifierOutput:
            r"""
                Se entrena por un lado el Encoder, con los input_ids, as attention_mask y todo eso
                
                Por otro lado, la BiLSTM se entrena con la informacion por aminoácido
                
                Finalmente se agregan los hidden_states de ambos componentes junto con los predictores
                globales para clasificar y obtener las probabilidades para cada clase
            """
            
            if dataloader_item != None:
                input_ids = dataloader_item['input_ids'].to("cuda:0")
                attention_mask = dataloader_item['attention_mask'].to("cuda:0") 
                biochem_global_info = dataloader_item['biochem_global_info'].to("cuda:0") 
                biochem_aa_info = dataloader_item['biochem_aa_info'].to("cuda:1") 
    
            # Primera parte: embeddings del encoder
    
            # Obtengo los embeddings generados por el encoder 
            bert_outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
            )
            bert_pooled_output = outputs[1]
    
            # Aplico dropout sobre este embedding
            bert_dropout_output = self.dropout(bert_pooled_output)
            
            
            # Segunda parte: cell info del CNN_BiLSTM
        
            # Proceso los predictores por aminoácido
            cnn_rnn_output = self.cnn_rnn(biochem_aa_info)
            
            # Me quedo con el cell_state, que representa toda la secuencia
            cell_state = cnn_rnn_output[1][1]
            
            # De nuevo aplico dropout
            cnn_rnn_dropout_output = self.cnn_rnn_dropout(cell_state).to("cuda:0")
            
            
            # Concateno toda la información
            output_with_biochem = torch.cat([
                bert_dropout_output,
                cnn_rnn_dropout_output, 
                biochem_global_info
            ], dim = 1)
            
            # Clasifico 
            logits = self.classifier(output_with_biochem)
            
            return SequenceClassifierOutput(
                loss=None,
                logits=logits,
                hidden_states=pooled_output,
                attentions=outputs.attentions,
            )

In [8]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class AMP_BioChemDataset(Dataset):
    """
        Esta clase permite formar un Dataset legible para los modelos de PyTorch
        Implementa los métodos necesarios para entrenar un BERT
    """
    def __init__(self, df, biochem_cols, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        super(Dataset, AMP_BioChemDataset).__init__(self)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.df = df
        self.max_len = max_len
        
        self.seqs = list(df['aa_seq'])
        self.biochem_cols = biochem_cols
        self.biochem = df[biochem_cols]
        self.labels = list(df['AMP'].astype(int))
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_enc = self.tokenizer(
            seq, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len,
            return_tensors = 'pt',
            return_attention_mask=True
        )
        seq_label = self.labels[idx]
        seq_biochem = self.biochem.iloc[idx]
        if "molecular_mass" in self.biochem_cols:
            seq_biochem.loc['molecular_mass'] = seq_biochem['molecular_mass'] / 1e4
        seq_biochem.transpose()
        
        return {
            'idx': idx,
            'input_ids': seq_enc['input_ids'].flatten(),
            'attention_mask' : seq_enc['attention_mask'].flatten(),
            'labels' : torch.tensor(seq_label, dtype=torch.long),
            'biochem_global_info': torch.tensor(seq_biochem, dtype=torch.float32),
            'biochem_aa_info': torch.rand(133, len(self.seqs[idx]))
        }
    

class AMP_BioChemDataLoader(DataLoader):
    """
        Es una estructura de datos iterable con mini-batches de datos
    
        dataframe   --  Un dataframe de Pandas con los datos, con columnas 'aa_seq' y 'AMP'
        batch_size  --  El tamaño de mini-batch con el que vas a entrenar el modelo   
    """
    def __init__(self, dataframe, biochem_cols, batch_size):
        DataLoader.__init__(
            self,
            AMP_BioChemDataset(dataframe, biochem_cols),
            batch_size = batch_size,
            num_workers = 2,
            shuffle = True
        )


In [1]:
import pandas as pd

SEED = 0
FRAC_VAL = 0.2

train_df = pd.read_csv('./datasets/all_veltri_2.csv', index_col = 0).sample(frac=1, random_state=SEED)
df_pos = pd.read_csv('./datasets/veltri_dramp_cdhit_90_2.csv', index_col = 0)
df_neg = pd.read_csv('./datasets/non_amp_ampep_cdhit90_2.csv', index_col = 0)

val_df_pos = df_pos.sample(frac=FRAC_VAL, random_state=SEED)
val_df_neg = df_neg.sample(frac=FRAC_VAL, random_state=SEED)
test_df_pos = df_pos.drop(val_df_pos.index)
test_df_neg = df_neg.drop(val_df_neg.index)

val_df = pd.concat([val_df_pos, val_df_neg]).sample(frac=1, random_state=SEED)
test_df = pd.concat([test_df_pos, test_df_neg]).sample(frac=1, random_state=SEED)

print(f"Validation dataframe: {len(val_df_pos)} positives, {len(val_df_neg)} negatives ({len(val_df_pos)/len(val_df)}%)")
print(f"Test dataframe: {len(test_df_pos)} positives, {len(test_df_neg)} negatives ({len(test_df_pos)/len(test_df)}%)")

Validation dataframe: 413 positives, 382 negatives (0.519496855345912%)
Test dataframe: 1652 positives, 1526 negatives (0.5198237885462555%)


In [5]:
from pipeline_tools import train_model, eval_model, compute_metrics
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

BATCH_SIZE = 1
LEARNING_RATE = 5e-6
WEIGHT_DECAY = 0.01
EPOCHS = 1
            
biochem_cols = [
    "molecular_mass",
    "hydrophobic_freq",
    "hydrophilic_freq",
    "basic_freq",
    "acid_freq",
    "charge",
    "aliphatic_index",
    "average_hydrophobicity",
    "isoelectric_point"
]

train_dataloader = AMP_BioChemDataLoader(train_df, biochem_cols, batch_size = BATCH_SIZE)            
test_dataloader = AMP_BioChemDataLoader(test_df, biochem_cols, batch_size = BATCH_SIZE)            

for d in test_dataloader:
    print(d)
    break

# Copiar el modelo para entrenarlo
bert_model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd')   
multi_gpu_bert = MultiGPUBertForPeptideAAClassification(bert_model, biochem_cols)
            
# Entrenar el modelo con esta configuracion
optimizer = AdamW(
    multi_gpu_bert.parameters(), 
    lr = LEARNING_RATE, 
    weight_decay = WEIGHT_DECAY)
            
total_steps = len(train_dataloader) * EPOCHS
            
scheduler = get_linear_schedule_with_warmup(optimizer, 
    num_warmup_steps = 0, 
    num_training_steps = total_steps)

for i in range(EPOCHS):
    labels, predicted, _ = train_model(multi_gpu_bert, train_dataloader, CrossEntropyLoss(), optimizer, scheduler, True)

# Obtener las métricas de entrenamiento
train_metrics = compute_metrics(labels, predicted)
            
print(f"Metrics for train set: ")
print(train_metrics)

# Obtener las métricas de validacion
test_labels, test_preds = eval_model(multi_gpu_bert, test_dataloader, CrossEntropyLoss(), True)
test_metrics = compute_metrics(test_labels, test_preds)
            
print(f"Metrics for test set: ")
print(test_metrics)

train_metrics.to_csv('./biochem_results/train_metrics_with_biochem_LL_1ep.csv')
test_metrics.to_csv('./biochem_results/test_metrics_with_biochem_LL_1ep.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

{'idx': tensor([612]), 'input_ids': tensor([[ 2, 22, 11, 10, 22, 11, 10, 21, 23, 13, 24, 23, 23, 17, 23, 23, 12,  6,
         12,  7, 23,  7, 19, 23, 23, 12, 19,  3,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1,

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 924.16 MiB already allocated; 4.31 MiB free; 938.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
train_dl = AMP_BioChemDataset(train_df, [])

In [11]:
import torch
for d in train_dl:
    print(d)
    break

{'idx': 0, 'input_ids': tensor([ 2, 20,  9,  6,  5,  8, 15, 10, 11,  5,  7, 12,  5, 15,  7,  5, 24, 22,
        17, 14, 10,  8, 14, 19, 21,  7, 22, 11, 23, 20, 19, 13, 13, 13, 16, 12,
        11, 13, 13, 19, 12,  5, 20, 22,  9,  7, 12, 19, 24, 23, 16,  7, 24,  6,
        16, 19,  9,  7, 13, 10, 13, 15, 12, 10, 13, 10,  7, 10, 10, 13,  9,  6,
        15, 12, 14, 19,  8, 13, 12,  6,  5, 18, 17,  7,  5,  8, 15, 18, 18, 14,
         6, 10,  5, 24,  5, 17,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1