# Titulo de ejemplo

In [1]:
import pandas as pd

SEED = 0
FRAC_VAL = 0.2

train_df = pd.read_csv('./datasets/all_veltri_2.csv', index_col = 0).sample(frac=1, random_state=SEED)
df_pos = pd.read_csv('./datasets/veltri_dramp_cdhit_90_2.csv', index_col = 0)
df_neg = pd.read_csv('./datasets/non_amp_ampep_cdhit90_2.csv', index_col = 0)

val_df_pos = df_pos.sample(frac=FRAC_VAL, random_state=SEED)
val_df_neg = df_neg.sample(frac=FRAC_VAL, random_state=SEED)
test_df_pos = df_pos.drop(val_df_pos.index)
test_df_neg = df_neg.drop(val_df_neg.index)

val_df = pd.concat([val_df_pos, val_df_neg]).sample(frac=1, random_state=SEED)
test_df = pd.concat([test_df_pos, test_df_neg]).sample(frac=1, random_state=SEED)

print(f"Validation dataframe: {len(val_df_pos)} positives, {len(val_df_neg)} negatives ({len(val_df_pos)/len(val_df)}%)")
print(f"Test dataframe: {len(test_df_pos)} positives, {len(test_df_neg)} negatives ({len(test_df_pos)/len(test_df)}%)")

Validation dataframe: 413 positives, 382 negatives (0.519496855345912%)
Test dataframe: 1652 positives, 1526 negatives (0.5198237885462555%)


# Funciones random go BRRRR

In [2]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from time import process_time_ns 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from pandas import DataFrame
from itertools import product
from sklearn.model_selection import StratifiedKFold
from copy import deepcopy
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import pandas as pd
from time import process_time_ns
import torch
from pipeline_tools import compute_loss

class AMP_BioChemLLDataset(Dataset):
    """
        Esta clase permite formar un Dataset legible para los modelos de PyTorch
        Implementa los métodos necesarios para entrenar un BERT
    """
    def __init__(self, df, biochem_cols, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        super(Dataset, AMP_BioChemLLDataset).__init__(self)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.df = df
        self.max_len = max_len
        
        self.seqs = list(df['aa_seq'])
        self.biochem = df[biochem_cols]
        self.labels = list(df['AMP'].astype(int))
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_enc = self.tokenizer(
            seq, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len,
            return_tensors = 'pt',
            return_attention_mask=True
        )
        seq_label = self.labels[idx]
        seq_biochem = self.biochem.iloc[idx]
        seq_biochem.loc['molecular_mass'] = seq_biochem['molecular_mass'] / 1e4
        seq_biochem.transpose()
        
        return {
            'idx': idx,
            'input_ids': seq_enc['input_ids'].flatten(),
            'attention_mask' : seq_enc['attention_mask'].flatten(),
            'labels' : torch.tensor(seq_label, dtype=torch.long),
            'biochem_info': torch.tensor(seq_biochem, dtype=torch.float32),
            'zscore': torch.rand(150, 133)
        }
    

class AMP_BioChemLLDataLoader(DataLoader):
    """
        Es una estructura de datos iterable con mini-batches de datos
    
        dataframe   --  Un dataframe de Pandas con los datos, con columnas 'aa_seq' y 'AMP'
        batch_size  --  El tamaño de mini-batch con el que vas a entrenar el modelo   
    """
    def __init__(self, dataframe, biochem_cols, batch_size):
        DataLoader.__init__(
            self,
            AMP_BioChemLLDataset(dataframe, biochem_cols),
            batch_size = batch_size,
            num_workers = 2,
            shuffle = True
        )
        
def train_biochemLL_model(model, data_loader, loss_fn, optimizer, scheduler, verbose = False):
    """
        Entrena un modelo, y devuelve etiquetas reales, predicciones y el loss final
        
        model         -- El modelo a entrenar
        data_loader   -- un dataloader con los ejemplos de entrenamiento
        loss_fn       -- La funcion de loss (MSE, CrossEntropy, etc.)
        optimizer     -- El optimizador del modelo
        scheduler     -- El scheduler del learning rate del optimizador
        verbose       -- True para mostrar informacion del entrenamiento por consola
    """
    
    model = model.train() # Explicitly setting model to train state
    labels = []
    predictions = []
    losses = []
    correct_predictions = 0
    
    # Variables para calcular una media del loss (no afecta al entrenamiento)
    mobile_loss = 0
    MOBILE_COEF = 0.9
    
    i = 0
    for d in data_loader:
        # Medimos el tiempo
        i = i + 1
        start = process_time_ns()

        # Obtenemos los atributos del siguiente batch
        input_ids = d['input_ids'].to("cuda:0")
        attention_mask = d['attention_mask'].to("cuda:0")
        biochem_info = d['biochem_info'].to('cuda:0')
        targets = d['labels'].to("cuda:0")
        zscore = d['zscore'].to("cuda:0")
        
        # Lo usamos como input para el modelo y obtenemos el output
        outputs = model(zscore = zscore, input_ids = input_ids, attention_mask = attention_mask, biochem_info = biochem_info)

        # La predicción es la clase con mayor logit
        preds = torch.argmax(outputs.logits, dim = 1)
                
        # Guardamos la prediccion y la etiqueta real, para luego calcular metricas
        labels += targets.tolist()
        predictions += preds.tolist()
                
        # Calculamos el loss
        loss = compute_loss(loss_fn, outputs, targets)
        losses.append(loss.item())
        
        # Calculamos la media movil del loss
        mobile_loss = MOBILE_COEF*mobile_loss + (1-MOBILE_COEF)*loss.item()
        
        # Hacemos el backprop
        loss.backward()
        
        # Clip the gradients of the model to prevent exploding gradients using clip_grad_norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        # Medimos de nuevo
        end = process_time_ns()
        step_time = (end - start) // (10 ** 6)
        remaining_min = (step_time*(len(data_loader) - i) // (10 ** 3)) // 60
        remaining_sec = (step_time*(len(data_loader) - i) // (10 ** 3)) - remaining_min * 60

        # Imprimimos si es necesario
        if verbose:
            if i % 10 == 0:
                print(f"Step {i}/{len(data_loader)}: Loss (avg) {mobile_loss}, Step Time {step_time} ms, ETA {remaining_min}:{remaining_sec}")

    return labels, predictions, losses


def eval_biochemLL_model(model, data_loader, loss_fn, verbose = False):
    """
        Evalua un modelo con un conjunto de datos de test
        
        model         -- El modelo a entrenar
        data_loader   -- un dataloader con los ejemplos de entrenamiento
        loss_fn       -- La funcion de loss (MSE, CrossEntropy, etc.)
        verbose       -- True para mostrar informacion del entrenamiento por consola
    """
    model = model.eval()
    labels = []
    predictions = []
    
    # Variables para calcular una media del loss (no afecta al entrenamiento)
    mobile_loss = 0
    MOBILE_COEF = 0.9

    with torch.no_grad():
        i = 0
        for d in data_loader:
            # Medimos el tiempo
            i = i + 1
            start = process_time_ns()

            # Obtenemos los atributos del siguiente batch
            input_ids = d['input_ids'].to("cuda:0")
            attention_mask = d['attention_mask'].to("cuda:0")
            biochem_info = d['biochem_info'].to('cuda:0')
            targets = d['labels'].to("cuda:0")
        
            # Lo usamos como input para el modelo y obtenemos el output
            outputs = model(input_ids = input_ids, attention_mask = attention_mask, biochem_info = biochem_info)

            # La predicción es la clase con mayor logit
            preds = torch.argmax(outputs.logits, dim = 1)
                
            # Guardamos la prediccion y la etiqueta real, para luego calcular metricas
            labels += targets.tolist()
            predictions += preds.tolist()
            
            # Calculamos el loss
            loss = compute_loss(loss_fn, outputs, targets)
            
            # Calculamos la media movil del loss
            mobile_loss = MOBILE_COEF*mobile_loss + (1-MOBILE_COEF)*loss.item()
            
            # Medimos de nuevo
            end = process_time_ns()
            step_time = (end - start) // (10 ** 6)
            remaining_min = (step_time*(len(data_loader) - i) // (10 ** 3)) // 60
            remaining_sec = (step_time*(len(data_loader) - i) // (10 ** 3)) - remaining_min * 60
    
            # Imprimimos si es necesario
            if verbose:
                if i % 10 == 0:
                    print(f"Step {i}/{len(data_loader)}: Loss (avg) {mobile_loss}, Step Time {step_time} ms, ETA {remaining_min}:{remaining_sec}")

    return labels, predictions


In [3]:
from pipeline_tools import AMP_BioChemDataLoader, train_biochem_model, eval_biochem_model, compute_metrics
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from MultiGPUModels import MultiGPUBertForPeptideLLClassification
from torch.nn import CrossEntropyLoss

BATCH_SIZE = 8
LEARNING_RATE = 5e-6
WEIGHT_DECAY = 0.01
EPOCHS = 1
            
biochem_cols = [
    "molecular_mass",
    "hydrophobic_freq",
    "hydrophilic_freq",
    "basic_freq",
    "acid_freq",
    "charge",
    "aliphatic_index",
    "average_hydrophobicity",
    "isoelectric_point"
]

train_dataloader = AMP_BioChemLLDataLoader(train_df, biochem_cols, batch_size = BATCH_SIZE)            
test_dataloader = AMP_BioChemLLDataLoader(test_df, biochem_cols, batch_size = BATCH_SIZE)            

# Copiar el modelo para entrenarlo
bert_model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd')   
multi_gpu_bert = MultiGPUBertForPeptideLLClassification(bert_model, biochem_cols)
            
# Entrenar el modelo con esta configuracion
optimizer = AdamW(
    multi_gpu_bert.parameters(), 
    lr = LEARNING_RATE, 
    weight_decay = WEIGHT_DECAY)
            
total_steps = len(train_dataloader) * EPOCHS
            
scheduler = get_linear_schedule_with_warmup(optimizer, 
    num_warmup_steps = 0, 
    num_training_steps = total_steps)

for i in range(EPOCHS):
    labels, predicted, _ = train_biochemLL_model(multi_gpu_bert, train_dataloader, CrossEntropyLoss(), optimizer, scheduler, True)

# Obtener las métricas de entrenamiento
train_metrics = compute_metrics(labels, predicted)
            
print(f"Metrics for train set: ")
print(train_metrics)

# Obtener las métricas de validacion
test_labels, test_preds = eval_biochemLL_model(multi_gpu_bert, test_dataloader, CrossEntropyLoss(), True)
test_metrics = compute_metrics(test_labels, test_preds)
            
print(f"Metrics for test set: ")
print(test_metrics)

train_metrics.to_csv('./biochem_results/train_metrics_with_biochem_LL_1ep.csv')
test_metrics.to_csv('./biochem_results/test_metrics_with_biochem_LL_1ep.csv')

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

Step 10/445: Loss (avg) 0.45738496044937726, Step Time 705 ms, ETA 5:6
Step 20/445: Loss (avg) 0.6204272236780454, Step Time 715 ms, ETA 5:3
Step 30/445: Loss (avg) 0.6674338246557026, Step Time 714 ms, ETA 4:56
Step 40/445: Loss (avg) 0.6670646581167752, Step Time 717 ms, ETA 4:50
Step 50/445: Loss (avg) 0.6501221942685821, Step Time 721 ms, ETA 4:44
Step 60/445: Loss (avg) 0.6389096301645191, Step Time 722 ms, ETA 4:37
Step 70/445: Loss (avg) 0.5979992179602636, Step Time 720 ms, ETA 4:30
Step 80/445: Loss (avg) 0.5616963876072636, Step Time 721 ms, ETA 4:23
Step 90/445: Loss (avg) 0.568687351922433, Step Time 724 ms, ETA 4:17
Step 100/445: Loss (avg) 0.5231240741537585, Step Time 724 ms, ETA 4:9
Step 110/445: Loss (avg) 0.5165157187966093, Step Time 725 ms, ETA 4:2
Step 120/445: Loss (avg) 0.5109281807616192, Step Time 726 ms, ETA 3:55
Step 130/445: Loss (avg) 0.487256991140266, Step Time 728 ms, ETA 3:49
Step 140/445: Loss (avg) 0.4837104464045119, Step Time 726 ms, ETA 3:41
Step 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

TypeError: sum(): argument 'input' (position 1) must be Tensor, not NoneType

In [1]:
import torch
import torch.nn as nn

class CNNBiLSTM(nn.Module):
    def __init__(self, num_preds):
        r"""
            Copia un encoder, dividiendolo en dos partes, que van a sendas GPUs.
        """
        super().__init__()
        self.conv = torch.nn.Conv1d(
            in_channels = num_preds, 
            out_channels = 512,
            kernel_size = 100
        )
        self.lstm = torch.nn.LSTM(
            input_size = 512,
            hidden_size = 512,
            bidirectional = True
        )
        
        
    def forward(
        self,
        aa_preds: torch.Tensor,
    ) -> torch.Tensor:
    
        output = self.conv(aa_preds)
        output = output.transpose(0,1)
        output = self.lstm(output)
        
        return output

In [2]:
import torch

a = torch.tensor([1])

In [3]:
mymodel = CNNBiLSTM(133).to("cuda:1")

samples = [torch.rand(133,200) for i in range(20000)]

for i in range(20000):
    mymodel(samples[i].to("cuda:1"))

In [4]:
import torch
from transformers import BertForSequenceClassification
from MultiGPUModels import MultiGPUBertModel

class MultiGPUBertForPeptideAAClassification(torch.nn.Module):
    def __init__(self, bert_model_for_class, biochem_global_cols, biochem_aa_cols):
        super().__init__()
                
        self.num_labels = bert_model_for_class.num_labels

        self.bert = MultiGPUBertModel(bert_model_for_class.bert)
        self.bert_dropout = nn.Dropout(0.01).to("cuda:0")
        
        self.biochem_aa_cols = biochem_aa_cols
        self.cnn_rnn = nn.CNNBiLSTM(len(biochem_aa_cols)).to("cuda:1")
        self.cnn_rnn_dropout = nn.Dropout(0.01).to("cuda:1")

        self.biochem_global_cols = biochem_global_cols
        
        self.classifier = nn.Linear(
            self.config.hidden_size + len(self.biochem_global_cols) + len(self.biochem_aa_cols), 
            self.num_labels
        ).to("cuda:0")
        
    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            biochem_global_info: torch.Tensor = None,
            biochem_aa_info: torch.Tensor = None,
        ) -> SequenceClassifierOutput:
            r"""
                Se entrena por un lado el Encoder, con los input_ids, as attention_mask y todo eso
                
                Por otro lado, la BiLSTM se entrena con la informacion por aminoácido
                
                Finalmente se agregan los hidden_states de ambos componentes junto con los predictores
                globales para clasificar y obtener las probabilidades para cada clase
            """
    
            # Primera parte: embeddings del encoder
    
            # Obtengo los embeddings generados por el encoder 
            bert_outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
            )
            bert_pooled_output = outputs[1]
    
            # Aplico dropout sobre este embedding
            bert_dropout_output = self.dropout(bert_pooled_output)
            
            
            # Segunda parte: cell info del CNN_BiLSTM
        
            # Proceso los predictores por aminoácido
            cnn_rnn_output = self.cnn_rnn(biochem_aa_info)
            
            # Me quedo con el cell_state, que representa toda la secuencia
            cell_state = cnn_rnn_output[1][1]
            
            # De nuevo aplico dropout
            cnn_rnn_dropout_output = self.cnn_rnn_dropout(cell_state)
            
            
            # Concateno toda la información
            output_with_biochem = torch.cat([
                bert_dropout_output,
                cnn_rnn_dropout_output, 
                biochem_global_info
            ], dim = 1)
            
            # Clasifico 
            logits = self.classifier(output_with_biochem)
            
            return SequenceClassifierOutput(
                loss=None,
                logits=logits,
                hidden_states=pooled_output,
                attentions=outputs.attentions,
            )

NameError: name 'BertForSequenceClassification' is not defined

In [None]:
class AMP_BioChemDataset(Dataset):
    """
        Esta clase permite formar un Dataset legible para los modelos de PyTorch
        Implementa los métodos necesarios para entrenar un BERT
    """
    def __init__(self, df, biochem_cols, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        super(Dataset, AMP_BioChemDataset).__init__(self)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.df = df
        self.max_len = max_len
        
        self.seqs = list(df['aa_seq'])
        self.biochem = df[biochem_cols]
        self.labels = list(df['AMP'].astype(int))
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_enc = self.tokenizer(
            seq, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len,
            return_tensors = 'pt',
            return_attention_mask=True
        )
        seq_label = self.labels[idx]
        seq_biochem = self.biochem.iloc[idx]
        seq_biochem.loc['molecular_mass'] = seq_biochem['molecular_mass'] / 1e4
        seq_biochem.transpose()
        
        return {
            'idx': idx,
            'input_ids': seq_enc['input_ids'].flatten(),
            'attention_mask' : seq_enc['attention_mask'].flatten(),
            'labels' : torch.tensor(seq_label, dtype=torch.long),
            'biochem_info': torch.tensor(seq_biochem, dtype=torch.float32)
        }
    

class AMP_BioChemDataLoader(DataLoader):
    """
        Es una estructura de datos iterable con mini-batches de datos
    
        dataframe   --  Un dataframe de Pandas con los datos, con columnas 'aa_seq' y 'AMP'
        batch_size  --  El tamaño de mini-batch con el que vas a entrenar el modelo   
    """
    def __init__(self, dataframe, biochem_cols, batch_size):
        DataLoader.__init__(
            self,
            AMP_BioChemDataset(dataframe, biochem_cols),
            batch_size = batch_size,
            num_workers = 2,
            shuffle = True
        )
