# Double BERT Model

In [1]:
import torch
import torch.nn as nn
from typing import Optional
from transformers import BertForSequenceClassification, BertModel, BertConfig
from transformers.modeling_outputs import SequenceClassifierOutput
from MultiGPUModels import MultiGPUBertModel

class MultiGPUDoubleBertForPeptideClassification(torch.nn.Module):
    def __init__(self, bert_model_for_class, biochem_global_cols):
        super().__init__()
                
        self.num_labels = bert_model_for_class.num_labels

        self.raw_seq_bert = MultiGPUBertModel(bert_model_for_class.bert)
        self.raw_seq_dropout = nn.Dropout(0.01).to("cuda:0")
        
        self.aa_encoding_bert_config = BertConfig(
            vocab_size = 30522, 
            hidden_size = 133, 
            num_hidden_layers = 2,
            num_attention_heads = 19,
            intermediate_size = 1024,
            hidden_act = 'gelu',
            hidden_dropout_prob = 0.1,
            attention_probs_dropout_prob = 0.1,
            max_position_embeddings = 512,
            type_vocab_size = 2,
            initializer_range = 0.02,
            layer_norm_eps = 1e-12,
            pad_token_id = 0,
            position_embedding_type = 'absolute',
            use_cache = True,
            classifier_dropout = None 
        )
        
        self.aa_encoding_bert = BertModel(self.aa_encoding_bert_config).to("cuda:1")
        self.aa_encoding_dropout = nn.Dropout(0.01).to("cuda:1")

        self.biochem_global_cols = biochem_global_cols
        
        self.classifier = nn.Linear(
            bert_model_for_class.config.hidden_size + len(self.biochem_global_cols) + self.aa_encoding_bert_config.hidden_size, 
            self.num_labels
        ).to("cuda:0")
        
    def forward(
            self,
            dataloader_item = None,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            aa_encoding: Optional[torch.Tensor] = None,
            biochem_global_info: torch.Tensor = None,
        ) -> SequenceClassifierOutput:
            r"""
                Se entrena por un lado el Encoder, con los input_ids, as attention_mask y todo eso
                                
                Finalmente se agregan los hidden_states de ambos componentes junto con los predictores
                globales para clasificar y obtener las probabilidades para cada clase
            """
            
            if dataloader_item != None:
                input_ids = dataloader_item['input_ids'].to("cuda:0")
                attention_mask = dataloader_item['attention_mask'].to("cuda:0") 
                aa_encoding = dataloader_item['aa_encoding'].to("cuda:1") 
                biochem_global_info = dataloader_item['biochem_global_info'].to("cuda:0") 
    
            # Primera parte: embeddings del encoder para las secuencias en bruto
    
            # Obtengo los embeddings generados por el encoder 
            raw_seq_bert_outputs = self.raw_seq_bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            raw_seq_bert_pooled_output = raw_seq_bert_outputs[1]
    
            # Aplico dropout sobre este embedding
            raw_seq_bert_dropout_output = self.raw_seq_dropout(raw_seq_bert_pooled_output)
            
            
            # Segunda parte: embeddings del encoder para las codificaciones con aminoacidos
        
            # Obtengo los embeddings generados por el encoder 
            aa_encoding_bert_outputs = self.aa_encoding_bert(
                inputs_embeds=aa_encoding,
                attention_mask=attention_mask.to("cuda:1")
            )
            aa_encoding_bert_pooled_output = aa_encoding_bert_outputs[1]
    
            # Aplico dropout sobre este embedding
            aa_encoding_bert_dropout_output = self.aa_encoding_dropout(aa_encoding_bert_pooled_output).to("cuda:0")
            
            
            # Concateno toda la información
            output_with_biochem = torch.cat([
                raw_seq_bert_dropout_output,
                aa_encoding_bert_dropout_output, 
                biochem_global_info
            ], dim = 1)
            
            # Clasifico 
            logits = self.classifier(output_with_biochem)
            
            return SequenceClassifierOutput(
                loss=None,
                logits=logits,
                hidden_states=raw_seq_bert_pooled_output,
                attentions=raw_seq_bert_outputs.attentions,
            )

In [2]:
import pandas as pd
import numpy as np
from copy import deepcopy

class BioChemMapper():
    """
        This class encodes a peptide into a tensor.
        
        Each aminoacid becomes a numerical one-dimensional tensor.
    """
    def __init__(self, config_file = "./datasets/encoding_peptides_v03.csv"):
        self.encoder_df = pd.read_csv(config_file)
        self.encoder_df.set_index("aa_code", inplace=True)
        
    def map_aa_to_encoding(self, aa):
        if aa in self.encoder_df.index:
            return np.array(deepcopy(self.encoder_df.loc[aa]).to_list())
        else:
            return np.zeros(133)
        
    def encode_peptide(self, seq: str):
        """
            Returns a numpy bidimensional array
        """
        aa_list = seq.upper().split()
        aa_encoding = map(lambda x: self.map_aa_to_encoding(x), aa_list)
        return np.array(list(aa_encoding))

    def __call__(self, seq: str):
        return self.encode_peptide(seq)
    
    def get_col_names(self):
        return self.encoder_df.columns.to_list()

In [3]:
seq = "A B C D E"
print(BioChemEncoder()(seq).shape[0])

5


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class AMP_BioChemDataset(Dataset):
    """
        Esta clase permite formar un Dataset legible para los modelos de PyTorch
        Implementa los métodos necesarios para entrenar un BERT
    """
    def __init__(self, df, biochem_cols, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        super(Dataset, AMP_BioChemDataset).__init__(self)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.df = df
        self.max_len = max_len
        self.seqs = list(df['aa_seq'])
        self.biochem_cols = biochem_cols
        if "molecular_mass" in self.biochem_cols:
            self.df.loc[:,'molecular_mass'] = self.df.loc[:,'molecular_mass'] / 1e4

        self.labels = list(df['AMP'].astype(int))
        self.aa_encoder = BioChemEncoder()
        

        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_enc = self.tokenizer(
            seq, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len,
            return_tensors = 'pt',
            return_attention_mask=True
        )
        seq_label = self.labels[idx]
        seq_biochem = self.df.iloc[idx].loc[biochem_cols]
        seq_biochem.transpose()
        
        encoded_seq = self.aa_encoder(seq)
        h = encoded_seq.shape[0]

        if h < self.max_len:
            padding = np.array([np.zeros(133).reshape(-1) for i in range(h,self.max_len)])
            aa_encoding = np.vstack((encoded_seq, padding))
        else:
            aa_encoding = encoded_seq[:self.max_len]
        
        return {
            'idx': idx,
            'input_ids': seq_enc['input_ids'].flatten(),
            'attention_mask' : seq_enc['attention_mask'].flatten(),
            'labels' : torch.tensor(seq_label, dtype=torch.long),
            'biochem_global_info': torch.tensor(seq_biochem, dtype=torch.float32),
            'aa_encoding': torch.tensor(aa_encoding, dtype=torch.float32)
        }
    

class AMP_BioChemDataLoader(DataLoader):
    """
        Es una estructura de datos iterable con mini-batches de datos
    
        dataframe   --  Un dataframe de Pandas con los datos, con columnas 'aa_seq' y 'AMP'
        batch_size  --  El tamaño de mini-batch con el que vas a entrenar el modelo   
    """
    def __init__(self, dataframe, biochem_cols, batch_size):
        DataLoader.__init__(
            self,
            AMP_BioChemDataset(dataframe, biochem_cols),
            batch_size = batch_size,
            num_workers = 2,
            shuffle = True
        )


In [5]:
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from time import process_time_ns 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from pandas import DataFrame
from itertools import product
from sklearn.model_selection import StratifiedKFold
from copy import deepcopy
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup, get_inverse_sqrt_schedule
import pandas as pd
import torch

def grid_search_early_stop(model, train_data_loader, val_data_loader, grid, batch_size, loss_fn = torch.nn.CrossEntropyLoss(), verbose = False):
        
    param_combinations = product(
        grid["learning_rate"],
        grid["weight_decay"],
    )
    
    all_combs = []
    all_metrics = []
    all_losses = []
        
    # Calculamos todas las combinaciones con el grid de hiperparametros
    num_combinations = 1
    for key in grid.keys():
        num_combinations *= len(grid[key])
        
    print()
    print(f"Number of combinations: {num_combinations}")

    for combination in param_combinations:
        
        # En cada combinacion entrenamos y testeamos
        learning_rate, weight_decay = combination

        print()
        print("Next combination:")
        print(f"learning_rate: {learning_rate}")
        print(f"weight_decay: {weight_decay}")
        
        # Copiamos el modelo
        model_copy = deepcopy(model)
        
        # Preparamos el optimizador y el scheduler
        optimizer = AdamW(
            model_copy.parameters(), 
            lr = learning_rate,
            weight_decay = weight_decay,
        )
        
        #scheduler = get_inverse_sqrt_schedule(
        #    optimizer,
        #    num_warmup_steps = 10
        #)
        
        scheduler = StepLR(optimizer, step_size=10, gamma=0.9)
        
        # Entrenamos hasta la mejor época
        stop_training = False
        epochs = 0
        prev_f1 = 0.0
        
        train_start = process_time_ns()
        comb_metrics = []
        comb_losses = []
        
        while not stop_training:
            # Entrenamos una vez mas
            epochs = epochs + 1
            _, _, losses = train_model(model_copy, train_data_loader, loss_fn, optimizer, scheduler, verbose)
            
            # Medimos
            eval_start = process_time_ns()
            labels, predictions = eval_model(model_copy, val_data_loader, loss_fn, verbose)
            eval_end = process_time_ns()
            metrics = compute_metrics(labels, predictions)

            print()
            print(metrics)
            print()
            
            # Guardamos la informacion
            comb_metrics.append(metrics.to_dict())
            comb_losses.append(losses)
            
            # Comprobamos si paramos ya
            stop_training = (metrics["f1"].item() - prev_f1 < 0.01)
            prev_f1 = metrics["f1"].item()

        train_end = process_time_ns()
        
        df_comb_metrics = pd.DataFrame(comb_metrics)
        df_comb_metrics.to_csv(f"./double_bert_results/metrics_grid-lr_{learning_rate}-wd_{weight_decay}.csv")
        
        df_comb_losses = pd.DataFrame(comb_losses)
        df_comb_losses.to_csv(f"./double_bert_results/losses_grid-lr_{learning_rate}-wd_{weight_decay}.csv")
        
        metrics["train_time_secs"] = (train_end - train_start) // (10 ** 9)
        metrics["eval_time_secs"] = (eval_end - eval_start) // (10 ** 9)
        
        # Guardamos las medidas
        all_combs.append(combination)
        all_metrics.append(metrics.to_dict())
        all_losses.append(losses)
                    
        del model_copy
        
    df_combs = pd.DataFrame(all_combs, index = range(num_combinations), columns=['epochs', 'batch_size', 'learning_rate', 'betas', 'epsilon', 'weight_decay', 'warmup_steps'])
    df_metrics = pd.DataFrame(all_metrics)
    df_metrics.index = range(num_combinations)
    df_results = pd.concat([df_combs, df_metrics], axis=1)
    
    df_losses = pd.DataFrame(all_losses, index = range(num_combinations))
    
    return df_results, df_losses

In [6]:
df = pd.read_csv("./datasets/database_all_propiedades.csv").sample(frac=1, random_state=0)
df = df.rename(columns = {
    "Sequence": "aa_seq"
})
df = df.drop(df[df["Activity"] == "Unknown"].index)

In [7]:

train_df, test_df, _, _ = train_test_split(
    df, 
    df["AMP"], 
    test_size=0.2,
    random_state=0,
    stratify=df["AMP"]
)

train_df, val_df, _, _ = train_test_split(
    train_df, 
    train_df["AMP"], 
    test_size=0.2,
    random_state=0,
    stratify=train_df["AMP"]
)

biochem_cols = [
    "molecular_mass",
    "hydrophobic_freq",
    "hydrophilic_freq",
    "basic_freq",
    "acid_freq",
    "charge",
    "aliphatic_index",
    "average_hydrophobicity",
    "isoelectric_point"
]

BATCH_SIZE = 8

train_dataloader = AMP_BioChemDataLoader(train_df, biochem_cols, batch_size = BATCH_SIZE)
val_dataloader = AMP_BioChemDataLoader(val_df, biochem_cols, batch_size = BATCH_SIZE)
test_dataloader = AMP_BioChemDataLoader(test_df, biochem_cols, batch_size = BATCH_SIZE)            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
from pipeline_tools import train_model, eval_model, compute_metrics
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import pandas as pd

grid = {
    "learning_rate": [5e-5,3e-5,1e-5,1e-6],
    "weight_decay": [0.01, 0.05, 0.1, 0.2]
}

# Copiar el modelo para entrenarlo
bert_model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd')   
multi_gpu_bert = MultiGPUDoubleBertForPeptideClassification(bert_model, biochem_cols)
            
df_results, df_losses = grid_search_early_stop(multi_gpu_bert, train_dataloader, val_dataloader, grid, BATCH_SIZE, verbose = True)

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init


Number of combinations: 16

Next combination:
learning_rate: 5e-05
weight_decay: 0.01
Step 10/1162: Loss (avg) 0.4294431668213809, Step Time 750 ms, ETA 14:24
Step 20/1162: Loss (avg) 0.6112227843352094, Step Time 748 ms, ETA 14:14
Step 30/1162: Loss (avg) 0.6533616494034847, Step Time 749 ms, ETA 14:7
Step 40/1162: Loss (avg) 0.6739434751626947, Step Time 749 ms, ETA 14:0
Step 50/1162: Loss (avg) 0.6703078347116633, Step Time 752 ms, ETA 13:56
Step 60/1162: Loss (avg) 0.6876038138806096, Step Time 753 ms, ETA 13:49
Step 70/1162: Loss (avg) 0.6526193966022601, Step Time 753 ms, ETA 13:42
Step 80/1162: Loss (avg) 0.6243786572545764, Step Time 753 ms, ETA 13:34
Step 90/1162: Loss (avg) 0.6791500430243648, Step Time 753 ms, ETA 13:27
Step 100/1162: Loss (avg) 0.6450638878891093, Step Time 755 ms, ETA 13:21
Step 110/1162: Loss (avg) 0.6290759544663682, Step Time 754 ms, ETA 13:13
Step 120/1162: Loss (avg) 0.6495091186108607, Step Time 753 ms, ETA 13:4
Step 130/1162: Loss (avg) 0.641011355

ValueError: 7 columns passed, passed data had 2 columns