In [4]:
import pandas as pd

# Seed shared by every shuffle/sample below so the splits are reproducible.
SEED = 0
# Fraction of each pool (positives / negatives) held out for validation;
# the rows that are not sampled form the test split.
FRAC_VAL = 0.2

# Training data: first CSV column is used as the index, rows are shuffled.
train_df = pd.read_csv('./datasets/all_veltri_2.csv', index_col = 0).sample(frac=1, random_state=SEED)
# Positive and negative pools from which validation and test are carved.
df_pos = pd.read_csv('./datasets/veltri_dramp_cdhit_90_2.csv', index_col = 0)
df_neg = pd.read_csv('./datasets/non_amp_ampep_cdhit90_2.csv', index_col = 0)

# Sample each class separately so both splits keep the pools' class balance.
val_df_pos = df_pos.sample(frac=FRAC_VAL, random_state=SEED)
val_df_neg = df_neg.sample(frac=FRAC_VAL, random_state=SEED)
# Everything not sampled for validation goes to test (dropped by index label).
# NOTE(review): this assumes index labels are unique within each CSV — confirm.
test_df_pos = df_pos.drop(val_df_pos.index)
test_df_neg = df_neg.drop(val_df_neg.index)

# Merge the two classes and shuffle so batches are not ordered by label.
val_df = pd.concat([val_df_pos, val_df_neg]).sample(frac=1, random_state=SEED)
test_df = pd.concat([test_df_pos, test_df_neg]).sample(frac=1, random_state=SEED)

# Report split sizes. The `:.2%` spec multiplies the positive fraction by 100 and
# appends the percent sign; previously the raw fraction was printed before a
# literal '%', which read as ~0.52% instead of ~52%.
print(f"Validation dataframe: {len(val_df_pos)} positives, {len(val_df_neg)} negatives ({len(val_df_pos)/len(val_df):.2%})")
print(f"Test dataframe: {len(test_df_pos)} positives, {len(test_df_neg)} negatives ({len(test_df_pos)/len(test_df):.2%})")

Validation dataframe: 413 positives, 382 negatives (0.519496855345912%)
Test dataframe: 1652 positives, 1526 negatives (0.5198237885462555%)


In [5]:
from pipeline_tools import train_model, eval_model, compute_metrics
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

# --- Hyper-parameters for this run -------------------------------------------
BATCH_SIZE = 8
LEARNING_RATE = 5e-6
WEIGHT_DECAY = 0.01
EPOCHS = 1

# Dataframe columns handed to the data loaders and to the model wrapper as the
# extra (non-sequence) features.
biochem_cols = [
    "molecular_mass", "hydrophobic_freq", "hydrophilic_freq",
    "basic_freq", "acid_freq", "charge",
    "aliphatic_index", "average_hydrophobicity", "isoelectric_point",
]

# NOTE(review): AMP_BioChemDataLoader and MultiGPUDoubleBertForPeptideClassification
# are not imported in this cell; presumably they are defined in an earlier cell —
# confirm the notebook survives Restart & Run All.
train_dataloader = AMP_BioChemDataLoader(train_df, biochem_cols, batch_size=BATCH_SIZE)
test_dataloader = AMP_BioChemDataLoader(test_df, biochem_cols, batch_size=BATCH_SIZE)

# Load the pretrained checkpoint and hand it to the project wrapper that will
# actually be trained.
bert_model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd')
multi_gpu_bert = MultiGPUDoubleBertForPeptideClassification(bert_model, biochem_cols)

# Optimizer plus a linear learning-rate decay (no warm-up) spanning every
# optimisation step of the run.
optimizer = AdamW(multi_gpu_bert.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# A single loss criterion is shared by training and evaluation.
loss_fn = CrossEntropyLoss()

# Train. Only the labels/predictions returned by the last epoch are kept.
# NOTE(review): the trailing positional True is defined by pipeline_tools —
# presumably a verbosity flag; confirm.
for epoch in range(EPOCHS):
    labels, predicted, _ = train_model(
        multi_gpu_bert, train_dataloader, loss_fn, optimizer, scheduler, True
    )

# Metrics on the training set.
train_metrics = compute_metrics(labels, predicted)
print("Metrics for train set: ")
print(train_metrics)

# Metrics on the held-out test set (val_df is not used in this cell).
test_labels, test_preds = eval_model(multi_gpu_bert, test_dataloader, loss_fn, True)
test_metrics = compute_metrics(test_labels, test_preds)
print("Metrics for test set: ")
print(test_metrics)

# Persist both metric tables.
train_metrics.to_csv('./biochem_results/train_metrics_with_biochem_LL_1ep.csv')
test_metrics.to_csv('./biochem_results/test_metrics_with_biochem_LL_1ep.csv')

{'idx': tensor([ 597,  876, 2267, 1412, 1495, 1919, 1514, 1635]), 'input_ids': tensor([[ 2, 10, 15,  ...,  0,  0,  0],
        [ 2, 17, 19,  ...,  0,  0,  0],
        [ 2, 18, 13,  ...,  0,  0,  0],
        ...,
        [ 2,  5, 10,  ...,  0,  0,  0],
        [ 2, 21, 13,  ...,  0,  0,  0],
        [ 2, 21,  6,  ...,  0,  0,  0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 0, 1, 0, 1, 0, 0]), 'biochem_global_info': tensor([[ 0.6875,  0.4100,  0.5900,  0.1500,  0.0900,  3.0000,  0.2500,  1.1900,
          8.2527],
        [ 0.1734,  0.4600,  0.5400,  0.0800,  0.2300, -2.0000,  0.0800, -0.6700,
          4.0500],
        [ 0.3977,  0.3300,  0.6700,  0.2000,  0.1700,  1.0000,  0.1800,  0.6500,
          6.0221],
        [ 0.7410,  0.2900,  0.7100,  0.1200,  0.1200,  0.0000,

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

Step 10/445: Loss (avg) 0.45909324256595474, Step Time 742 ms, ETA 5:22
Step 20/445: Loss (avg) 0.6141726576076305, Step Time 741 ms, ETA 5:14
Step 30/445: Loss (avg) 0.6720002518053636, Step Time 745 ms, ETA 5:9
Step 40/445: Loss (avg) 0.683722405948407, Step Time 744 ms, ETA 5:1
Step 50/445: Loss (avg) 0.665407795332585, Step Time 748 ms, ETA 4:55
Step 60/445: Loss (avg) 0.6280311669210209, Step Time 750 ms, ETA 4:48
Step 70/445: Loss (avg) 0.5897950519040858, Step Time 749 ms, ETA 4:40
Step 80/445: Loss (avg) 0.569161513595918, Step Time 753 ms, ETA 4:34
Step 90/445: Loss (avg) 0.5177453564756024, Step Time 751 ms, ETA 4:26
Step 100/445: Loss (avg) 0.5384012018354685, Step Time 750 ms, ETA 4:18
Step 110/445: Loss (avg) 0.5182216736591545, Step Time 750 ms, ETA 4:11
Step 120/445: Loss (avg) 0.45306408156974337, Step Time 752 ms, ETA 4:4
Step 130/445: Loss (avg) 0.4502880856582586, Step Time 755 ms, ETA 3:57
Step 140/445: Loss (avg) 0.4452466773800974, Step Time 754 ms, ETA 3:49
Step 