In [1]:
import torch
import wandb
from datetime import datetime
import yaml
import os
import shutil
import numpy as np
from data.dataloader import load_data
from model.network import create_model, cri_opt_sch
from model.utils import train, validate, test
from sklearn.model_selection import train_test_split
from convert_encodings import m2

  from .autonotebook import tqdm as notebook_tqdm


### 2.1 Data Preparation

Create a `csv` file with the following format:
```csv
sequence,label
AAAAAAA,1
LLLLLLL,0
CCCCCCC,0
DDDDDDD,1
```
where `sequence` is the peptide sequence and `label` is the binary label (0 or 1). Save this file as `custom_data.csv` inside the `data` directory. Now, run the following cell (edit `task_name` as desired) to convert the `csv` file to the format required by PeptideBERT.

In [2]:
task_name = 'peptidebert_MNA_equipo1'

# Read `sequence,label` rows from the CSV. The first row is the header and is
# skipped; blank lines (e.g. a trailing empty line) are ignored instead of
# crashing the two-field unpack below.
seqs, labels = [], []
with open('./data/custom_data.csv', 'r', encoding = 'UTF-8') as f:
    next(f, None)  # discard the header row
    for line in f:
        line = line.strip()
        if not line:
            continue
        seq, label = line.split(',')
        seqs.append(seq)
        labels.append(int(label))

if not seqs:
    # max() on an empty list raises an unhelpful ValueError; say what is wrong.
    raise ValueError('./data/custom_data.csv contains no data rows')

# Every sequence is later right-padded to the longest sequence in the file.
MAX_LEN = max(map(len, seqs))

# Token -> integer id lookup; the id is the token's position in this list
# ([PAD] = 0, [UNK] = 1, ..., 'Z' = 30).
mapping = {
    token: token_id
    for token_id, token in enumerate([
        '[PAD]','[UNK]','[CLS]','[SEP]','[MASK]','L',
        'A','G','V','E','S','I','K','R','D','T','P','N',
        'Q','F','Y','M','H','C','W','X','O','B','U','J','Z'
    ])
}


In [3]:
# Encode each sequence as token ids, right-pad with 0 ([PAD]) up to MAX_LEN,
# and bucket the rows by label: label 1 -> positive, anything else -> negative.
pos_data, neg_data = [], []
for peptide, target in zip(seqs, labels):
    token_ids = [mapping[residue] for residue in peptide]
    token_ids += [0] * (MAX_LEN - len(token_ids))
    bucket = pos_data if target == 1 else neg_data
    bucket.append(token_ids)

pos_data = np.array(pos_data)
neg_data = np.array(neg_data)

# One archive per class, each holding a single array under the key `arr_0`.
np.savez(f'./data/{task_name}-positive.npz', arr_0=pos_data)
np.savez(f'./data/{task_name}-negative.npz', arr_0=neg_data)

### 2.2 Train-Val-Test Split
Now, we want to combine the positive and negative samples, shuffle them and split them into 3 non-overlapping sets - train, validation, and test.

To do so, edit the `main` function inside the `./data/split_augment.py` file (comment out the existing calls to `split_data` and add the line `split_data('REPLACE_WITH_TASK_NAME')`) and run the following cell. This will create sub-directories (inside the `data` directory) for the custom dataset and place the subsets (train, validation, test) inside it.

Additionally, if you want to augment the dataset, you can do so by editing the `./data/split_augment.py` file. You can call the `augment_data` function from the `main` function like so: `augment_data('REPLACE_WITH_TASK_NAME')`.

Further, to change/experiment with the augmentation techniques applied, you can edit the `augment_data` function. Comment/uncomment the call to any of the augmentation functions (such as `random_replace`, `random_delete`, etc.) as desired, and change the augmentation factor as desired. Do keep in mind that for each augmentation applied, you have to call the `combine` function. For example, if you want to apply the `random_swap` augmentation with a `factor` of 0.2, you can add `new_inputs, new_labels = random_swap(inputs, labels, 0.2)` followed by `inputs, labels = combine(inputs, labels, new_inputs, new_labels)` to merge the augmented dataset into the original dataset.

In [4]:
def split_data(task, seed=None, test_size=0.1, val_size=0.1):
    """Combine the positive/negative arrays of `task` and write train/val/test splits.

    Reads `./data/{task}-positive.npz` and `./data/{task}-negative.npz` (array
    key `arr_0`), labels positives 1.0 and negatives 0.0, shuffles and splits
    them, and writes `train.npz`, `val.npz` and `test.npz` (keys `inputs` and
    `labels`) into `./data/{task}/`.

    Args:
        task: Dataset name used to build the input and output paths.
        seed: Optional `random_state` forwarded to both `train_test_split`
            calls. The default `None` keeps the previous behaviour (a different
            split on every run); pass an int for a reproducible split.
        test_size: Fraction of all samples held out for the test set.
        val_size: Fraction of the remaining (non-test) samples held out for
            validation.
    """
    with np.load(f'./data/{task}-positive.npz') as pos,\
         np.load(f'./data/{task}-negative.npz') as neg:
        pos_data = pos['arr_0']
        neg_data = neg['arr_0']

    # Positives first, then negatives; the labels are built in the same order.
    input_ids = np.vstack((pos_data, neg_data))
    labels = np.hstack((
        np.ones(len(pos_data)),
        np.zeros(len(neg_data))
    ))

    # First carve off the test set, then split what is left into train/val.
    train_val_inputs, test_inputs, train_val_labels, test_labels = train_test_split(
        input_ids, labels, test_size=test_size, random_state=seed
    )
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        train_val_inputs, train_val_labels, test_size=val_size, random_state=seed
    )

    # exist_ok avoids the separate exists() check and makes re-runs safe.
    os.makedirs(f'./data/{task}', exist_ok=True)

    np.savez(
        f'./data/{task}/train.npz',
        inputs=train_inputs,
        labels=train_labels
    )

    np.savez(
        f'./data/{task}/val.npz',
        inputs=val_inputs,
        labels=val_labels
    )

    np.savez(
        f'./data/{task}/test.npz',
        inputs=test_inputs,
        labels=test_labels
    )

In [5]:
def combine(inputs, labels, new_inputs, new_labels):
    """Append augmented samples to an existing dataset.

    `new_inputs` is a sequence of rows (stacked vertically) and `new_labels` a
    sequence of labels (stacked horizontally); both are appended after the
    existing `inputs` / `labels`.

    Returns:
        The tuple `(inputs, labels)` with the new samples at the end.
    """
    merged_inputs = np.vstack((inputs, np.vstack(new_inputs)))
    merged_labels = np.hstack((labels, np.hstack(new_labels)))
    return merged_inputs, merged_labels


def random_replace(inputs, labels, factor):
    """Create augmented copies of `inputs` with random tokens substituted.

    For each row, `round(unpadded_len * factor)` distinct positions inside the
    unpadded part are overwritten with token ids drawn uniformly from 5..24.
    The unpadded length is the index of the first 0 in the row, or the full row
    length when the row contains no 0.

    The caller's `inputs` array is left untouched: each row is copied before it
    is modified. (Indexing a 2-D array with an integer returns a view, so
    editing it directly would overwrite the original samples.)

    Args:
        inputs: 2-D array of token ids, one row per sequence, 0-padded.
        labels: Labels aligned with the rows of `inputs`.
        factor: Fraction of each sequence's unpadded tokens to replace.

    Returns:
        `(new_inputs, new_labels)`: a list of augmented rows and a list of
        their (unchanged) labels.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        # Copy so the in-place assignment below cannot leak into `inputs`.
        ip = inputs[idx].copy()
        label = labels[idx]

        try:
            unpadded_len = np.where(ip == 0)[0][0]
        except IndexError:
            # No padding token found: the sequence fills the whole row.
            unpadded_len = len(ip)
        num_to_replace = round(unpadded_len * factor)
        indices = np.random.choice(unpadded_len, num_to_replace, replace=False)
        ip[indices] = np.random.choice(np.arange(5, 25), num_to_replace, replace=True)

        new_inputs.append(ip)
        new_labels.append(label)

    return new_inputs, new_labels


def random_delete(inputs, labels, factor):
    """Create augmented copies of `inputs` with random tokens removed.

    For each row, `round(unpadded_len * factor)` distinct positions inside the
    unpadded part are deleted and the row is re-padded with 0s. The unpadded
    length is the index of the first 0 in the row, or the full row length when
    the row contains no 0.

    Rows are padded back to the width of the input row (previously a fixed
    200), so the result can be stacked with `inputs` whatever the dataset's
    sequence length is.

    Args:
        inputs: 2-D array of token ids, one row per sequence, 0-padded.
        labels: Labels aligned with the rows of `inputs`.
        factor: Fraction of each sequence's unpadded tokens to delete.

    Returns:
        `(new_inputs, new_labels)`: a list of augmented rows and a list of
        their (unchanged) labels.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]
        row_len = len(ip)

        try:
            unpadded_len = np.where(ip == 0)[0][0]
        except IndexError:
            # No padding token found: the sequence fills the whole row.
            unpadded_len = row_len
        ip = list(ip[:unpadded_len])
        num_to_delete = round(unpadded_len * factor)
        indices = np.random.choice(unpadded_len, num_to_delete, replace=False)
        # Pop from the highest index down so earlier pops don't shift later ones.
        for i in sorted(indices, reverse=True):
            ip.pop(i)
        ip.extend([0] * (row_len - len(ip)))

        new_inputs.append(np.asarray(ip))
        new_labels.append(label)

    return new_inputs, new_labels


def random_replace_with_A(inputs, labels, factor):
    """Create augmented copies of `inputs` with random tokens set to `m2['A']`.

    For each row, `round(unpadded_len * factor)` distinct positions inside the
    unpadded part are overwritten with `m2['A']`. The unpadded length is the
    index of the first 0 in the row, or the full row length when the row
    contains no 0.

    The caller's `inputs` array is left untouched: each row is copied before it
    is modified. (Indexing a 2-D array with an integer returns a view, so
    editing it directly would overwrite the original samples.)

    Args:
        inputs: 2-D array of token ids, one row per sequence, 0-padded.
        labels: Labels aligned with the rows of `inputs`.
        factor: Fraction of each sequence's unpadded tokens to replace.

    Returns:
        `(new_inputs, new_labels)`: a list of augmented rows and a list of
        their (unchanged) labels.
    """
    # NOTE(review): `m2` comes from `convert_encodings`; presumably m2['A'] is
    # the token id of residue 'A' in the same vocabulary as `inputs` - confirm.
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        # Copy so the in-place assignment below cannot leak into `inputs`.
        ip = inputs[idx].copy()
        label = labels[idx]

        try:
            unpadded_len = np.where(ip == 0)[0][0]
        except IndexError:
            # No padding token found: the sequence fills the whole row.
            unpadded_len = len(ip)
        num_to_replace = round(unpadded_len * factor)
        indices = np.random.choice(unpadded_len, num_to_replace, replace=False)
        ip[indices] = m2['A']

        new_inputs.append(ip)
        new_labels.append(label)

    return new_inputs, new_labels


def random_swap(inputs, labels, factor):
    """Create augmented copies of `inputs` with random adjacent pairs swapped.

    The unpadded part of each row is viewed as the non-overlapping pairs
    (0,1), (2,3), ...; `round(unpadded_len * factor)` of those pairs are picked
    without replacement and their two tokens exchanged. The unpadded length is
    the index of the first 0 in the row, or the full row length when the row
    contains no 0.

    Rows are padded back to the width of the input row (previously a fixed
    200), so the result can be stacked with `inputs` whatever the dataset's
    sequence length is. The number of swaps is capped at the number of
    available pairs, so a `factor` above 0.5 swaps every pair instead of
    raising.

    Args:
        inputs: 2-D array of token ids, one row per sequence, 0-padded.
        labels: Labels aligned with the rows of `inputs`.
        factor: Number of swaps per sequence as a fraction of its unpadded
            length.

    Returns:
        `(new_inputs, new_labels)`: a list of augmented rows and a list of
        their (unchanged) labels.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]
        row_len = len(ip)

        try:
            unpadded_len = np.where(ip == 0)[0][0]
        except IndexError:
            # No padding token found: the sequence fills the whole row.
            unpadded_len = row_len
        ip = list(ip[:unpadded_len])
        # Each candidate i denotes the pair (i-1, i); odd i keeps pairs disjoint.
        candidates = range(1, unpadded_len, 2)
        num_to_swap = min(round(unpadded_len * factor), len(candidates))
        indices = np.random.choice(candidates, num_to_swap, replace=False)
        for i in indices:
            ip[i-1], ip[i] = ip[i], ip[i-1]
        ip.extend([0] * (row_len - len(ip)))

        new_inputs.append(np.asarray(ip))
        new_labels.append(label)

    return new_inputs, new_labels


def random_insertion_with_A(inputs, labels, factor):
    """Create augmented copies of `inputs` with `m2['A']` tokens inserted.

    For each row, `round(unpadded_len * factor)` distinct positions inside the
    unpadded part are drawn and `m2['A']` is inserted at each of them in turn.
    The unpadded length is the index of the first 0 in the row, or the full row
    length when the row contains no 0.

    The result is truncated or 0-padded to the width of the input row
    (previously a fixed 200), so it can be stacked with `inputs` whatever the
    dataset's sequence length is.

    Args:
        inputs: 2-D array of token ids, one row per sequence, 0-padded.
        labels: Labels aligned with the rows of `inputs`.
        factor: Number of insertions per sequence as a fraction of its unpadded
            length.

    Returns:
        `(new_inputs, new_labels)`: a list of augmented rows and a list of
        their (unchanged) labels.
    """
    # NOTE(review): `m2` comes from `convert_encodings`; presumably m2['A'] is
    # the token id of residue 'A' in the same vocabulary as `inputs` - confirm.
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]
        row_len = len(ip)

        try:
            unpadded_len = np.where(ip == 0)[0][0]
        except IndexError:
            # No padding token found: the sequence fills the whole row.
            unpadded_len = row_len
        ip = list(ip[:unpadded_len])
        num_to_insert = round(unpadded_len * factor)
        indices = np.random.choice(unpadded_len, num_to_insert, replace=False)
        for i in indices:
            ip.insert(i, m2['A'])
        # Insertions can overflow a full row: drop the overflow, then pad.
        ip = ip[:row_len]
        ip.extend([0] * (row_len - len(ip)))

        new_inputs.append(np.asarray(ip))
        new_labels.append(label)

    return new_inputs, new_labels

def random_masking(sequences, mask_prob=0.15, mask_token_id=0):
    """Return a copy of `sequences` with random positions set to `mask_token_id`.

    Every element (padding positions included) is masked independently with
    probability `mask_prob`. The input array is not modified. Only the masked
    array is returned; there is no labels output.

    NOTE(review): the default `mask_token_id=0` is the same id the rest of this
    notebook treats as padding - confirm that is intended.
    """
    # One uniform draw in [0, 1) per element decides whether it is masked.
    selected = np.random.rand(*sequences.shape) < mask_prob
    result = np.copy(sequences)
    result[selected] = mask_token_id
    return result

In [6]:
def augment_data(task):
    """Augment the training split of `task` and overwrite it on disk.

    Loads `./data/{task}/train.npz`, appends `random_swap` copies (factor 0.02)
    of every sample, and saves the enlarged arrays back to the same file.
    Because the file is overwritten, calling this again augments the
    already-augmented data.
    """
    with np.load(f'./data/{task}/train.npz') as train:
        inputs, labels = train['inputs'], train['labels']

    # Other available augmentations; to enable one, call it here and merge its
    # result with `combine`, exactly as done for random_swap below:
    #   random_replace(inputs, labels, 0.02)
    #   random_delete(inputs, labels, 0.02)
    #   random_replace_with_A(inputs, labels, 0.02)
    #   random_insertion_with_A(inputs, labels, 0.02)
    #   random_masking(inputs, mask_prob=0.15, mask_token_id=0)
    #       (returns the masked inputs only, so reuse `labels` when combining)
    swapped_inputs, swapped_labels = random_swap(inputs, labels, 0.02)
    inputs, labels = combine(inputs, labels, swapped_inputs, swapped_labels)

    np.savez(f'./data/{task}/train.npz', inputs=inputs, labels=labels)


In [7]:
# Build the train/val/test splits for each dataset, in this order.
for dataset in ('hemo', 'sol', 'nf'):
    split_data(dataset)
# augment_data('sol')


### 2.3 Model Config
Edit the `config.yaml` file and set the `task` parameter to `REPLACE_WITH_TASK_NAME`.

Additionally, if you want to tweak the model before training, you can do so by editing `./model/network.py` and `config.yaml` files. `./model/network.py` contains the actual architecture of the model as well as the optimizer and scheduler used to train the model. `config.yaml` contains all the hyperparameters used for training, as well as which dataset to train on.

### 2.4 Training
Now we are ready to train our model. Run the following cell to start the training procedure. This will save a checkpoint of the best model (on the validation set) inside the `checkpoints` directory.

In [None]:
#wandb.login() 

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}\n')

Device: cuda



In [10]:
def train_model():
    """Train for `config['epochs']` epochs, checkpointing on validation accuracy.

    Works entirely on notebook globals: `config`, `model`, `optimizer`,
    `criterion`, `scheduler`, `device`, `train_data_loader`, `val_data_loader`
    and, when `config['debug']` is falsy, `save_dir`.

    After every epoch the scheduler is stepped with the validation accuracy.
    Outside debug mode, whenever the validation accuracy is at least as good as
    the best seen so far (ties included), the model, optimizer and scheduler
    state are written to `{save_dir}/model.pt`, overwriting the previous file.
    """
    print(f'{"="*30}{"TRAINING":^20}{"="*30}')

    best_val_acc = 0
    for epoch in range(config['epochs']):
        train_loss = train(model, train_data_loader, optimizer, criterion, scheduler, device)
        # Learning rate of the first parameter group, read right after training.
        curr_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{config["epochs"]} - Train Loss: {train_loss}\tLR: {curr_lr}')

        val_loss, val_acc = validate(model, val_data_loader, criterion, device)
        print(f'Epoch {epoch+1}/{config["epochs"]} - Validation Loss: {val_loss}\tValidation Accuracy: {val_acc}\n')
        scheduler.step(val_acc)

        # Weights & Biases logging is currently disabled:
        #if not config['debug']:
        #    wandb.log({
        #        'train_loss': train_loss,
        #        'val_loss': val_loss,
        #        'val_accuracy': val_acc,
        #        'lr': curr_lr
        #    })

        if val_acc >= best_val_acc and not config['debug']:
            best_val_acc = val_acc
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
                'acc': val_acc,
                'lr': curr_lr
            }
            torch.save(checkpoint, f'{save_dir}/model.pt')
            print('Model Saved\n')
    #wandb.finish()

In [11]:
#device = 'cpu'
# Load the hyper-parameters. The file is opened in a `with` block so the handle
# is closed as soon as parsing finishes.
# NOTE: FullLoader is kept because config.yaml is a local, trusted file; use
# yaml.safe_load instead if the config can ever come from an untrusted source.
with open('./config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
config['device'] = device

train_data_loader, val_data_loader, test_data_loader = load_data(config)
# Record the number of training batches per epoch in the scheduler settings.
config['sch']['steps'] = len(train_data_loader)

model = create_model(config)
criterion, optimizer, scheduler = cri_opt_sch(config, model)

Batch size:  32
Train dataset samples:  14946
Validation dataset samples:  1661
Test dataset samples:  1846
Train dataset batches:  468
Validation dataset batches:  52
Test dataset batches:  58



  return torch.load(checkpoint_file, map_location="cpu")


In [12]:
# Outside debug mode, create a timestamped checkpoint directory for this run
# and snapshot the config and network definition next to the weights.
if not config['debug']:
    run_name = f'{config["task"]}-{datetime.now().strftime("%m%d_%H%M")}'
    #wandb.init(project='PeptideBERT', name=run_name)

    save_dir = f'./checkpoints/{run_name}'
    # exist_ok replaces the separate exists() check, so re-running the cell
    # within the same minute cannot fail on an already-created directory.
    os.makedirs(save_dir, exist_ok=True)
    shutil.copy('./config.yaml', f'{save_dir}/config.yaml')
    shutil.copy('./model/network.py', f'{save_dir}/network.py')

In [13]:
train_model()

# Outside debug mode, evaluate the best checkpoint rather than the weights left
# after the last epoch.
if not config['debug']:
    best_state = torch.load(f'{save_dir}/model.pt')['model_state_dict']
    model.load_state_dict(best_state, strict=False)
    del best_state

test_acc = test(model, test_data_loader, device)
print(f'Test Accuracy: {test_acc}%')



100%|██████████| 468/468 [1:36:32<00:00, 12.38s/it]


Epoch 1/3 - Train Loss: 0.6587386612708752	LR: 1e-05


100%|██████████| 52/52 [00:53<00:00,  1.02s/it]


Epoch 1/3 - Validation Loss: 0.6335006665724975	Validation Accuracy: 63.576158940397356

Model Saved



100%|██████████| 468/468 [1:33:26<00:00, 11.98s/it]


Epoch 2/3 - Train Loss: 0.6404607272428325	LR: 1e-05


100%|██████████| 52/52 [00:54<00:00,  1.05s/it]


Epoch 2/3 - Validation Loss: 0.6280845862168533	Validation Accuracy: 64.35881998795907

Model Saved



100%|██████████| 468/468 [1:32:27<00:00, 11.85s/it]


Epoch 3/3 - Train Loss: 0.6329817889719946	LR: 1e-05


100%|██████████| 52/52 [00:52<00:00,  1.01s/it]


Epoch 3/3 - Validation Loss: 0.6127537024708894	Validation Accuracy: 65.32209512341961

Model Saved



  model.load_state_dict(torch.load(f'{save_dir}/model.pt')['model_state_dict'], strict=False)
100%|██████████| 58/58 [01:00<00:00,  1.04s/it]

Test Accuracy: 63.92199349945828%



