# MODEL TEST

In [4]:
## testing
# import torch
# from transformers import T5EncoderModel, T5Tokenizer
# DEVICE = 'cuda:0'
# MODEL_NAME = 'Rostlab/prot_t5_xl_half_uniref50-enc'
# model = T5EncoderModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(DEVICE)
# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

In [5]:
# DEVICE = 'cuda:0'
# i = model.t5model.dummy_inputs
# model.t5model.to(DEVICE)
# i.pop('decoder_input_ids')
# i.pop('decoder_attention_mask')
# for k in i:
#     i[k] = i[k].to(DEVICE)
# o = model.t5model(**i, output_hidden_states=True)
# logits = model.linear(o.last_hidden_state.mean(1).view(-1, model.t5model.config.d_model))
# logits.shape
# loss_fct = nn.CrossEntropyLoss()
# labels = torch.tensor([[0],[0],[1]]).to(DEVICE)
# loss = loss_fct(logits.view(-1, model.num_labels), labels.view(-1))
# loss

# Define model & tokenizer

In [6]:
import os
import torch
import transformers
from torch import nn
from transformers import T5EncoderModel, T5Tokenizer
from transformers.modeling_outputs import SequenceClassifierOutput

torch.set_default_dtype(torch.half)
torch.set_default_tensor_type(torch.HalfTensor)
transformers.logging.set_verbosity_error()

# Select visible gpus
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

MODEL_NAME = 'Rostlab/prot_t5_xl_half_uniref50-enc'
DEVICE = 'cuda:0'
OUTPUT_DIR = f'{MODEL_NAME.split("/")[1]}_finetune'
TRAIN_EPOCHS = 10
RESUME_FROM_CHECKPOINT=False

class T5EncoderForBinaryClassification(nn.Module):
    def __init__(self, t5model_conifg:str, **kwargs): 
        super(T5EncoderForBinaryClassification,self).__init__() 
        self.num_labels = 2
        self.t5model = T5EncoderModel.from_pretrained(t5model_conifg, **kwargs)
        # for param in self.t5model.parameters():
        #     param.requires_grad = False
        self.dropout = nn.Dropout(0.5) 
        self.linear = nn.Linear(self.t5model.config.d_model, 2)
        self.config = self.t5model.config

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.t5model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = self.dropout(outputs.last_hidden_state)

        logits = self.linear(last_hidden_state.mean(1).view(-1, self.t5model.config.d_model))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.last_hidden_state,
            attentions=outputs.attentions
        )

model = T5EncoderForBinaryClassification(MODEL_NAME, torch_dtype=torch.float16).to(DEVICE)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
tokenizer.add_tokens(['<epitope_left>', '<epitope_right>']) # Add tokens

model.t5model.resize_token_embeddings(len(tokenizer))

print('output_dir:', OUTPUT_DIR)

output_dir: prot_t5_xl_half_uniref50-enc_finetune


# Define dataset

In [7]:
TRAIN_SET_PATH = 'T5_dataset/enc/train'
VALID_SET_PATH = 'T5_dataset/enc/valid'
TEST_SET_PATH = 'T5_dataset/enc/test'
import datasets
from datasets import load_dataset
datasets.logging.set_verbosity_error()

SEED=42

if not (os.path.exists(TRAIN_SET_PATH) or os.path.exists(VALID_SET_PATH) or os.path.exists(TEST_SET_PATH)):

    raw_train_dataset = load_dataset(
        'csv', data_files='train.csv'
    )['train'].shuffle(seed=SEED).train_test_split(test_size=0.1)

    raw_test_dataset = load_dataset(
        'csv', data_files={'test': 'test.csv'},
        split='test',
    )

    print(raw_train_dataset)
    print(raw_test_dataset)
    
else: 
    print('Preprocessed dataset file already exists.')

Preprocessed dataset file already exists.


# Dataset Preprocessing

In [8]:
import datasets
from torch import tensor
datasets.logging.disable_progress_bar()

WINDOW = 50

def dataset_preproc(dataset: datasets.Dataset, num_proc=4):
    if 'label' in dataset.column_names:
        dataset = dataset.map(
            lambda x: {'labels': torch.tensor([x['label']], dtype=torch.long)},
            num_proc=num_proc,
        )
    dataset = dataset.map(
        lambda x: tokenizer(
                ' '.join(x['antigen_seq'][max(0, int(x['start_position']) - WINDOW - 1): int(x['start_position']) - 1]) +
                '<epitope_left>'+
                ' '.join(x['epitope_seq']) +
                '<epitope_right>' +
                ' '.join(x['antigen_seq'][int(x['end_position']): min(len(x['antigen_seq']), int(x['end_position']) + WINDOW)])
            ),
        num_proc=num_proc,
    ).map(lambda x: {'input_len': len(x['input_ids'])}).sort('input_len', reverse=True)
    dataset = dataset.remove_columns(list(set(dataset.column_names) - set(['input_ids', 'labels'])))
    return dataset

train_dataset, valid_dataset, test_dataset = None, None, None

if not (os.path.exists(TRAIN_SET_PATH) or os.path.exists(VALID_SET_PATH)):
    train_dataset = dataset_preproc(raw_train_dataset['train'], num_proc=16)
    train_dataset.save_to_disk(TRAIN_SET_PATH)
    valid_dataset = dataset_preproc(raw_train_dataset['test'], num_proc=16)
    valid_dataset.save_to_disk(VALID_SET_PATH)
else:
    train_dataset = datasets.load_from_disk(TRAIN_SET_PATH)
    valid_dataset = datasets.load_from_disk(VALID_SET_PATH)

if not os.path.exists(TEST_SET_PATH):
    test_dataset = dataset_preproc(raw_test_dataset, num_proc=16)
    test_dataset.save_to_disk(TEST_SET_PATH)
else:
    test_dataset = datasets.load_from_disk(TEST_SET_PATH)

print('train:', train_dataset)
print('valid:', valid_dataset)
print('test:', test_dataset)

train: Dataset({
    features: ['labels', 'input_ids'],
    num_rows: 171729
})
valid: Dataset({
    features: ['labels', 'input_ids'],
    num_rows: 19082
})
test: Dataset({
    features: ['input_ids'],
    num_rows: 120944
})


# Define trainer & Train

In [9]:
import os
import torch
import gc

import transformers
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score

transformers.logging.set_verbosity_info()

## Wipe memory
gc.collect()
torch.cuda.empty_cache()

F1_SCORE = 'f1-score'

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=1e-5,
    adam_epsilon=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # eval_accumulation_steps=20,
    dataloader_num_workers=32,
    evaluation_strategy='epoch',
    report_to='none',
    logging_strategy='epoch',
    log_level='warning',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_private_repo=True,
    hub_model_id=f'AutoML/{OUTPUT_DIR}',
    auto_find_batch_size=True,
)

data_collator = DataCollatorWithPadding(
    tokenizer,
)

print('output_dir:', OUTPUT_DIR)

def f1_score_metrics(eval_pred):
    # print(eval_pred.predictions)
    y_pred = eval_pred.predictions[0].argmax(-1)
    y_true = eval_pred.label_ids
    score = f1_score(y_true, y_pred, average='macro')
    return {F1_SCORE: score}

trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=f1_score_metrics,
)

train_output = trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)
trainer.push_to_hub()

PyTorch: setting up devices


output_dir: prot_t5_xl_half_uniref50-enc_finetune


/userHome/hanyong/DACON-2022-AI-challenge/prot_t5_xl_half_uniref50-enc_finetune is already a clone of https://huggingface.co/AutoML/prot_t5_xl_half_uniref50-enc_finetune. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,F1-score
1,0.1454,0.319336,0.477263
2,0.1838,0.436035,0.477263
3,0.1908,0.44751,0.478342
4,0.1908,0.442627,0.478327
5,0.1908,0.433838,0.478875
6,0.1908,0.408691,0.478829
7,0.1854,0.41748,0.478783
8,0.185,0.425049,0.47886
9,0.1891,0.415771,0.478829
10,0.1849,0.415527,0.478844


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/2.25G [00:00<?, ?B/s]

To https://huggingface.co/AutoML/prot_t5_xl_half_uniref50-enc_finetune
   f1a3ecd..d36a659  main -> main



AttributeError: 'T5EncoderForBinaryClassification' object has no attribute 'config'