### This notebook is a copy of the other one but used for finetuning the ProstT5 model. 

In [None]:
! pip install transformers[torch] evaluate datasets requests pandas scikit-learn peft bitsandbytes matplotlib sentencepiece accelerate 
#deepspeed

In [None]:
# !apt install git-lfs
# pip install wandb
# wandb login

# Fine-Tuning Protein Language Models

In [1]:
model_checkpoint = "Rostlab/ProstT5"

## Data preparation

In [4]:
import os

# Change to the desired directory
# os.chdir("/root")
# Verify the change
print(os.listdir("./"))

['.git', 'README.md', 'cath_domain_list.list', 'domain_list_nonredundant_s40_v4_3.list', 'eda.ipynb', 'foldseek_seq3d.csv', 'queryDB_ss.fasta', 'sequences_nonredundant_s40_v4_3.fa', 'train.csv', 'val.csv', 'protein_language_modeling_prostT5.ipynb', '.ipynb_checkpoints']


## Tokenizing the data

In [2]:
import pandas as pd

train_df = pd.read_csv("./train.csv")
val_df = pd.read_csv("./val.csv")

In [3]:
train_sequences = train_df['sequences'].tolist()
train_sequences_3d = train_df['seq3d'].tolist()
train_labels = train_df['label'].tolist()

test_sequences = val_df['sequences'].tolist()
test_sequences_3d = val_df['seq3d'].tolist()
test_labels = val_df['label'].tolist()


In [4]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(model_checkpoint, do_lower_case=False)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import T5Tokenizer
from datasets import Dataset
import re


def preprocess_data(sequences, structures, tokenizer):
    tokenized_sequences = []
    tokenized_structures = []

    for sequence, structure in zip(sequences, structures):
        # Preprocess sequences
        sequence = " ".join(list(re.sub(r"[UZOB]", "X", sequence)))
        structure = " ".join(list(structure))

        sequence = "<AA2fold> " + sequence if sequence.isupper() else sequence
        structure = "<fold2AA> " + structure

        # Tokenize sequences and structures
        sequence_inputs = tokenizer(
            sequence, add_special_tokens=True, padding="longest", return_tensors="pt"
        )
        structure_inputs = tokenizer(
            structure, add_special_tokens=True, padding="longest", return_tensors="pt"
        )

        tokenized_sequences.append(sequence_inputs)
        tokenized_structures.append(structure_inputs)

    return tokenized_sequences, tokenized_structures


# Preprocess and tokenize the data
train_tokenized_sequences, train_tokenized_structures = preprocess_data(
    train_sequences, train_sequences_3d, tokenizer
)
test_tokenized_sequences, test_tokenized_structures = preprocess_data(
    test_sequences, test_sequences_3d, tokenizer
)


import datasets


def create_dataset(tokenized_sequences, tokenized_structures, labels):
    input_ids_sequence = [item["input_ids"].squeeze() for item in tokenized_sequences]
    attention_mask_sequence = [
        item["attention_mask"].squeeze() for item in tokenized_sequences
    ]
    input_ids_structure = [item["input_ids"].squeeze() for item in tokenized_structures]
    attention_mask_structure = [
        item["attention_mask"].squeeze() for item in tokenized_structures
    ]

    dataset_dict = {
        "input_ids_sequence": input_ids_sequence,
        "attention_mask_sequence": attention_mask_sequence,
        "input_ids_structure": input_ids_structure,
        "attention_mask_structure": attention_mask_structure,
        "labels": labels,
    }

    return Dataset.from_dict(dataset_dict)


# Create Dataset objects
train_dataset = create_dataset(
    train_tokenized_sequences, train_tokenized_structures, train_labels
)
test_dataset = create_dataset(
    test_tokenized_sequences, test_tokenized_structures, test_labels
)

from torch.utils.data import DataLoader
import torch


class ProteinDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "input_ids_sequence": torch.tensor(item["input_ids_sequence"]),
            "attention_mask_sequence": torch.tensor(item["attention_mask_sequence"]),
            "input_ids_structure": torch.tensor(item["input_ids_structure"]),
            "attention_mask_structure": torch.tensor(item["attention_mask_structure"]),
            "labels": torch.tensor(item["labels"]),
        }


# Create custom dataset
train_dataset = ProteinDataset(train_dataset)
test_dataset = ProteinDataset(test_dataset)

## Model loading

I create a customized new model class that combines the prostT5 model outputs and a classification layer, in such a way that we can optionally choose in the finetuning to use either the sequence embeddings, the structure embeddings or an average/concat of both to feed to the classifier.

In [6]:
import torch
import torch.nn as nn
from transformers import T5EncoderModel, T5ForSequenceClassification, T5PreTrainedModel
from transformers import PretrainedConfig, T5Config


class T5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: T5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class CustomT5ForSequenceClassification(T5PreTrainedModel):
    def __init__(self, model_checkpoint, config):

        super().__init__(config)
        self.transformer = T5EncoderModel.from_pretrained(model_checkpoint)
        # self.classifier = nn.Linear(1024, num_labels).to(device)
        self.classification_head = T5ClassificationHead(config)

    def forward(
        self,
        input_ids_sequence,
        input_ids_structure,
        attention_mask_sequence=None,
        attention_mask_structure=None,
        labels=None,
    ):
        # here we could add an augmentation step in which we choose x% of the time the sequence only, the structure only, or both

        # Get embeddings for the sequence
        sequence_outputs = self.transformer(
            input_ids_sequence, attention_mask=attention_mask_sequence
        )
        sequence_embeddings = sequence_outputs.last_hidden_state.mean(dim=1)

        # Get embeddings for the structure
        # structure_outputs = self.transformer(
        #     input_ids_structure, attention_mask=attention_mask_structure
        # )
        # structure_embeddings = structure_outputs.last_hidden_state.mean(dim=1)

        # Combine the embeddings
        # combined_embeddings = (
        #     sequence_embeddings + structure_embeddings
        # ) / 2.0  # can be changed to concatenation but the embedding d_model in the config should be adjusted accordingly

        combined_embeddings = sequence_embeddings

        # Feed to classifier head
        logits = self.classification_head(combined_embeddings)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits


# Example usage
preconfig = PretrainedConfig.from_pretrained(model_checkpoint)
num_labels = max(train_labels + test_labels) + 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
preconfig.update({"num_labels": num_labels, "classifier_dropout": 0.1})
# preconfig.update({"num_labels": num_labels, "classifier_dropout": 0.3, "dropout_rate": 0.3}) if more dropout desired

model = CustomT5ForSequenceClassification(model_checkpoint, preconfig).to(device)

You are using a model of type t5 to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [7]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true
%env WANDB_LOG_MODEL=end
%env WANDB_PROJECT=protein cath classification




version = 1
batch_size = 32
train_epochs = 100
num_workers = 8
lr = 1e-5 

env: WANDB_WATCH=all
env: WANDB_SILENT=true
env: WANDB_LOG_MODEL=end
env: WANDB_PROJECT=protein cath classification


In [8]:
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    fp16=True,
    fp16_full_eval=True,
    # bf16_full_eval=True,
    # bf16=True,
    save_total_limit=1,
    gradient_checkpointing=True,
    optim="adamw_torch",
    report_to="wandb",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    logging_strategy="epoch",
    run_name=f"{model_checkpoint.split('/')[-1]}-v-{version}",
    dataloader_num_workers=num_workers,
)



In [9]:
import evaluate
import numpy as np


def compute_metrics(eval_preds):
    metric = evaluate.combine(["f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    metrics = metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )
    return metrics

In [10]:
from torch.utils.data.dataloader import default_collate


def custom_collate_fn(batch):
    # Extract elements
    input_ids_sequence = [item["input_ids_sequence"] for item in batch]
    attention_mask_sequence = [item["attention_mask_sequence"] for item in batch]
    input_ids_structure = [item["input_ids_structure"] for item in batch]
    attention_mask_structure = [item["attention_mask_structure"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences to the maximum length in the batch
    input_ids_sequence_padded = torch.nn.utils.rnn.pad_sequence(
        input_ids_sequence, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask_sequence_padded = torch.nn.utils.rnn.pad_sequence(
        attention_mask_sequence, batch_first=True, padding_value=0
    )
    input_ids_structure_padded = torch.nn.utils.rnn.pad_sequence(
        input_ids_structure, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask_structure_padded = torch.nn.utils.rnn.pad_sequence(
        attention_mask_structure, batch_first=True, padding_value=0
    )

    labels = torch.stack(labels)

    return {
        "input_ids_sequence": input_ids_sequence_padded,
        "attention_mask_sequence": attention_mask_sequence_padded,
        "input_ids_structure": input_ids_structure_padded,
        "attention_mask_structure": attention_mask_structure_padded,
        "labels": labels,
    }


trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=custom_collate_fn,
)

In [11]:
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,2.0957,1.547083,0.548896,0.61496,0.549742
2,0.8147,0.960532,0.724462,0.73405,0.721444
3,0.3128,0.934696,0.736411,0.743351,0.736183
4,0.152,1.045662,0.739156,0.745034,0.740604
5,0.0854,1.132233,0.747653,0.762495,0.74871
6,0.0522,1.254073,0.740434,0.759642,0.742815
7,0.034,1.310171,0.749363,0.757534,0.747237
8,0.0272,1.367738,0.751142,0.76105,0.747973
9,0.0172,1.401283,0.74383,0.74856,0.747973
10,0.0116,1.526976,0.753323,0.765194,0.750921


Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/0ca73f6cf92ef5a268320c697f7b940d1030f8471714bffdb6856c641b818974 (last modified on Fri Jul 12 15:11:03 2024) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/0ca73f6cf92ef5a268320c697f7b940d1030f8471714bffdb6856c641b818974 (last modified on Fri Jul 12 15:11:03 2024) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight'].


TrainOutput(global_step=3744, training_loss=0.20354116903856778, metrics={'train_runtime': 7311.208, 'train_samples_per_second': 90.86, 'train_steps_per_second': 2.845, 'total_flos': 0.0, 'train_loss': 0.20354116903856778, 'epoch': 18.0})

In [None]:
import wandb

wandb.finish()