In [None]:
#@title Installs
# !pip install transformers[torch] datasets
!pip install transformers==4.39.0
!pip install galore-torch
!pip install biopython
!pip install datasets
!pip install trl

In [None]:
!head /content/pretraining_peptides_4_2.fasta

>peptide_1
MRKQRSFFASSARRREDQPHTLPNDKREILISFLVELIR
>peptide_4
MAAMIGIQLRCHPVMSVAAAAYLRVRLSPTALVGLR
>peptide_6
LFKALGLHKLHLPNTSRDSE
>peptide_8
MNTIITEENTTIRKKKKKLKKSRSSIMRNGFKSFRDK
>peptide_9
GLEHRGKLDGNQDLIR


In [None]:
#@title Create data sets
from Bio import SeqIO
import torch
import datasets
import trl
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Configuration -----------------------------------------------------------
path_to_pretraining_data = '/content/pretraining_peptides_4_2.fasta'
SHUFFLE_SEED = 1       # fixes the shuffle order and therefore the train/test split
TRAIN_FRACTION = 0.9   # share of sequences used for training; the rest is held out
MAX_SEQUENCES = None   # e.g. 10_000 for a quick smoke run; None keeps every sequence

# Load pretraining sequences from fasta: one plain string per record, headers dropped.
pretraining_sequences = [
    str(record.seq) for record in SeqIO.parse(path_to_pretraining_data, "fasta")
]
# Fail early with a clear message instead of training on an empty dataset.
assert pretraining_sequences, f"No sequences parsed from {path_to_pretraining_data}"

# Randomize (seeded, so the split below is reproducible across runs)
random.seed(SHUFFLE_SEED)
random.shuffle(pretraining_sequences)

# Optional subsample, taken after shuffling so it is a random subset of the file.
if MAX_SEQUENCES is not None:
    pretraining_sequences = pretraining_sequences[:MAX_SEQUENCES]

# Split to train/test
split = round(len(pretraining_sequences) * TRAIN_FRACTION)
train_sequences = pretraining_sequences[:split]
test_sequences = pretraining_sequences[split:]

# Sanity checks: nothing lost in the split, and both sides are usable.
assert len(train_sequences) + len(test_sequences) == len(pretraining_sequences)
assert train_sequences and test_sequences, (
    f"Degenerate split: {len(train_sequences)} train / {len(test_sequences)} test sequences"
)

In [None]:
# Pretrained ESM-2 checkpoint shared by the tokenizer and the masked-LM model.
esm_checkpoint = "facebook/esm2_t33_650M_UR50D"

tokenizer = AutoTokenizer.from_pretrained(esm_checkpoint)
# Loaded in the default dtype; the torch_dtype=torch.bfloat16 variant is left disabled.
model = AutoModelForMaskedLM.from_pretrained(esm_checkpoint)
model = model.to(device)

# Collator for masked-language-model batches, with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Wrap both splits as datasets with a single 'seqs' column.
# No .shuffle() here: the sequences were already shuffled before the split.
train, test = (
    Dataset.from_dict({'seqs': split_sequences})
    for split_sequences in (train_sequences, test_sequences)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Training configuration, grouped by purpose.
args = TrainingArguments(
    # -- Output / reporting --
    output_dir='./drive/MyDrive/peptide_esm',
    report_to='none',
    push_to_hub=False,
    # -- Schedule: evaluate and checkpoint once per epoch, reload the best checkpoint at the end --
    num_train_epochs=2,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # -- Batching --
    per_device_train_batch_size=256,
    per_device_eval_batch_size=128,
    # -- Optimisation: GaLore AdamW restricted to the listed target modules --
    optim="galore_adamw",
    optim_target_modules=['encoder', 'contact_head'],
    learning_rate=3e-6,
    warmup_steps=2000,
    weight_decay=0.01,
    fp16=True,
    # Left disabled: optim_args="rank=1" (optionally ", update_proj_gap=100"),
    # gradient_checkpointing=True, gradient_accumulation_steps=16
)

# SFTTrainer reads the raw sequences from the 'seqs' column and is handed the
# MLM data collator for batching.
# NOTE(review): max_seq_length=40 presumably counts special tokens too — confirm
# that the longest peptides are not truncated.
trainer = trl.SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field='seqs',
    max_seq_length=40,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,2.4699,2.467606
2,2.4644,2.463519
3,2.4629,2.46547


KeyboardInterrupt: 