In [1]:
# Import Hugging Face Transformers and Datasets
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model and tokenizer

# Check if CUDA is available and set the device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("GT4SD/multitask-text-and-chemistry-t5-small-augm")
model = AutoModelForSeq2SeqLM.from_pretrained("GT4SD/multitask-text-and-chemistry-t5-small-augm", device_map=device)

# Load and preprocess the dataset
dataset = load_dataset("language-plus-molecules/LPM-24_train")

# Move the model to the specified device
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [3]:
# Print the first example to inspect the columns
#print(dataset.column_names)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['molecule', 'caption'],
        num_rows: 160560
    })
    split_train: Dataset({
        features: ['molecule', 'caption'],
        num_rows: 126864
    })
    split_valid: Dataset({
        features: ['molecule', 'caption'],
        num_rows: 33696
    })
})


In [None]:
def tokenize_function(examples):
    return tokenizer(examples["molecule"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Create a smaller subset of the full dataset to fine-tune on:
small_train_dataset = tokenized_datasets["split_train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["split_valid"].shuffle(seed=42).select(range(1000))

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    split_batches=False,
    #per_device_train_batch_size=128,
    #per_device_eval_batch_size=128,
    #num_train_epochs=20,
    num_train_epochs=1,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=200,
    save_steps=10_000,
    eval_steps=500,
)

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    #compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Evaluate the model
""" results = trainer.evaluate()
print(results) """

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-small-model")
tokenizer.save_pretrained("./fine-tuned-small-model")