In [1]:
import torch
import torch.nn as nn

import util

from dataclasses import asdict, dataclass

from datasets import load_dataset
from tqdm import trange
from transformers import AutoTokenizer, GPT2Config, GPT2Model, GPT2LMHeadModel, AutoConfig
from transformers import DataCollatorForLanguageModeling

In [45]:
def prompt(text: str) -> dict[str, str]:
    """SMILES prompt."""

    try:
        smiles = util.canonicalize_smiles(text["text"])
        return {"prompt": f"{util.maccs_fingerprint(smiles).ToBitString()}\n{smiles}"}
    except TypeError:
        return {"prompt": ""}


data = load_dataset("csv", data_files={"train": "SMILES.csv"}).map(prompt)

print(f"Dataset Size: {len(data['train']['prompt']):_}")

Dataset Size: 1_000


In [4]:
@dataclass(frozen=True)
class TransformerConfig:
    """A transformer configuration."""

    n_embd: int
    n_head: int
    n_layer: int

    @property
    def dict(self) -> dict[str, int | bool]:
        """Returns a dictionary representation of the config."""

        return asdict(self)


CONFIGS: dict[str, TransformerConfig] = {
    "pico": TransformerConfig(n_embd=32, n_head=1, n_layer=1),
    "tiny": TransformerConfig(n_embd=64, n_head=2, n_layer=3),
    "small": TransformerConfig(n_embd=128, n_head=4, n_layer=6),
    "standard": TransformerConfig(n_embd=256, n_head=8, n_layer=12),
}

n_positions = 512
config = CONFIGS["tmp"]
tokenizer = AutoTokenizer.from_pretrained("gpt2")
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

model = GPT2LMHeadModel(
    GPT2Config(
        **(
            config.dict
            | dict(
                n_positions=n_positions,
                vocab_size=len(tokenizer),
                n_ctx=n_positions,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        )
    )
)


def tokenize(elem: dict[str, str]) -> str:
    outputs = tokenizer(
        elem["prompt"],
        truncation=True,
        max_length=n_positions,
        return_overflowing_tokens=True,
        return_length=True,
    )

    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length <= n_positions:
            input_batch.append(input_ids)

    return {"input_ids": input_batch}


tokenized_data = data.map(tokenize, batched=True, remove_columns=["text", "prompt"])
tokenizer.pad_token = tokenizer.eos_token

params = sum(param.numel() for param in model.parameters())
print(f"{params:_}")

101_616


In [5]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="maccs_models",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # logging_steps=5_000,
    report_to="none",
    num_train_epochs=1,
    weight_decay=0.1,
    learning_rate=1e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
)

In [6]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 780.00 MiB (GPU 0; 3.81 GiB total capacity; 1.53 GiB already allocated; 788.69 MiB free; 2.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF