In [93]:
#!/usr/bin/env python
import torch

import util

from datasets import Dataset, load_dataset

from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import set_seed


set_seed(1_337)
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True


DATASET_SIZE: int = 4_000_000
N_POSITIONS: int = 512
N_LAYER = 6
N_HEAD = 8
N_EMBD = 128


def promptify(row: dict[str, str]) -> str:
    try:
        smiles = util.canonicalize_smiles(row["smiles"])
        bitstr = " ".join(list(util.maccs_fingerprint(smiles).ToBitString()))
        prompt = f"{bitstr}\n{smiles}"
        return {"prompt": prompt}
    except:
        return {"prompt": ""}


pubchem_smiles = (
    load_dataset(
        "csv",
        delimiter="\t",
        column_names=["id", "smiles"],
        data_files=["CID-SMILES"],
        streaming=True,
    )
    .map(promptify)
    .filter(lambda row: len(row["prompt"]) > 0)
    .remove_columns(["id", "smiles"])
)

In [100]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def tokenize(elem: dict[str, str]) -> dict[str, list[int]]:
    out = tokenizer(
        elem["prompt"],
        truncation=True,
        max_length=N_POSITIONS,
        return_overflowing_tokens=True,
        return_length=True,
    )

    return {"input_ids": out["input_ids"]}


tok_dataset = pubchem_smiles.map(tokenize, batched=True).remove_columns(["prompt"])

In [None]:
dataset = Dataset.from_dict({"prompt": read("CID-SMILES", DATASET_SIZE)})

tokenizer = AutoTokenizer.from_pretrained("gpt2")


def tokenize(elem: dict[str, str]) -> dict[str, list[int]]:
    out = tokenizer(
        elem["prompt"],
        truncation=True,
        max_length=N_POSITIONS,
        return_overflowing_tokens=True,
        return_length=True,
    )

    return {"input_ids": out["input_ids"]}


# tok_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)


# from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# config = AutoConfig.from_pretrained(
#     "gpt2",
#     n_positions=N_POSITIONS,
#     n_embd=N_EMBD,
#     n_head=N_HEAD,
#     n_layer=N_LAYER,
#     vocab_size=len(tokenizer),
#     pad_token_id=tokenizer.pad_token_id,
#     bos_token_id=tokenizer.bos_token_id,
#     eos_token_id=tokenizer.eos_token_id,
# )

# model = GPT2LMHeadModel(config)
# model_size = sum(p.numel() for p in model.parameters() if p.requires_grad_)
# print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# tokenizer.pad_token = tokenizer.eos_token
# data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# args = TrainingArguments(
#     output_dir="maccs_models",
#     per_device_train_batch_size=32,
#     logging_steps=100,
#     gradient_accumulation_steps=8,
#     num_train_epochs=4,
#     weight_decay=0.1,
#     warmup_steps=1_000,
#     lr_scheduler_type="cosine",
#     learning_rate=1e-4,
#     save_steps=1_000,
#     fp16=True,
# )

# trainer = Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     args=args,
#     data_collator=data_collator,
#     train_dataset=tok_dataset["input_ids"],
# )

# trainer.train()
# trainer.save_model("maccs_models/final_model")

In [1]:
import torch

import util

from datasets import Dataset

from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
DATASET_SIZE: int = 10_000
N_POSITIONS: int = 256
N_LAYER = 6  # Number of transformer layers
N_HEAD = 8  # Number of multi-head attention heads
N_EMBD = 256  # Embedding size

In [None]:
def read(path: str, size: int) -> list[str]:
    """Reads SMILES strings from PubChem."""

    data = []
    with open(path) as file, tqdm(total=size, desc=f"Reading {path}...") as pbar:
        while (line := file.readline()) and len(data) < size:
            smiles = line.split()[1]
            smiles = util.canonicalize_smiles(smiles)
            bitstr = " ".join(list(util.maccs_fingerprint(smiles).ToBitString()))
            prompt = f"{bitstr}\n{smiles}"
            data.append(prompt)
            pbar.update(1)
    assert len(data) == size

    return data


dataset = Dataset.from_dict({"prompt": read("CID-SMILES", DATASET_SIZE)})

In [None]:
# READ THIS: https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt
# AND THIS: https://huggingface.co/blog/juancopi81/using-hugging-face-to-train-a-gpt-2-model-for-musi

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def tokenize(elem: dict[str, str]) -> dict[str, list[int]]:
    out = tokenizer(
        elem["prompt"],
        truncation=True,
        max_length=N_POSITIONS,
        return_overflowing_tokens=True,
        return_length=True,
    )

    return {"input_ids": out["input_ids"]}


tok_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
tok_dataset

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    n_positions=N_POSITIONS,
    n_embd=N_EMBD,
    n_head=N_HEAD,
    n_layer=N_LAYER,
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = GPT2LMHeadModel(config)
model_size = sum(p.numel() for p in model.parameters() if p.requires_grad_)
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
args = TrainingArguments(
    output_dir="maccs_models",
    per_device_train_batch_size=32,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tok_dataset["input_ids"],
)

In [None]:
trainer.train()
trainer.save_model("maccs_models/final.pt")

In [1]:
import torch

import util

from datasets import Dataset

from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("maccs_models/final.pt").to("cuda")

In [11]:
bits = " ".join(str(x) for x in torch.randint(low=0, high=2, size=(167,)).tolist())
prompt = f"{bits}\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
out = model.generate(**inputs, top_k=0, max_length=512)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 1 0
O)C(O)CC(O)C(O)CC)CC)CC)C)C)C)CC)C)C)C)C)C(O)C)C)C)CC)C)C(O)C)CC)C)C)CC)CC)C)C(O)C)C(O)C)C)C)C(O)C)C)C(O)C)C)C)C)C)C)C(O)C)C)C)C)C)CC)C)C(O)C(O)C)C(O)C)C)C)C(O)C)CC(O)C)C(O)C(O)C)C)C)C)C(O)C(O)C)C)C)C)C(O)C)C)C(O)C)C)C)C(O)C)C)C)C(O)C)C)C)C)C))C)C(O)C)C)CC)C(O)C(O)C)C)C)C(O)C)C)C)CC)C)C)C)C)C)C)C)C(O)CC)C(O)C(O)C)C)C)C)C)C(O)C)C)C)C)C)C)C)C)C
