In [1]:
from datasets import load_dataset

dataset = load_dataset("Thecoder3281f/MIT_separated", "normal")

In [None]:
dataset

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast




In [None]:
# Build vocab from your space-tokenized SMILES
def build_vocab_from_dataset(dataset, fields=["input", "target"]):
    vocab = set()
    splits = ["train", "val", "test"]
    for split in splits:
        for ex in dataset[split]:
            for f in fields:
                vocab.update(ex[f].split())

    return vocab

vocab = build_vocab_from_dataset(dataset)
vocab.update(["[PAD]", "[UNK]", "<s>", "</s>"])

#print(vocab)
vocab = {tok: i for i, tok in enumerate(sorted(vocab), start=0)}

#print(vocab)

# Create WordLevel tokenizer
tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()

# Wrap as a Hugging Face tokenizer
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    unk_token="[UNK]",
    pad_token="[PAD]",
    eos_token="</s>",
    bos_token="<s>",
)

hf_tokenizer.save_pretrained("smiles-whitespace-tokenizer-separated-mit")

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("smiles-whitespace-tokenizer-separated-mit")

def preprocess(batch):
    model_inputs = tokenizer(
        batch["input"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target"],
            padding="max_length",
            truncation=True,
            max_length=256,
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=["input", "target"])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 409035
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 40000
    })
    val: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
})

In [None]:
tokenized_datasets["train"][0]["attention_mask"]  # Example of tokenized input

In [4]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


CUDA available: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 2060


In [21]:
import logging
import numpy as np

# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def canonicalize(smiles):
    """Join tokens, parse to molecule, return canonical SMILES or None."""
    try:
        s = smiles.replace(" ", "")
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return None

def tanimoto(a, b):
    """Compute Tanimoto similarity between two SMILES."""
    try:
        ma, mb = Chem.MolFromSmiles(a), Chem.MolFromSmiles(b)
        if not ma or not mb:
            return 0
        fa = AllChem.GetMorganFingerprintAsBitVect(ma, 2)
        fb = AllChem.GetMorganFingerprintAsBitVect(mb, 2)
        return DataStructs.TanimotoSimilarity(fa, fb)
    except Exception:
        return 0

def compute_metrics(eval_pred, tokenizer):
    preds, labels = eval_pred

    # handle tuple
    if isinstance(preds, tuple):
        preds = preds[0]

    # convert logits to token IDs if needed
    preds = np.array(preds)
    if preds.ndim == 3:  # (batch, seq_len, vocab_size)
        preds = np.argmax(preds, axis=-1)

    labels = np.array(labels)


    # ðŸ§  handle both top-1 and top-k outputs
    if preds.ndim == 3:  # (batch, k, seq_len)
        batch_size, k, seq_len = preds.shape
        preds = preds.reshape(batch_size * k, seq_len)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_preds = [decoded_preds[i * k:(i + 1) * k] for i in range(batch_size)]
    else:  # (batch, seq_len)
        decoded_preds = [[tokenizer.decode(p, skip_special_tokens=True)] for p in preds]

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    top1_correct = 0
    tanimotos = []
    valids = []

    for k_preds, label in zip(decoded_preds, decoded_labels):
        label_c = canonicalize(label)
        

        if label_c is None:
            tanimotos.append(0)
            valids.append(0)
            continue
        valids.append(bool(Chem.MolFromSmiles(label_c)))
        best_tani = 0

        for i, pred in enumerate(k_preds):
            p_c = canonicalize(pred)
            if p_c is None:
                continue

            tani = tanimoto(p_c, label_c)
            best_tani = max(best_tani, tani)

            if p_c == label_c:
                if i == 0:
                    top1_correct += 1
                break

        tanimotos.append(best_tani)

    canonical_top1 = top1_correct / len(decoded_labels)
    mean_tanimoto = sum(tanimotos) / len(tanimotos)
    validity = sum(valids) / len(valids)

    logger.info(f"Canonical Top-1 Accuracy: {canonical_top1:.3f}")
    logger.info(f"Mean Tanimoto: {mean_tanimoto:.3f}")
    logger.info(f"Validity: {validity:.3f}")

    return {
        "canonical_top1": canonical_top1,
        "mean_tanimoto": mean_tanimoto,
        "validity": validity,
    }


In [6]:
train_dataset_small = tokenized_datasets["train"].shuffle(seed=42).select(range(40000))
val_dataset_small = tokenized_datasets["val"].shuffle(seed=42).select(range(3000))
test_dataset_small = tokenized_datasets["test"].shuffle(seed=42).select(range(3000))

train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["val"]
test_dataset = tokenized_datasets["test"]

In [7]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 409035
})

In [8]:
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration

In [22]:


def model_init():
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model.resize_token_embeddings(len(tokenizer))
    return model

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1), labels

args = TrainingArguments(
    output_dir="t5-small-mit-separated",
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=1e-4,
    # per_device_train_batch_size=64,
    auto_find_batch_size=True,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    max_steps=10,
    save_steps=10,
    eval_steps=10,
    # num_train_epochs=1,
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    weight_decay=0.05,
    logging_dir="./logs",
    greater_is_better=True,
    metric_for_best_model="canonical_top1",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=128,
    fp16=True,
)

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

trainer.train(resume_from_checkpoint=False)


# best_trial = trainer.hyperparameter_search(
#     direction="minimize",
#     backend="optuna",
#     n_trials=5,
#     hp_space=lambda trial: {
#         "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-4),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
#         "weight_decay": trial.suggest_float("weight_decay", 0, 0.3),
#     },
# )

# best_trial

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 14, 'bos_token_id': 15, 'pad_token_id': 179}.


Step,Training Loss,Validation Loss,Canonical Top1,Mean Tanimoto,Validity
10,No log,0.462448,0.0,0.0,0.91


[23:10:37] SMILES Parse Error: syntax error while parsing: -))1ccc=77=Oc7I)>7ccc>
[23:10:37] SMILES Parse Error: check for mistakes around position 1:
[23:10:37] -))1ccc=77=Oc7I)>7ccc>
[23:10:37] ^
[23:10:37] SMILES Parse Error: Failed parsing SMILES '-))1ccc=77=Oc7I)>7ccc>' for input: '-))1ccc=77=Oc7I)>7ccc>'
[23:10:37] SMILES Parse Error: syntax error while parsing: -)CccccC)c1cc7ccccccc1cnccc-)c1cc1cccccc>C
[23:10:37] SMILES Parse Error: check for mistakes around position 1:
[23:10:37] -)CccccC)c1cc7ccccccc1cnccc-)c1cc1cccccc>
[23:10:37] ^
[23:10:37] SMILES Parse Error: Failed parsing SMILES '-)CccccC)c1cc7ccccccc1cnccc-)c1cc1cccccc>C' for input: '-)CccccC)c1cc7ccccccc1cnccc-)c1cc1cccccc>C'
[23:10:37] SMILES Parse Error: syntax error while parsing: -C3>cccC27cC2CCCO)C)11C2332CCCC3CC33)Cc3>7
[23:10:37] SMILES Parse Error: check for mistakes around position 1:
[23:10:37] -C3>cccC27cC2CCCO)C)11C2332CCCC3CC33)Cc3>
[23:10:37] ^
[23:10:37] SMILES Parse Error: Failed parsing SMILES '-C3>cc

TrainOutput(global_step=10, training_loss=4.328192901611328, metrics={'train_runtime': 54.072, 'train_samples_per_second': 1.48, 'train_steps_per_second': 0.185, 'total_flos': 24358315622400.0, 'train_loss': 4.328192901611328, 'epoch': 0.00019557989438685703})

In [None]:
metrics = trainer.evaluate(test_dataset_small)
print(metrics)


In [None]:
trainer.save_model("t5-mit-small-dataset-separated-lr1e-5-wd0.05-5000steps")

In [18]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model_4bit = T5ForConditionalGeneration.from_pretrained(
    "t5-base-mit-separated/checkpoint-750",
    quantization_config=bnb_config,
    device_map="auto",
).to("cuda")

text = "C O c 1 c c c c 2 c 1 C ( C ) C ( = O ) N 2 C . N # C C I > C C O . C C [O-] . [Na+]"

inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False).to(model_4bit.device)
outputs_4bit = model_4bit.generate(**inputs, max_length=256, repetition_penalty=1.0, do_sample=False, num_return_sequences=2, num_beams=10, output_scores=True, return_dict_in_generate=True)

# print(outputs_4bit)


import torch

seq_scores = outputs_4bit.sequences_scores  # logits for each sequence
probs = torch.softmax(seq_scores, dim=0) * 100  # %
print("Scores: ", probs)

preds_4bit = tokenizer.batch_decode(outputs_4bit.sequences, skip_special_tokens=True)
for i, p in enumerate(preds_4bit):
    print(f"4-bit Model Prediction {i+1}: {p}")

Scores:  tensor([55.5501, 44.4499], device='cuda:0')
4-bit Model Prediction 1: # C O c 1 c c c c 2 c 1 C ( ( (
4-bit Model Prediction 2: # N # C O c 1 c c c c 2 c 1 C (


In [None]:


model = T5ForConditionalGeneration.from_pretrained("t5-base-mit-separated/checkpoint-100").to("cuda")

model2 = T5ForConditionalGeneration.from_pretrained("t5-small-mit-separated/checkpoint-26000").to("cuda")

model_base = T5ForConditionalGeneration.from_pretrained("t5-base").to("cuda")
model_small = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")


inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False).to(model.device)


outputs = model.generate(**inputs, max_length=256, repetition_penalty=1, do_sample=False, num_return_sequences=2, num_beams=10)
preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for i, p in enumerate(preds):
    print(f"Model 1 Prediction {i+1}: {p}")


outputs2 = model2.generate(**inputs, max_length=256, repetition_penalty=1.0, do_sample=False, num_return_sequences=2, num_beams=10)
preds2 = tokenizer.batch_decode(outputs2, skip_special_tokens=True)
for i, p in enumerate(preds2):
    print(f"Model 2 Prediction {i+1}: {p}")


# outputs_base = model_base.generate(**inputs, max_length=256, repetition_penalty=1.0, do_sample=False, num_return_sequences=2, num_beams=10)
# preds_base = tokenizer.batch_decode(outputs_base, skip_special_tokens=True)
# for i, p in enumerate(preds_base):
#     print(f"Base Model Prediction {i+1}: {p}")
# outputs_small = model_small.generate(**inputs, max_length=256, repetition_penalty=1.0, do_sample=False, num_return_sequences=2, num_beams=10)
# preds_small = tokenizer.batch_decode(outputs_small, skip_special_tokens=True)
# for i, p in enumerate(preds_small):
#     print(f"Small Model Prediction {i+1}: {p}")



In [None]:
model.save_pretrained("t5-mit-small-dataset-separated-lr1e-5-wd0.05-5000steps")