In [None]:
import os, sys

# Put the parent of the current working directory on the import path so the
# project-local `Chemprompt` package (imported in the next cell) can be found
# when the notebook is run from a sub-folder of the repository.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from Chemprompt.data.data_loader import DataLoader
from Chemprompt.models.sklearn_model import ScikitLearnModel

In [None]:
# Experiment configuration.
#
# dataset_list: names passed one by one to `loader.load_dataset(...)`.
# NOTE(review): the output saved with this notebook was produced for
# FreeSolv, ESOL, rand_lipo, HPPB and Thermosol, which does not match the
# list below -- re-run the notebook (or restore the list) so code and output agree.
dataset_list = ["FreeSolv", "Caco2_Wang"]

# model_list: pretrained encoders, each split into its hub namespace ("repo")
# and checkpoint name ("name"); the two are joined as "<repo>/<name>" when loaded.
model_list = [
    dict(repo="seyonec", name="ChemBERTa-zinc-base-v1"),
    dict(repo="ibm-research", name="MoLFormer-XL-both-10pct"),
]

In [None]:
# Project dataset loader (Chemprompt.data.data_loader.DataLoader, not the
# PyTorch DataLoader); the driver loop calls `loader.load_dataset(name)` on it.
loader = DataLoader()

In [None]:
def get_embeddings(smiles_list, repo, name, device="cuda:0", batch_size=64):
    """Embed SMILES strings with a pretrained Hugging Face encoder.

    Each string is tokenized, passed through the encoder, and reduced to a
    single vector by averaging the final hidden states of its *real* tokens.
    Padding positions are excluded through the attention mask, so a molecule's
    embedding does not depend on which other molecules share its batch.

    Args:
        smiles_list: Iterable of SMILES strings.
        repo: Hub namespace of the checkpoint (e.g. "seyonec").
        name: Checkpoint name inside ``repo``.
        device: Torch device the model and inputs are moved to.
        batch_size: Number of molecules per forward pass; bounds peak memory.

    Returns:
        numpy.ndarray with one row per input string and one column per hidden
        dimension of the encoder. Empty input yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model_id = f"{repo}/{name}"
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # NOTE(review): the output saved with this notebook shows transformers
    # reporting `embeddings.word_embeddings.weight` as newly initialized for
    # seyonec/ChemBERTa-zinc-base-v1. If that still happens, the token
    # embeddings are random -- confirm the checkpoint loads completely.
    model = AutoModel.from_pretrained(model_id, trust_remote_code=True, use_safetensors=True).to(device)
    model.eval()

    # Materialize once so any iterable (list, array, Series) can be sliced.
    smiles = list(smiles_list)
    if not smiles:
        hidden_size = getattr(model.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    chunks = []
    with torch.no_grad():
        for start in range(0, len(smiles), batch_size):
            batch = smiles[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
            hidden = model(**inputs).last_hidden_state

            attention_mask = inputs.get("attention_mask")
            if attention_mask is None:
                # No mask available: fall back to a plain mean over positions.
                pooled = hidden.mean(dim=1)
            else:
                # Zero out padded positions and divide by the real token count.
                weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = weights.sum(dim=1).clamp(min=1)
                pooled = (hidden * weights).sum(dim=1) / token_counts

            chunks.append(pooled.cpu().numpy())

    return np.concatenate(chunks, axis=0)

In [2]:
# Benchmark every (dataset, encoder) pair: embed the molecules with the
# pretrained encoder, then hand the features to ScikitLearnModel.
for dataset in dataset_list:
    # Loaded once per dataset and reused for every encoder below.
    smiles_list, y = loader.load_dataset(dataset)

    for m in model_list:
        model_name = m["name"]
        print(f"Processing dataset={dataset}, model={model_name} ...")

        # Feature matrix: one pooled embedding row per SMILES string.
        X = get_embeddings(smiles_list, repo=m["repo"], name=model_name, device="cuda:0")

        # Per-model / per-dataset output directory passed to ScikitLearnModel.
        save_dir = "./result/BERT/" + model_name + "/" + dataset + "/"
        sk_model = ScikitLearnModel(save_dir=save_dir, model_type="regression")
        sk_model.fit_and_evaluate_fold(smiles_list, X, y)

(642, 2)
Processing dataset=FreeSolv, model=ChemBERTa-zinc-base-v1 ...


Some weights of RobertaModel were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to ./result/BERT/ChemBERTa-zinc-base-v1/FreeSolv/regression_Predictions.csv
Combined results saved to ./result/BERT/ChemBERTa-zinc-base-v1/FreeSolv/regression_CombinedResults.csv
Processing dataset=FreeSolv, model=MoLFormer-XL-both-10pct ...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predictions saved to ./result/BERT/MoLFormer-XL-both-10pct/FreeSolv/regression_Predictions.csv
Combined results saved to ./result/BERT/MoLFormer-XL-both-10pct/FreeSolv/regression_CombinedResults.csv
(1128, 2)
Processing dataset=ESOL, model=ChemBERTa-zinc-base-v1 ...


Some weights of RobertaModel were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to ./result/BERT/ChemBERTa-zinc-base-v1/ESOL/regression_Predictions.csv
Combined results saved to ./result/BERT/ChemBERTa-zinc-base-v1/ESOL/regression_CombinedResults.csv
Processing dataset=ESOL, model=MoLFormer-XL-both-10pct ...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predictions saved to ./result/BERT/MoLFormer-XL-both-10pct/ESOL/regression_Predictions.csv
Combined results saved to ./result/BERT/MoLFormer-XL-both-10pct/ESOL/regression_CombinedResults.csv
(1400, 2)
Processing dataset=rand_lipo, model=ChemBERTa-zinc-base-v1 ...


Some weights of RobertaModel were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to ./result/BERT/ChemBERTa-zinc-base-v1/rand_lipo/regression_Predictions.csv
Combined results saved to ./result/BERT/ChemBERTa-zinc-base-v1/rand_lipo/regression_CombinedResults.csv
Processing dataset=rand_lipo, model=MoLFormer-XL-both-10pct ...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predictions saved to ./result/BERT/MoLFormer-XL-both-10pct/rand_lipo/regression_Predictions.csv
Combined results saved to ./result/BERT/MoLFormer-XL-both-10pct/rand_lipo/regression_CombinedResults.csv
('HPPB', (1614, 2))
Processing dataset=HPPB, model=ChemBERTa-zinc-base-v1 ...


Some weights of RobertaModel were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to ./result/BERT/ChemBERTa-zinc-base-v1/HPPB/regression_Predictions.csv
Combined results saved to ./result/BERT/ChemBERTa-zinc-base-v1/HPPB/regression_CombinedResults.csv
Processing dataset=HPPB, model=MoLFormer-XL-both-10pct ...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predictions saved to ./result/BERT/MoLFormer-XL-both-10pct/HPPB/regression_Predictions.csv
Combined results saved to ./result/BERT/MoLFormer-XL-both-10pct/HPPB/regression_CombinedResults.csv
('Thermosol', (1763, 2))
Processing dataset=Thermosol, model=ChemBERTa-zinc-base-v1 ...


Some weights of RobertaModel were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to ./result/BERT/ChemBERTa-zinc-base-v1/Thermosol/regression_Predictions.csv
Combined results saved to ./result/BERT/ChemBERTa-zinc-base-v1/Thermosol/regression_CombinedResults.csv
Processing dataset=Thermosol, model=MoLFormer-XL-both-10pct ...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predictions saved to ./result/BERT/MoLFormer-XL-both-10pct/Thermosol/regression_Predictions.csv
Combined results saved to ./result/BERT/MoLFormer-XL-both-10pct/Thermosol/regression_CombinedResults.csv
