In [13]:
import os

import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

In [14]:
# Hub checkpoint shared by the encoder and its tokenizer.
MOLFORMER_CHECKPOINT = "ibm/MoLFormer-XL-both-10pct"

# NOTE: trust_remote_code=True lets transformers execute the model/tokenizer
# code shipped in the Hub repository -- only use with a checkpoint you trust.
model = AutoModel.from_pretrained(
    MOLFORMER_CHECKPOINT,
    deterministic_eval=True,
    resume_download=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    MOLFORMER_CHECKPOINT,
    trust_remote_code=True,
    resume_download=True,
)



In [17]:
def _read_tsv(path):
    """Load a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# "hi" split (train / validation).
train_hi = _read_tsv('~/data/splits/hi/train.tsv')
valid_hi = _read_tsv('~/data/splits/hi/valid.tsv')

# Top-level train / test files and the internal+external set.
train = _read_tsv('~/data/train.tsv')
test = _read_tsv('~/data/test.tsv')
train_internal_external = _read_tsv('~/data/internal_external.tsv')

In [18]:
# Sanity check: tokenize the first three training SMILES as one padded batch
# and print the token strings of the first sequence (padding included).
sample_smiles = train['smiles'].iloc[:3].tolist()
sample_batch = tokenizer(sample_smiles, padding=True, return_tensors="pt")
enc = sample_batch['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(enc))

['<bos>', 'C', 'C', 'N', 'c', '1', 'c', 'c', 'n', 'c', '(', 'N', '(', 'C', ')', 'C', 'c', '2', 'n', 'c', '3', 'c', 'c', 'c', 'c', 'c', '3', 'n', '2', 'C', ')', 'n', '1', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [24]:
# Display the integer id the tokenizer uses for its padding token.
tokenizer.pad_token_id

2

In [19]:
BATCH_SIZE = 64  # default number of SMILES strings per forward pass


def infer(model, smiles, tok=None, batch_size=None):
    """Embed SMILES strings as the attention-masked mean of the last hidden state.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns a mapping containing a ``'last_hidden_state'`` tensor.
            It must expose a ``.device`` attribute; inputs are moved there.
        smiles: One SMILES string, or any iterable of them (list, tuple,
            pandas Series, generator, ...).
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        batch_size: Number of strings per forward pass. Defaults to the
            notebook-level ``BATCH_SIZE``.

    Returns:
        A 2-D ``np.ndarray`` with one row per input string. Empty input gives
        a zero-row array (its width is ``model.config.hidden_size`` when the
        model exposes it, otherwise 0).

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if tok is None:
        tok = tokenizer
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # A bare string would otherwise be sliced into batch_size-character
    # fragments; treat it as a single molecule.  Materialising into a list
    # also makes positional slicing safe for Series and generators.
    smiles = [smiles] if isinstance(smiles, str) else list(smiles)

    if not smiles:
        # np.concatenate([]) raises; return an empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, int(hidden_size or 0)), dtype=np.float32)

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(smiles), batch_size):
            batch = smiles[start:start + batch_size]
            inputs = tok(batch, padding=True, return_tensors="pt")
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            attention_mask = inputs['attention_mask']
            hidden = model(**inputs)['last_hidden_state']

            # Mean over the sequence axis counting only unmasked positions;
            # the clamp avoids a 0/0 NaN for a row with no unmasked tokens.
            token_counts = attention_mask.sum(-1, keepdim=True).clamp(min=1)
            pooled = (hidden * attention_mask.unsqueeze(-1)).sum(1) / token_counts
            pooled_batches.append(pooled.cpu().numpy())

    return np.concatenate(pooled_batches)

In [20]:
# Inference only: switch to eval mode, then move the weights to the GPU when
# one is available, falling back to CPU so the notebook still runs without CUDA.
model.eval()
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
def _embed_smiles_column(frame):
    """Run ``infer`` over the ``'smiles'`` column of ``frame``."""
    return infer(model, frame['smiles'].tolist())


train_hi_embeddings = _embed_smiles_column(train_hi)
valid_hi_embeddings = _embed_smiles_column(valid_hi)
train_embeddings = _embed_smiles_column(train)
test_embeddings = _embed_smiles_column(test)
train_internal_external_embeddings = _embed_smiles_column(train_internal_external)

In [22]:
# Sanity check: display the shape of every embedding matrix.
tuple(
    matrix.shape
    for matrix in (
        train_hi_embeddings,
        valid_hi_embeddings,
        train_embeddings,
        test_embeddings,
        train_internal_external_embeddings,
    )
)

((1267, 768), (308, 768), (1578, 768), (400, 768), (10400, 768))

In [23]:
# Persist each embedding matrix as .npy.  np.save does not expand '~', so the
# data root is resolved explicitly; this keeps the outputs under the same
# '~/data' tree the TSVs were read from instead of hard-coding one user's home.
data_root = os.path.expanduser('~/data')
embedding_files = {
    'splits/hi/train_molformer_xl_both_10pct_embeds.npy': train_hi_embeddings,
    'splits/hi/valid_molformer_xl_both_10pct_embeds.npy': valid_hi_embeddings,
    'train_molformer_xl_both_10pct_embeds.npy': train_embeddings,
    'test_molformer_xl_both_10pct_embeds.npy': test_embeddings,
    'internal_external_molformer_xl_both_10pct_embeds.npy': train_internal_external_embeddings,
}
for relative_path, matrix in embedding_files.items():
    np.save(os.path.join(data_root, relative_path), matrix)