In [1]:
from transformers import AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from molxpt_tokenizer import MolxptTokenizer
import re
import time
from tqdm import tqdm
import sacremoses

In [2]:
# Load the tokenizer from the local "molxpt_ckpt" checkpoint directory with use_fast=False.
# The recorded output below warns that the checkpoint declares 'BioGptTokenizer' while the
# call is made from 'MolxptTokenizer', so tokenization may differ from what the checkpoint expects.
molxpt_tokenizer = MolxptTokenizer.from_pretrained("molxpt_ckpt", use_fast=False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BioGptTokenizer'. 
The class this function is called from is 'MolxptTokenizer'.


In [4]:

# Tokenize a sentence whose subject is a SMILES string wrapped in molecule markers,
# then show the resulting token pieces.
input_str = (
    '<start-of-mol>CC(=O)OC1=CC=CC=C1C(=O)O<end-of-mol>'
    ' is in a group of medications called salicylates.'
)
tok = molxpt_tokenizer.tokenize(input_str)
print(tok)

['&lt;</w>', 's', 'om</w>', '&gt;</w>', 'CC</w>', '(</w>', '=</w>', 'O</w>', ')</w>', 'O', 'C1</w>', '=</w>', 'CC</w>', '=</w>', 'CC</w>', '=</w>', 'C1', 'C</w>', '(</w>', '=</w>', 'O</w>', ')</w>', 'O</w>', '&lt;</w>', 'e', 'om</w>', '&gt;</w>', 'is</w>', 'in</w>', 'a</w>', 'group</w>', 'of</w>', 'medications</w>', 'called</w>', 'salic', 'ylates</w>', '.</w>']


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the causal language model from the same local "molxpt_ckpt" directory used for the tokenizer.
model = AutoModelForCausalLM.from_pretrained("molxpt_ckpt")

In [5]:
# GPU placement is commented out, so the model stays on the device it was loaded on.
#model = model.cuda()
# Put the model in evaluation mode. eval() is the last expression of the cell, so its
# return value (the module itself) is what the cell displays.
model.eval()

BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): Embedding(44536, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(2050, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, e

In [14]:
# Beam-search continuation of a short prompt; each returned hypothesis is printed with its score.
input_ids = molxpt_tokenizer('Aspirin is', return_tensors="pt").input_ids
print(input_ids)
output = model.generate(
    input_ids,
    num_beams=4,
    max_new_tokens=300,
    num_return_sequences=4,
    return_dict_in_generate=True,  # needed to get .sequences / .sequences_scores
    output_scores=True,
    do_sample=False,
)

# Walk over whatever generate() actually returned instead of a hard-coded range(4),
# so changing num_return_sequences above cannot cause an IndexError or hide results.
for sequence, score in zip(output.sequences, output.sequences_scores):
    s = molxpt_tokenizer.decode(sequence)
    print(s, score.item())

tensor([[    2, 35357,    30]])
</s>Aspirin is associated with a reduced risk of colorectal cancer in patients with inflammatory bowel disease: a population-based cohort study. AIM: To investigate the association between aspirin use and the risk of colorectal cancer (CRC) in patients with inflammatory bowel disease (IBD). METHODS: We conducted a population-based cohort study using data from the Taiwan National Health Insurance Research Database. We identified a cohort of patients newly diagnosed with IBD between 2000 and 2010. The aspirin users were matched with non-users by age, sex, and index year. The Cox proportional hazards model was used to estimate the hazard ratios (HRs) and 95% confidence intervals (CIs) for the association between aspirin use and the risk of CRC in patients with IBD. RESULTS: We identified a total of 2608 patients with IBD, of whom 1016 were aspirin users and 1216 were non-users. During the follow-up period, we identified a total of 414 patients with CRC. The

In [13]:
from transformers import set_seed

# Sampling is stochastic: fix the RNG state so re-running this cell yields the same continuation.
set_seed(42)
input_ids = molxpt_tokenizer('Aspirin is', return_tensors="pt").input_ids
output = model.generate(
    input_ids,
    max_new_tokens=100,
    temperature=0.75,
    do_sample=True,
)
# Without return_dict_in_generate the result is the tensor of generated token ids.
print(output)



tensor([[    2, 35357,    30,    58,   103,    23,   105,   147,     9,  2174,
           132,    15,    42,    23,   745,  2647,   113,     8,   326,    34,
         35357,    30,    22,  1520,    88,  1900,    26,    10,  1203,     9,
           960,   113,     8,   184,    11,   265,    30,  3842,    32,  7665,
            86,   176,    10,   147,     9,  2174,   132,    18,  4278,    17,
             8,  2730,    34,   197,   382,   424,  7665,    30,   103,    23,
            47,   105,   147,     9,  4278,    15,    42,    23,   745,  2647,
           113,    18,  6860,    17,     8,   175,    34,    74,   634,    22,
          1529,  1018,    56,     9,    42,    23,  6860,   194,    29,  4053,
          7665,    67,  2288,    71,    11,  2989,    13,  3077,  1218,    11,
          3509,     8,    28]])


In [14]:
# Decode the first entry of `output` back to text and print it.
# NOTE(review): `output` is rebound by several generate cells; this cell presumably expects
# the plain token-id tensor from the sampling cell — confirm the execution order.
s = molxpt_tokenizer.decode(output[0])
print(s)

</s>Aspirin is not associated with increased risk of colorectal cancer in patients with inflammatory bowel disease. BACKGROUND: Aspirin is a widely used medication for the prevention of cardiovascular disease. However, there is concern that aspirin may increase the risk of colorectal cancer (CRC). AIM: To determine whether aspirin is associated with an increased risk of CRC in patients with inflammatory bowel disease (IBD). METHODS: We conducted a retrospective cohort study of patients with IBD who were prescribed aspirin between January 1, 2000 and December 31, 2010. The


In [None]:
from transformers import set_seed

# Sampling is stochastic: fix the RNG state so the printed samples are reproducible.
set_seed(42)
# Put the prompt on the model's own device. The model is never moved to CUDA in this
# notebook (that line is commented out), so an unconditional .cuda() here would cause a
# device mismatch, or fail outright on a machine without a GPU.
input_ids = molxpt_tokenizer('Aspirin is', return_tensors="pt").input_ids.to(model.device)
output = model.generate(
    input_ids,
    max_new_tokens=300,
    num_return_sequences=4,
    temperature=0.75,
    top_p=0.95,
    do_sample=True,
)

# Iterate over the returned rows rather than a hard-coded range(4), so the loop stays in
# step with num_return_sequences.
for sequence in output:
    s = molxpt_tokenizer.decode(sequence)
    print(s)


Embedding module

In [6]:
# Grab the raw token and position embedding tables from the loaded model, and read the
# vocabulary size and embedding width off the token table's (rows, columns) shape.
token_embeddings_weight = model.biogpt.embed_tokens.weight.data
position_embeddings_weight = model.biogpt.embed_positions.weight.data
vocab_size, d_model = token_embeddings_weight.shape

In [7]:
import torch  # forward() calls torch.arange; `import torch.nn as nn` alone does not bind `torch`
import torch.nn as nn


class EmbeddingModule(nn.Module):
    """Token + position embedding lookup initialised from pretrained weight tensors.

    Args:
        vocab_size: Number of token rows. Kept for interface compatibility; the table
            size is taken from the weight tensor itself.
        d_model: Embedding width. Kept for interface compatibility; also taken from
            the weight tensor.
        token_weight: Optional 2-D tensor for the token table. When omitted, the
            notebook-level ``token_embeddings_weight`` is used (the original behaviour).
        position_weight: Optional 2-D tensor for the position table. When omitted, the
            notebook-level ``position_embeddings_weight`` is used (the original behaviour).

    NOTE(review): the tensors are handed to ``nn.Embedding.from_pretrained`` as-is, so the
    new tables may share storage with the source model's weights; with ``freeze=False``
    training one could alter the other — clone the tensors if independence is required.
    """

    def __init__(self, vocab_size, d_model, token_weight=None, position_weight=None):
        super().__init__()
        # Fall back to the notebook globals so existing two-argument calls keep working.
        if token_weight is None:
            token_weight = token_embeddings_weight
        if position_weight is None:
            position_weight = position_embeddings_weight
        # Initialise the token embedding table from the pretrained weights (trainable).
        self.token_embeddings = nn.Embedding.from_pretrained(token_weight, freeze=False)
        # Initialise the position embedding table from the pretrained weights (trainable).
        self.position_embeddings = nn.Embedding.from_pretrained(position_weight, freeze=False)

    def forward(self, input_ids, position_ids=None):
        """Return the sum of token and position embeddings.

        Args:
            input_ids: Integer tensor of token ids; dimension 1 is used as the sequence length.
            position_ids: Optional integer tensor of position indices. When omitted,
                positions ``0 .. seq_len - 1`` are generated on ``input_ids``' device
                and broadcast over the batch.

        Returns:
            Tensor of token embeddings plus position embeddings.

        NOTE(review): the recorded model summary shows a positional table with 2050 rows;
        BioGPT's own positional embedding reportedly shifts positions by an offset before
        the lookup — confirm whether the default positions here should be shifted too.
        """
        tokens_embeddings = self.token_embeddings(input_ids)

        # Generate position ids when the caller did not provide them.
        if position_ids is None:
            position_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)

        return tokens_embeddings + position_embeddings

In [11]:
import torch.nn
# Collect the sub-modules of the loaded model that the custom embedding module is built from.
token_embeddings = model.biogpt.embed_tokens
positional_embeddings = model.biogpt.embed_positions
layer_norm = model.biogpt.layer_norm
# Index 3 selects the 4th decoder layer, the one whose output is to be extracted.
intermediate_layer = model.biogpt.layers[3]

# Custom embedding module
class EmbeddingModule_full(torch.nn.Module):
    """Embed tokens, run one intermediate layer, normalise, and mean-pool.

    Args:
        token_embeddings: Callable/module mapping ``input_ids`` to token embeddings.
        positional_embeddings: Callable/module applied to ``position_ids``.
        intermediate_layer: Callable/module applied to the summed embeddings. It may
            return either a tensor or a tuple/list whose first item is the hidden states.
        layer_norm: Callable/module applied to the intermediate layer's hidden states.

    NOTE(review): only the single given layer is applied directly to the raw embeddings;
    any earlier layers of the source model are not run, so the result is presumably not
    the same as that layer's activation inside the full model — confirm this is intended.
    """

    def __init__(self, token_embeddings, positional_embeddings, intermediate_layer, layer_norm):
        super(EmbeddingModule_full, self).__init__()
        self.token_embeddings = token_embeddings
        self.positional_embeddings = positional_embeddings
        self.intermediate_layer = intermediate_layer
        self.layer_norm = layer_norm

    def forward(self, input_ids, position_ids):
        """Return one pooled vector per input row.

        Args:
            input_ids: Token ids passed to ``token_embeddings``.
            position_ids: Passed unchanged to ``positional_embeddings``.
                NOTE(review): BioGPT's positional embedding module may expect an
                attention mask rather than raw position ids — verify against the
                installed transformers version.

        Returns:
            The layer-normalised hidden states averaged over dimension 1.
        """
        # Token embeddings.
        token_embeddings = self.token_embeddings(input_ids)
        # Positional encodings.
        position_embeddings = self.positional_embeddings(position_ids)

        # Combine token embeddings and positional encodings.
        embeddings = token_embeddings + position_embeddings

        # Run the selected intermediate Transformer layer.
        layer_output = self.intermediate_layer(embeddings)
        # Hugging Face decoder layers return a tuple (hidden_states, ...); passing that
        # tuple straight into LayerNorm raises a TypeError, so keep only the hidden states.
        if isinstance(layer_output, (tuple, list)):
            layer_output = layer_output[0]

        # Apply layer normalisation.
        embeddings = self.layer_norm(layer_output)

        # Mean-pool over dimension 1 to obtain a fixed-size vector per input row.
        embeddings = torch.mean(embeddings, dim=1)

        return embeddings


In [12]:
# Instantiate the embedding module from the sub-modules pulled out of the loaded model.
embedding_module = EmbeddingModule_full(
    token_embeddings=token_embeddings,
    positional_embeddings=positional_embeddings,
    intermediate_layer=intermediate_layer,
    layer_norm=layer_norm,
)

In [13]:
# Save the embedding module to 'embedding_module_full.pth'.
# NOTE(review): this serialises the whole module object rather than a state_dict, so
# loading the file presumably requires the EmbeddingModule_full class definition to be
# available — confirm before using the file outside this notebook.
import torch
torch.save(embedding_module, 'embedding_module_full.pth')