In [1]:
import torch
from models.miniTransformer import generate_square_subsequent_mask
from models.miniTransformerV2 import TransformerChat
import myTokenizer



In [2]:
from myTokenizer import myTokenizer
ThisTokenizer = myTokenizer(num_words=15000)
# ThisTokenizer.train_from_parquet(parquet_path='healthCare.parquet', inputCol='input', outputCol='output')
# ThisTokenizer.save_tokenizer('/tokenizer/tokenizerForHealthCare.pkl')

TOKENIZER = ThisTokenizer.load_tokenizer('/tokenizer/tokenizerForHealthCare.pkl')
print("tokenizer done, with length", len(TOKENIZER.word_index) + 1)
print("vocab size:", TOKENIZER.num_words)

# word → index
word2idx = TOKENIZER.word_index

# index → word
idx2word = {idx: word for word, idx in word2idx.items()}

# 注意：word_index 不会自动添加 <pad>，如果你在训练时加了 pad_idx=0，要手动加：
word2idx["<pad>"] = 0
idx2word[0] = "<pad>"

# src_vocab 和 trg_vocab 就是这个 word2idx（如果是共享词表的话）
src_vocab = word2idx
trg_vocab = word2idx

✅ Tokenizer is loaded successfully: /tokenizer/tokenizerForHealthCare.pkl
tokenizer done, with length 41978
vocab size: 15000


In [3]:
from models.BiLSTM import EncoderBiLSTM, DecoderLSTM, Seq2Seq, Attention

# 假设你已经知道下面这些参数：
vocab_size = TOKENIZER.num_words + 1  
embedding_dim = 256
hidden_dim = 512
pad_idx = 0
output_dim = vocab_size  # 生成任务，输出词表大小和输入相同
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

attn = Attention(hidden_dim)
encoder = EncoderBiLSTM(vocab_size, embedding_dim, hidden_dim, pad_idx)
decoder = DecoderLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, attn)
model = Seq2Seq(encoder, decoder, pad_idx=pad_idx, device=DEVICE).to(DEVICE)


In [4]:
# 加载保存的参数
model.load_state_dict(torch.load("checkpoint/weight_biLSTM_100.pth", map_location=DEVICE))
model.eval()


Seq2Seq(
  (encoder): EncoderBiLSTM(
    (embedding): Embedding(15001, 256, padding_idx=0)
    (lstm): LSTM(256, 512, batch_first=True, bidirectional=True)
  )
  (decoder): DecoderLSTM(
    (embedding): Embedding(15001, 256, padding_idx=0)
    (lstm): LSTM(1280, 512, batch_first=True)
    (fc_out): Linear(in_features=1536, out_features=15001, bias=True)
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
  )
)

In [5]:
def translate_sentence(model, src_tensor, src_vocab, trg_vocab, device, max_len=50):
    model.eval()

    src_tensor = src_tensor.unsqueeze(0).to(device)  # [1, src_len]
    mask = model.create_mask(src_tensor)

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor)

        # 处理 hidden, cell 初始化
        hidden = torch.tanh(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = torch.tanh(torch.cat((cell[0:1], cell[1:2]), dim=2))
        hidden = hidden[:, :, :model.decoder.decoder_hidden_dim]
        cell = cell[:, :, :model.decoder.decoder_hidden_dim]

    # 第一个 decoder 输入是 <sos>
    trg_indices = [trg_vocab['<start>']]

    for _ in range(max_len):
        prev_input = torch.LongTensor([trg_indices[-1]]).to(device)  # [1]
        
        with torch.no_grad():
            output, hidden, cell, _ = model.decoder(prev_input, hidden, cell, encoder_outputs, mask)

        next_token = output.argmax(1).item()
        trg_indices.append(next_token)

        if next_token == trg_vocab['<end>']:
            break

    # 去除 <sos> 和 <eos>
    return trg_indices[1:-1]


In [6]:
sentence = "i love machine learning"
src_tokens = [src_vocab.get(tok, src_vocab['<UNKNOWN>']) for tok in sentence.split()]
src_tensor = torch.LongTensor(src_tokens)

output_ids = translate_sentence(model, src_tensor, src_vocab, trg_vocab, DEVICE)
output_words = [idx2word[idx] for idx in output_ids]

print(" ".join(output_words))


, thank you for chat doctor . i have gone your query . understand your . in my opinion you should start . . . . is not to be out . . . of these symptoms to you to with of a . . to is family for
