In [1]:
import tensorflow as tf

def load_dataset(path, max_len=20):
    """Read (input, target) string pairs from a two-column TSV file.

    Each non-blank line must hold exactly two tab-separated fields after
    surrounding whitespace is stripped. Pairs in which either field is longer
    than ``max_len`` characters are dropped.

    Args:
        path: Path of the UTF-8 encoded TSV file.
        max_len: Maximum allowed length (in characters) of each field.

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists of strings.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    inputs = []
    targets = []
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            fields = stripped.split('\t')
            if len(fields) != 2:
                # Fail loudly, but say where, instead of an opaque unpack error.
                raise ValueError(
                    f"{path}:{line_no}: expected 2 tab-separated fields, "
                    f"got {len(fields)}"
                )
            inp, tgt = fields
            if len(inp) <= max_len and len(tgt) <= max_len:
                inputs.append(inp)
                targets.append(tgt)
    return inputs, targets

In [2]:
# Load the raw string pairs; the path is relative to the working directory and
# load_dataset's default max_len (20) filters out longer samples.
inputs, targets = load_dataset('data/zh_T9_dataset.tsv')

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Character-level vocabularies, one per side; lower=False keeps the characters
# exactly as they appear in the data.
tokenizer_input = Tokenizer(char_level=True, lower=False)
tokenizer_input.fit_on_texts(inputs)

tokenizer_output = Tokenizer(char_level=True, lower=False)
tokenizer_output.fit_on_texts(targets)

# Encode every string as a list of integer ids (0 is reserved for padding).
_input_ids = tokenizer_input.texts_to_sequences(inputs)
_target_ids = tokenizer_output.texts_to_sequences(targets)

# The model emits one prediction per input timestep, so X and y must have the
# same number of timesteps. Padding each side to its own longest sequence only
# works when those lengths happen to coincide, so pad both to a common length.
_common_len = max(
    max(map(len, _input_ids), default=0),
    max(map(len, _target_ids), default=0),
)

X = pad_sequences(_input_ids, maxlen=_common_len, padding='post')
y = pad_sequences(_target_ids, maxlen=_common_len, padding='post')

In [4]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# +1 because token ids start at 1; id 0 is the padding class.
vocab_size_output = len(tokenizer_output.word_index) + 1

# One-hot encode the integer targets: (samples, timesteps) ->
# (samples, timesteps, vocab_size_output). The ndim guard keeps this cell
# idempotent: re-running it must not one-hot encode the already encoded array.
if y.ndim == 2:
    y = to_categorical(y, num_classes=vocab_size_output)

: 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed

# Input vocabulary size (+1 for the padding id 0) and the padded sequence width.
vocab_size_input = len(tokenizer_input.word_index) + 1
max_input_len = X.shape[1]

# Per-timestep classifier: embed each input id, read the sequence in both
# directions, then apply a softmax over the output vocabulary at every step.
model = Sequential([
    Embedding(
        input_dim=vocab_size_input,
        output_dim=128,
        input_length=max_input_len,
    ),
    Bidirectional(LSTM(128, return_sequences=True)),
    TimeDistributed(Dense(vocab_size_output, activation='softmax')),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 10 epochs in batches of 64; validation_split=0.1 sets aside a
# fraction 0.1 of the samples for validation instead of training on them.
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.1)

In [None]:
def predict_top_k(digit_seq, k=5):
    """Return up to ``k`` distinct decodings of ``digit_seq``, most probable first.

    The model yields a probability distribution over output characters at every
    timestep. Candidates are formed by picking one of the ``k`` most probable
    characters per timestep and are enumerated lazily in order of decreasing
    joint probability (sum of per-timestep log-probabilities), so only a handful
    of combinations are ever built instead of all ``k ** timesteps`` of them.

    Args:
        digit_seq: The input string to decode (e.g. ``"4664"``).
        k: Number of candidates to return, and number of alternatives
            considered per timestep.

    Returns:
        A list of at most ``k`` unique, non-empty candidate strings. Padding
        predictions (id 0) are dropped from each candidate.
    """
    import heapq

    if k <= 0:
        return []

    seq = tokenizer_input.texts_to_sequences([digit_seq])
    # The training inputs were padded with padding='post'; the query must be
    # padded the same way or its characters land on the wrong timesteps.
    seq = pad_sequences(seq, maxlen=max_input_len, padding='post')

    pred = model.predict(seq)[0]  # shape: (timesteps, vocab_size_output)

    # Per timestep: ids of the k most probable characters, best first, and
    # their log-probabilities (epsilon guards against log(0)).
    top_ids = np.argsort(pred, axis=-1)[:, -k:][:, ::-1]
    top_logp = np.log(np.take_along_axis(pred, top_ids, axis=-1) + 1e-12)
    n_steps, width = top_ids.shape

    # Best-first search over "rank vectors": ranks[t] selects the ranks[t]-th
    # best character at timestep t. The all-zeros vector is the global optimum;
    # every successor worsens exactly one timestep by one rank, so the heap
    # pops combinations in order of decreasing joint probability.
    start = (0,) * n_steps
    heap = [(-float(top_logp[:, 0].sum()), start)]
    visited = {start}

    results = []
    emitted = set()
    pops = 0
    max_pops = 1000 * k  # safety bound if most combinations decode to duplicates

    while heap and len(results) < k and pops < max_pops:
        neg_score, ranks = heapq.heappop(heap)
        pops += 1

        ids = [int(top_ids[t, r]) for t, r in enumerate(ranks)]
        candidate = ''.join(
            tokenizer_output.index_word.get(i, '') for i in ids if i != 0
        ).strip()
        if candidate and candidate not in emitted:
            emitted.add(candidate)
            results.append(candidate)

        for t, r in enumerate(ranks):
            if r + 1 >= width:
                continue
            successor = ranks[:t] + (r + 1,) + ranks[t + 1:]
            if successor in visited:
                continue
            visited.add(successor)
            penalty = float(top_logp[t, r] - top_logp[t, r + 1])
            heapq.heappush(heap, (neg_score + penalty, successor))

    return results

In [None]:
# Demo: decode one digit sequence and print the suggestions as a numbered list.
results = predict_top_k("4664", k=5)

for i, candidate in enumerate(results, start=1):
    print(f"{i}. {candidate}")