In [2]:
import os
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers


2025-05-31 00:08:12.198991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748642892.257838    1355 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748642892.274469    1355 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-31 00:08:12.417479: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
sent_data_path = 'data/zh_sent_dataset.tsv'
t9_data_path = 'data/zh_T9_dataset.tsv'

# Read every column as text. Without dtype=str pandas may infer numbers for
# digit-only values (T9 codes are digit strings), which would break the
# string-keyed lookups below and make re.sub() fail on non-string sentences.
sentences = pd.read_csv(sent_data_path, sep="\t", header=None, names=["sentence"], dtype=str)
sentences = sentences.dropna(subset=["sentence"]).reset_index(drop=True)

codes = pd.read_csv(t9_data_path, sep="\t", header=None, names=["code", "char"], dtype=str)
codes = (
    codes
    .dropna(subset=["code", "char"])
    .drop_duplicates(subset=["code", "char"])
    .reset_index(drop=True)
)

# Build Nine-Key Code Mappings
code2chars = defaultdict(list)  # code -> every entry typed with that code, in file order
char2code = {}                  # entry -> code (one code per entry)

# zip over the two columns avoids the per-row Series that iterrows() builds.
for code, char in zip(codes["code"], codes["char"]):
    code2chars[code].append(char)
    # An entry listed under several codes keeps the last code seen in the file.
    char2code[char] = code

# Build Training Samples
# For each character in sentence, use previous text as context and current code as input

samples = []

window_size = 20  # Limit context length (in characters)

non_chinese = re.compile(r"[^\u4e00-\u9fa5]")

for sentence in sentences["sentence"]:
    sentence = non_chinese.sub("", sentence)  # Remove non-Chinese characters
    for i, char in enumerate(sentence):
        code = char2code.get(char)
        if code is None:
            # Character has no T9 code: it yields no sample of its own but
            # still appears in the context of the characters that follow it.
            continue
        context = sentence[max(0, i - window_size):i]
        samples.append((context, code, char))

print(f"Total samples: {len(samples)}")


Total samples: 1301608


In [4]:
# Build Vocabulary & Vectorization
# Ids start at 1 in both tables; 0 is reserved for padding.
# NOTE(review): the keys of char2code come straight from the T9 table, which
# appears to contain multi-character words as well (see the output of the
# '2878' demo call), while training targets are single characters - confirm
# that these extra output classes are intended.
all_chars = sorted(char2code)
char2idx = {entry: idx for idx, entry in enumerate(all_chars, start=1)}
idx2char = dict(enumerate(all_chars, start=1))

code_set = sorted(code2chars)
code2idx = {t9_code: idx for idx, t9_code in enumerate(code_set, start=1)}

# The model's context input has the same length as the sampling window.
max_context_len = window_size

def encode_context(text):
    """Translate ``text`` into vocabulary ids, keeping only the most recent ones.

    Each character is looked up in ``char2idx``; characters that are not in
    the vocabulary become 0, the same id that is used for padding. At most the
    last ``max_context_len`` ids are returned. Shorter inputs are returned
    as-is (no padding is added here).
    """
    ids = []
    for ch in text:
        ids.append(char2idx.get(ch, 0))
    return ids[-max_context_len:]

def encode_code(code):
    """Return the integer id of a T9 ``code``, or 0 when the code is unknown.

    Ids come from ``code2idx``; 0 is the value reserved for padding there.
    """
    if code in code2idx:
        return code2idx[code]
    return 0

# Encode every (context, code, char) sample: context -> list of ids,
# code -> id, target char -> id.
X_context = [encode_context(ctx_text) for ctx_text, _, _ in samples]
X_code = [encode_code(sample_code) for _, sample_code, _ in samples]
Y_char = [char2idx[target] for _, _, target in samples]

# Padding
# Contexts are left-padded with 0 so the most recent characters sit at the end.
X_context = keras.preprocessing.sequence.pad_sequences(
    X_context,
    maxlen=max_context_len,
    padding='pre',
)
X_code = np.array(X_code)
Y_char = np.array(Y_char)

# Two-stage split: hold out 20%, then cut that part in half (val / test).
# NOTE(review): the split is per sample, so consecutive characters of the same
# sentence - whose contexts overlap almost entirely - can land in different
# splits. Validation/test accuracy may therefore be optimistic; consider
# splitting by sentence.
(
    X_train_ctx, X_temp_ctx,
    X_train_code, X_temp_code,
    y_train, y_temp,
) = train_test_split(X_context, X_code, Y_char, test_size=0.2, random_state=42)

(
    X_val_ctx, X_test_ctx,
    X_val_code, X_test_code,
    y_val, y_test,
) = train_test_split(X_temp_ctx, X_temp_code, y_temp, test_size=0.5, random_state=42)


In [5]:
# Both vocabularies get one extra slot for the reserved id 0.
vocab_size = len(char2idx) + 1
code_vocab_size = len(code2idx) + 1
embedding_dim = 64

# Context branch: ids -> embeddings -> bidirectional LSTM summary vector.
ctx_input = keras.Input(shape=(max_context_len,), name="context_input")
context_embedding = layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=False,
)
ctx_emb = context_embedding(ctx_input)
# NOTE(review): this Masking layer looks at the embedding *outputs*, so it only
# masks a timestep whose whole embedding vector equals 0.0. The vector learned
# for padding id 0 is very unlikely to be all zeros, so padding is probably not
# masked at all. Switching to mask_zero=True would mask it, but pre-padded masks
# may be rejected or slowed down on the cuDNN LSTM path - verify before changing.
padding_mask = layers.Masking(mask_value=0.0)
ctx_emb = padding_mask(ctx_emb)
context_encoder = layers.Bidirectional(
    layers.LSTM(64, recurrent_activation="sigmoid")
)
ctx_encoded = context_encoder(ctx_emb)

# Code branch: a single code id per sample -> embedding vector.
code_input = keras.Input(shape=(), dtype=tf.int32, name="code_input")
code_embedding = layers.Embedding(input_dim=code_vocab_size, output_dim=32)
code_emb = code_embedding(code_input)
code_encoded = layers.Flatten()(code_emb)

# Head: concatenate both branches and classify over the whole vocabulary.
merged = layers.concatenate([ctx_encoded, code_encoded])
hidden = layers.Dense(128, activation="relu")(merged)
output = layers.Dense(vocab_size, activation="softmax")(hidden)

model = keras.Model(inputs=[ctx_input, code_input], outputs=output)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # labels are integer ids, not one-hot
    metrics=["accuracy"],
)
model.summary()


I0000 00:00:1748642915.842327    1355 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5582 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [6]:
# Stop when the monitored metric (EarlyStopping's default) has not improved
# for 3 epochs in a row, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
callbacks = [early_stopping]

# Inputs are passed by name; the keys match the names of the two Input layers.
train_inputs = {"context_input": X_train_ctx, "code_input": X_train_code}
val_inputs = {"context_input": X_val_ctx, "code_input": X_val_code}

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=15,
    batch_size=256,
    callbacks=callbacks,
)


Epoch 1/15


2025-05-31 00:08:47.597920: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 83302880 exceeds 10% of free system memory.
2025-05-31 00:08:48.538322: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.
I0000 00:00:1748642929.901229    1834 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 33ms/step - accuracy: 0.3321 - loss: 3.6369 - val_accuracy: 0.5464 - val_loss: 1.6130
Epoch 2/15


2025-05-31 00:11:04.031649: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.
2025-05-31 00:11:04.061681: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 35ms/step - accuracy: 0.5634 - loss: 1.5342 - val_accuracy: 0.5951 - val_loss: 1.4220
Epoch 3/15


2025-05-31 00:13:32.404595: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 35ms/step - accuracy: 0.6119 - loss: 1.3428 - val_accuracy: 0.6256 - val_loss: 1.3187
Epoch 4/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 35ms/step - accuracy: 0.6399 - loss: 1.2317 - val_accuracy: 0.6405 - val_loss: 1.2700
Epoch 5/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 33ms/step - accuracy: 0.6573 - loss: 1.1585 - val_accuracy: 0.6529 - val_loss: 1.2374
Epoch 6/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 35ms/step - accuracy: 0.6725 - loss: 1.1031 - val_accuracy: 0.6600 - val_loss: 1.2178
Epoch 7/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 35ms/step - accuracy: 0.6834 - loss: 1.0591 - val_accuracy: 0.6663 - val_loss: 1.1981
Epoch 8/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 32ms/step - accuracy: 0.6926 - loss: 1.0219 - val_accuracy: 0.6705 - val_loss: 1.1983
Epoch 9/1

In [7]:
# Score the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metrics (accuracy).
test_inputs = {"context_input": X_test_ctx, "code_input": X_test_code}
test_loss, test_acc = model.evaluate(test_inputs, y_test)
print(f"Test accuracy: {test_acc:.4f}")


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 11ms/step - accuracy: 0.6777 - loss: 1.1839
Test accuracy: 0.6769


In [None]:
def predict_next(code=None, context="", topk=5, model=None):
    """Rank candidate next entries from the preceding text and an optional T9 code.

    Args:
        code: Nine-key digit string of the entry being typed. When falsy
            (``None`` or ``""``) every vocabulary entry is a candidate;
            otherwise only the entries listed under that code are ranked.
        context: Text typed so far. It is encoded with ``encode_context`` and
            padded to ``max_context_len``.
        topk: Maximum number of candidates to return.
        model: Object with a Keras-style ``predict`` method. When ``None`` the
            notebook-level ``model`` is looked up at call time, so a model
            that was reassigned after this cell ran (for example a reloaded
            one) is the one that gets used.

    Returns:
        A list of ``(entry, score)`` tuples sorted by descending score, at
        most ``topk`` long. The list is empty when ``code`` is given but has
        no entries in ``code2chars``.
    """
    if model is None:
        # Resolve late: a def-time default would keep pointing at whatever
        # object ``model`` was bound to when this cell was executed.
        model = globals()["model"]

    ctx_enc = encode_context(context)
    code_enc = encode_code(code) if code else 0
    ctx_pad = keras.preprocessing.sequence.pad_sequences([ctx_enc], maxlen=max_context_len)
    pred = model.predict(
        {"context_input": ctx_pad, "code_input": np.array([code_enc])},
        verbose=0,
    )[0]

    if code:
        # .get() instead of indexing: code2chars is a defaultdict, and indexing
        # it with an unknown code would silently insert an empty entry.
        possible_chars = dict.fromkeys(code2chars.get(code, ()))  # de-duplicate, keep order
        candidate_ids = [char2idx[c] for c in possible_chars if c in char2idx]
    else:
        candidate_ids = range(len(pred))

    # Drop ids without a vocabulary entry (the padding id 0) *before* cutting
    # to topk, so such an id cannot use up one of the returned slots.
    scored = [(i, pred[i]) for i in candidate_ids if i in idx2char]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [(idx2char[i], score) for i, score in scored[:topk]]


In [9]:
# Smoke-test the predictor: code + context, context only, code only.
demo_cases = [
    ("Input code=7426, context='我想知道'", {"code": '7426', "context": '我想知道'}),
    ("Input code='', context='价格'", {"code": '', "context": '价格'}),
    ("Input code='2878', context=''", {"code": '2878', "context": ''}),
]

for demo_label, demo_kwargs in demo_cases:
    print(demo_label)
    print(predict_next(**demo_kwargs))

Input code=7426, context='我想知道'
[('歉', 0.5661846), ('前', 0.10014085), ('少', 0.09429764), ('乔', 0.06105688), ('瞧', 0.04781567)]
Input code='', context='价格'
[('都', 0.038976934), ('催', 0.026197514), ('或', 0.022033794), ('时', 0.021230502), ('是', 0.017466092)]
Input code='2878', context=''
[('不如', 1.2638449e-05), ('不屈', 1.2382022e-05), ('不俗', 1.2139335e-05), ('粗俗', 1.20106615e-05), ('步入', 1.1948511e-05)]


In [10]:
# Make sure the target directory exists first: saving to a path whose parent
# directory is missing fails instead of creating it.
os.makedirs("model", exist_ok=True)
model.save("model/t9_predictor_model.keras")

def load_model(model_path):
    """Load and return the Keras model stored at ``model_path``."""
    restored = keras.models.load_model(model_path)
    return restored

In [13]:
# Reload the saved model and run a prediction with it.
model = load_model("model/t9_predictor_model.keras")
# Pass the reloaded model explicitly: predict_next's `model` default was bound
# when the function was defined, so without this argument the call would still
# use the in-memory model from training and never exercise the saved file.
print(predict_next(code='287', context='', model=model))

[]
