In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict, Counter


2025-05-30 12:41:32.756836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748601692.787644    2487 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748601692.791346    2487 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-30 12:41:32.808146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
sent_data_path = 'data/zh_sent_dataset.tsv'
t9_data_path = 'data/zh_T9_dataset.tsv'

sentences = pd.read_csv(sent_data_path, sep="\t", header=None, names=["sentence"])
codes = pd.read_csv(t9_data_path, sep="\t", header=None, names=["code", "char"])
codes = codes.drop_duplicates(subset=["code", "char"]).reset_index(drop=True)

# Build Nine-Key Code Mappings
code2chars = defaultdict(list)
char2code = {}

for _, row in codes.iterrows():
    code2chars[row.code].append(row.char)
    char2code[row.char] = row.code

# Build Training Samples
# For each character in sentence, use previous text as context and current code as input

samples = []

window_size = 20  # Limit context length (in characters)

for sentence in sentences["sentence"]:
    sentence = re.sub(r"[^\u4e00-\u9fa5]", "", sentence)  # Remove non-Chinese characters
    for i in range(len(sentence)):
        char = sentence[i]
        code = char2code.get(char)
        if code is None:
            continue
        context = sentence[max(0, i - window_size):i]
        samples.append((context, code, char))

print(f"Total samples: {len(samples)}")


Total samples: 1301608


In [4]:

# Build Vocabulary & Vectorization
all_chars = sorted(set(char2code.keys()))
char2idx = {c: i + 1 for i, c in enumerate(all_chars)}  # 0 用作 padding
idx2char = {i: c for c, i in char2idx.items()}

code_set = sorted(code2chars.keys())
code2idx = {c: i + 1 for i, c in enumerate(code_set)}  # 0 为 padding

max_context_len = window_size

def encode_context(text):
    return [char2idx.get(c, 0) for c in text][-max_context_len:]

def encode_code(code):
    return code2idx.get(code, 0)

X_context = []
X_code = []
Y_char = []

for ctx, code, char in samples:
    X_context.append(encode_context(ctx))
    X_code.append(encode_code(code))
    Y_char.append(char2idx[char])

# Padding
X_context = keras.preprocessing.sequence.pad_sequences(X_context, maxlen=max_context_len, padding='pre')
X_code = np.array(X_code)
Y_char = np.array(Y_char)

X_train_ctx, X_temp_ctx, X_train_code, X_temp_code, y_train, y_temp = train_test_split(
    X_context, X_code, Y_char, test_size=0.2, random_state=42)

X_val_ctx, X_test_ctx, X_val_code, X_test_code, y_val, y_test = train_test_split(
    X_temp_ctx, X_temp_code, y_temp, test_size=0.5, random_state=42)


In [5]:
vocab_size = len(char2idx) + 1
code_vocab_size = len(code2idx) + 1
embedding_dim = 64

ctx_input = keras.Input(shape=(max_context_len,), name="context_input")
ctx_emb = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False)(ctx_input)
ctx_emb = layers.Masking(mask_value=0.0)(ctx_emb)
ctx_encoded = layers.Bidirectional(
layers.LSTM(64, recurrent_activation="sigmoid")
)(ctx_emb)

code_input = keras.Input(shape=(), dtype=tf.int32, name="code_input")
code_emb = layers.Embedding(input_dim=code_vocab_size, output_dim=32)(code_input)
code_encoded = layers.Flatten()(code_emb)

merged = layers.concatenate([ctx_encoded, code_encoded])
hidden = layers.Dense(128, activation="relu")(merged)
output = layers.Dense(vocab_size, activation="softmax")(hidden)

model = keras.Model(inputs=[ctx_input, code_input], outputs=output)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


I0000 00:00:1748601782.562044    2487 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5582 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [6]:
callbacks = [
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
]

history = model.fit(
    {"context_input": X_train_ctx, "code_input": X_train_code},
    y_train,
    validation_data=(
        {"context_input": X_val_ctx, "code_input": X_val_code},
        y_val
    ),
    epochs=15,
    batch_size=256,
    callbacks=callbacks
)


Epoch 1/15


2025-05-30 12:43:10.705438: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 83302880 exceeds 10% of free system memory.
2025-05-30 12:43:11.633056: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.
I0000 00:00:1748601793.009223    2931 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 35ms/step - accuracy: 0.3309 - loss: 3.6645 - val_accuracy: 0.5403 - val_loss: 1.6312
Epoch 2/15


2025-05-30 12:45:35.212821: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.
2025-05-30 12:45:35.278099: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 35ms/step - accuracy: 0.5613 - loss: 1.5434 - val_accuracy: 0.5973 - val_loss: 1.4175
Epoch 3/15


2025-05-30 12:48:55.704249: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22361600 exceeds 10% of free system memory.


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 33ms/step - accuracy: 0.6111 - loss: 1.3468 - val_accuracy: 0.6247 - val_loss: 1.3209
Epoch 4/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 35ms/step - accuracy: 0.6393 - loss: 1.2332 - val_accuracy: 0.6411 - val_loss: 1.2708
Epoch 5/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 35ms/step - accuracy: 0.6566 - loss: 1.1625 - val_accuracy: 0.6507 - val_loss: 1.2422
Epoch 6/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 33ms/step - accuracy: 0.6704 - loss: 1.1091 - val_accuracy: 0.6602 - val_loss: 1.2245
Epoch 7/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 33ms/step - accuracy: 0.6830 - loss: 1.0632 - val_accuracy: 0.6654 - val_loss: 1.2046
Epoch 8/15
[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 35ms/step - accuracy: 0.6929 - loss: 1.0251 - val_accuracy: 0.6704 - val_loss: 1.1973
Epoch 9/1

In [7]:
test_loss, test_acc = model.evaluate(
    {"context_input": X_test_ctx, "code_input": X_test_code},
    y_test
)
print(f"Test accuracy: {test_acc:.4f}")


[1m4068/4068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 11ms/step - accuracy: 0.6744 - loss: 1.1950
Test accuracy: 0.6739


In [8]:
def predict_next(code=None, context="", topk=5):
    ctx_enc = encode_context(context)
    code_enc = encode_code(code) if code else 0
    ctx_pad = keras.preprocessing.sequence.pad_sequences([ctx_enc], maxlen=max_context_len)
    pred = model.predict({"context_input": ctx_pad, "code_input": np.array([code_enc])}, verbose=0)[0]
    if code:
        possible_chars = code2chars[code]
        possible_ids = [char2idx[c] for c in possible_chars if c in char2idx]
        filtered = [(i, pred[i]) for i in possible_ids]
    else:
        filtered = list(enumerate(pred))

    filtered = sorted(filtered, key=lambda x: x[1], reverse=True)
    return [(idx2char[i], score) for i, score in filtered[:topk] if i in idx2char]


In [9]:
print("Input code=7426, context='我想知道'")
print(predict_next(code='7426', context='我想知道'))

print("Input code='', context='价格'")
print(predict_next(code='', context='价格'))

print("Input code='2878', context=''")
print(predict_next(code='2878', context=''))

Input code=7426, context='我想知道'
[('少', 0.24244513), ('少', 0.24244513), ('少', 0.24244513), ('少', 0.24244513), ('少', 0.24244513)]
Input code='', context='价格'
[('有', 0.046795007), ('是', 0.045057654), ('每', 0.04019928), ('内', 0.039380517), ('价', 0.038499665)]
Input code='2878', context=''
[('粗俗', 1.0067982e-05), ('不如', 9.943495e-06), ('不如', 9.943495e-06), ('不如', 9.943495e-06), ('不如', 9.943495e-06)]


In [11]:
# Rebuild the model on CPU to avoid CudnnRNN ops
with tf.device('/CPU:0'):
    ctx_input = keras.Input(shape=(max_context_len,), name="context_input")
    ctx_emb = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False)(ctx_input)
    ctx_emb = layers.Masking(mask_value=0.0)(ctx_emb)
    ctx_encoded = layers.Bidirectional(
        layers.LSTM(64, recurrent_activation="sigmoid", unroll=True)
    )(ctx_emb)

    code_input = keras.Input(shape=(), dtype=tf.int32, name="code_input")
    code_emb = layers.Embedding(input_dim=code_vocab_size, output_dim=32)(code_input)
    code_encoded = layers.Flatten()(code_emb)

    merged = layers.concatenate([ctx_encoded, code_encoded])
    hidden = layers.Dense(128, activation="relu")(merged)
    output = layers.Dense(vocab_size, activation="softmax")(hidden)

    cpu_model = keras.Model(inputs=[ctx_input, code_input], outputs=output)
    cpu_model.set_weights(model.get_weights())

converter = tf.lite.TFLiteConverter.from_keras_model(cpu_model)
tflite_model = converter.convert()

with open("model/T9_predictor.tflite", "wb") as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: /tmp/tmpm_dg0772/assets


INFO:tensorflow:Assets written to: /tmp/tmpm_dg0772/assets


Saved artifact at '/tmp/tmpm_dg0772'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 20), dtype=tf.float32, name='context_input'), TensorSpec(shape=(None,), dtype=tf.int32, name='code_input')]
Output Type:
  TensorSpec(shape=(None, 43675), dtype=tf.float32, name=None)
Captures:
  140087889398288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889400016: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889398672: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889401168: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889401744: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889400784: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889402320: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889402896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889403088: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140087889403664: Ten

W0000 00:00:1748603886.189156    2487 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1748603886.189186    2487 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-05-30 13:18:06.189343: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpm_dg0772
2025-05-30 13:18:06.190556: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-05-30 13:18:06.190566: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpm_dg0772
2025-05-30 13:18:06.209078: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-05-30 13:18:06.297441: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpm_dg0772
2025-05-30 13:18:06.328585: I tensorflow/cc/saved_model/loader.cc:466] SavedModel load for tags { serve }; Status: success: OK. Took 139244 microseconds.
