In [20]:
import pandas as pd

t9_dataset = 'data/zh_T9_dataset.tsv'

# Load the raw table (no header row): T9 digit code, pinyin, hanzi.
# - dtype=str keeps the digit codes as text so they can be sliced below instead of
#   being inferred as integers.
# - keep_default_na=False stops pandas from treating legitimate pinyin such as "nan"
#   (or strings like "null"/"None") as missing values; only truly empty fields are NA.
df = pd.read_csv(
    t9_dataset,
    sep='\t',
    header=None,
    names=['code', 'pinyin', 'hanzi'],
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)
# Discard rows where any of the three fields is empty.
df = df.dropna(subset=['pinyin', 'code', 'hanzi'])

# Data augmentation: mix a typed pinyin prefix with the remaining T9 digits.
# NOTE(review): the mixing assumes `code` and `pinyin` line up character-for-character
# (same length, one digit per letter) - confirm against the dataset.
augmented_inputs = []
output_hanzi = []

for code, pinyin, hanzi in zip(df['code'], df['pinyin'], df['hanzi']):
    # The plain digit code is always a training input.
    augmented_inputs.append(code)
    output_hanzi.append(hanzi)

    # Every proper prefix of the pinyin followed by the untouched tail of the code.
    # range() stops before len(pinyin), so the all-letters form is not generated.
    mixed_inputs = [pinyin[:i] + code[i:] for i in range(1, len(pinyin))]
    augmented_inputs.extend(mixed_inputs)
    output_hanzi.extend([hanzi] * len(mixed_inputs))

# Assemble the augmented (input, output) pairs.
df_aug = pd.DataFrame({'input': augmented_inputs, 'output': output_hanzi})

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Character-level tokenizers: one for the mixed pinyin/digit inputs, one for the hanzi outputs.
input_tokenizer = Tokenizer(char_level=True)
input_tokenizer.fit_on_texts(df_aug['input'])

output_tokenizer = Tokenizer(char_level=True)
output_tokenizer.fit_on_texts(df_aug['output'])

# Encode every string as a list of integer character ids.
X_seq = input_tokenizer.texts_to_sequences(df_aug['input'])
Y_seq = output_tokenizer.texts_to_sequences(df_aug['output'])

# Padding.
# The network defined later in this notebook (Embedding -> LSTM(return_sequences=True)
# -> TimeDistributed(Dense)) emits exactly one prediction per input timestep, so the
# targets must have the same number of timesteps as the inputs. Padding X and Y to
# different lengths makes model.fit fail with a shape mismatch, so both are padded to
# one shared length and both length variables are set to it.
max_seq_len = max(
    max(len(seq) for seq in X_seq),
    max(len(seq) for seq in Y_seq),
)
max_input_len = max_output_len = max_seq_len

X_pad = pad_sequences(X_seq, maxlen=max_input_len, padding='post')
Y_pad = pad_sequences(Y_seq, maxlen=max_output_len, padding='post')

# One-hot encode the targets; +1 leaves room for id 0, which is used for padding.
# NOTE: this materialises a dense (samples, max_seq_len, vocab) array, which can be very
# large for a big hanzi vocabulary.
Y_cat = to_categorical(Y_pad, num_classes=len(output_tokenizer.word_index) + 1)

: 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, then cut that hold-out in half to obtain the
# validation and test sets. The same seed is used for both splits.
split_seed = 42

X_train, X_tmp, Y_train, Y_tmp = train_test_split(
    X_pad,
    Y_cat,
    test_size=0.2,
    random_state=split_seed,
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tmp,
    Y_tmp,
    test_size=0.5,
    random_state=split_seed,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense

# Vocabulary sizes; the extra slot is for id 0, which the tokenizers never assign to a character.
vocab_size_in = len(input_tokenizer.word_index) + 1
vocab_size_out = len(output_tokenizer.word_index) + 1

# Per-timestep tagger: embed each input character, run an LSTM over the sequence,
# and classify every timestep into an output character.
model = Sequential()
model.add(Embedding(input_dim=vocab_size_in, output_dim=128, input_length=max_input_len))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size_out, activation='softmax')))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:

# Train the model, scoring the validation split after every epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=64,
)

In [None]:
import numpy as np

# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [None]:
def predict_hanzi(input_text):
    """Decode a T9 digit / pinyin-prefix string into hanzi with the trained model.

    Args:
        input_text: Raw query string, e.g. "326" or "d26".

    Returns:
        The predicted characters joined in timestep order. Timesteps whose arg-max id
        has no entry in ``output_tokenizer.index_word`` (such as the padding id 0)
        contribute nothing. Returns ``''`` when the input is empty or contains no
        character known to ``input_tokenizer``.
    """
    # Nothing to decode: avoid feeding an all-padding sequence to the model.
    if not input_text:
        return ''

    seq = input_tokenizer.texts_to_sequences([input_text])
    # Characters unknown to the tokenizer are dropped; if none survive there is no signal.
    if not seq[0]:
        return ''

    # NOTE: pad_sequences truncates from the front by default, so an input longer than
    # max_input_len loses its leading characters.
    seq_pad = pad_sequences(seq, maxlen=max_input_len, padding='post')

    # verbose=0 suppresses the per-call Keras progress bar.
    pred = model.predict(seq_pad, verbose=0)[0]
    pred_indices = np.argmax(pred, axis=-1)
    hanzi_pred = [output_tokenizer.index_word.get(i, '') for i in pred_indices]
    return ''.join(hanzi_pred).strip()

In [None]:
# Sample queries: pure T9 digit strings plus one with a typed pinyin prefix ("d26").
examples = ["9265426", "326", "d26", "5268"]

for ex in examples:
    predicted = predict_hanzi(ex)
    print(f"Input: {ex} => Predicted: {predicted}")