In [2]:
import pandas as pd

t9_dataset = '../data/zh_T9_dataset.tsv'
# Load the raw data. Every column is read as text:
#   * `code` is sliced character by character below, so it must stay a string
#     and must not be inferred as an integer column;
#   * pandas' default NA markers include the literal strings "nan" and "null",
#     so the pinyin syllable "nan" would otherwise be parsed as missing and
#     then silently removed by dropna(). Only truly empty fields count as
#     missing here.
df = pd.read_csv(
    t9_dataset,
    sep='\t',
    header=None,
    names=['code', 'pinyin', 'hanzi'],
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)
df = df.dropna(subset=['pinyin', 'code', 'hanzi'])

# Data augmentation: mix a pinyin prefix with the remaining digit suffix.
augmented_inputs = []
output_hanzi = []

for code, pinyin, hanzi in zip(df['code'], df['pinyin'], df['hanzi']):
    # Keep the original all-digit input.
    augmented_inputs.append(code)
    output_hanzi.append(hanzi)

    # Add one sample per split point: first i characters typed as letters,
    # the rest still as key digits.
    # NOTE(review): this assumes `pinyin` and `code` are aligned character by
    # character (same length) — confirm against the dataset.
    for i in range(1, len(pinyin)):
        input_mixed = pinyin[:i] + code[i:]
        augmented_inputs.append(input_mixed)
        output_hanzi.append(hanzi)

# Assemble the augmented (input, output) pairs into a DataFrame.
df_aug = pd.DataFrame({'input': augmented_inputs, 'output': output_hanzi})

In [3]:
# Preview the first augmented (input, output) pairs.
df_aug.head()

Unnamed: 0,input,output
0,52944,蜡纸
1,l2944,蜡纸
2,la944,蜡纸
3,laz44,蜡纸
4,lazh4,蜡纸


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Character-level tokenizers: one vocabulary for the typed input
# (digits / letters) and one for the hanzi output.
input_tokenizer = Tokenizer(char_level=True)
output_tokenizer = Tokenizer(char_level=True)
input_tokenizer.fit_on_texts(df_aug['input'])
output_tokenizer.fit_on_texts(df_aug['output'])

# Turn every string into a list of integer character ids.
X_seq = input_tokenizer.texts_to_sequences(df_aug['input'])
Y_seq = output_tokenizer.texts_to_sequences(df_aug['output'])

# Longest sequence on each side; both sides are padded to the larger of the
# two so inputs and targets share the same number of time steps.
max_input_len = max(map(len, X_seq))
max_output_len = max(map(len, Y_seq))
max_seq_len = max(max_input_len, max_output_len)

# Right-pad with zeros up to the shared length.
X_pad, Y_pad = (
    pad_sequences(sequences, maxlen=max_seq_len, padding='post')
    for sequences in (X_seq, Y_seq)
)

# The targets are deliberately left as integer ids (no one-hot encoding via
# to_categorical): the model is compiled with sparse_categorical_crossentropy.

In [9]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the samples as a temporary pool.
# NOTE(review): the split is row-wise, so augmented variants of the same
# word can land on both sides of the split — confirm that this is intended.
X_train, X_tmp, Y_train, Y_tmp = train_test_split(
    X_pad,
    Y_pad,
    test_size=0.2,
    random_state=42,
)

# Step 2: cut the temporary pool in half -> validation set and test set.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tmp,
    Y_tmp,
    test_size=0.5,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Train: (246907, 35), Val: (30863, 35), Test: (30864, 35)


In [10]:
# Sanity check: the targets are integer ids with the same shape as the inputs.
Y_train.shape

(246907, 35)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense

# Vocabulary sizes: +1 because the tokenizer ids start at 1 and id 0 is the
# value used for padding.
vocab_size_in = len(input_tokenizer.word_index) + 1
vocab_size_out = len(output_tokenizer.word_index) + 1

# Sequence-labelling model: one softmax over the hanzi vocabulary per time step.
# The declared input length must match the width the data was actually padded
# to (max_seq_len), not max_input_len, which can be shorter.
model = Sequential([
    Embedding(input_dim=vocab_size_in, output_dim=128, input_length=max_seq_len),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(vocab_size_out, activation='softmax'))
])

# Sparse loss: the targets are integer ids, not one-hot vectors.
# NOTE(review): padding positions are not masked, so the reported accuracy
# also counts padded time steps — confirm whether this is acceptable.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [12]:

# Train the model, monitoring loss/accuracy on the validation split
# after every epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
[1m3858/3858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 127ms/step - accuracy: 0.9239 - loss: 0.7418 - val_accuracy: 0.9282 - val_loss: 0.4963
Epoch 2/10
[1m1028/3858[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m5:54[0m 125ms/step - accuracy: 0.9283 - loss: 0.4898

KeyboardInterrupt: 

In [None]:
from pathlib import Path

def save_t9_model(model, save_dir='model', model_name='t9_model'):
    """Save a model under ``save_dir`` and print where it was written.

    Args:
        model: Object exposing ``save(filepath)`` (e.g. a Keras model).
        save_dir: Target directory; it is created (with parents) if missing.
        model_name: File name of the saved model. If it does not already end
            in ``.keras``, ``.h5`` or ``.hdf5``, ``.keras`` is appended,
            because recent Keras versions reject paths without a supported
            extension.
    """
    save_path = Path(save_dir)
    # Saving into a directory that does not exist yet would fail.
    save_path.mkdir(parents=True, exist_ok=True)

    path = save_path / model_name
    if path.suffix.lower() not in ('.keras', '.h5', '.hdf5'):
        # Append rather than replace, so dotted names like "t9.v2" survive.
        path = path.with_name(path.name + '.keras')

    model.save(str(path))
    print(f"Model saved to：{path}")

# Persist the current model using the default directory and file name.
save_t9_model(model)

In [None]:
import matplotlib.pyplot as plt

def visualize_history(history):
    """Plot the training curves (accuracy and loss) side by side.

    Args:
        history: The History object returned by ``model.fit()``.

    Prints a message and returns without plotting when ``history`` is falsy
    or holds no ``'accuracy'`` series. Validation curves are drawn only when
    they are present and non-empty.
    """
    has_accuracy = bool(history) and 'accuracy' in history.history
    if not has_accuracy:
        print("❌ 无法可视化：未检测到训练历史")
        return

    record = history.history
    train_acc, train_loss = record['accuracy'], record['loss']
    val_acc = record.get('val_accuracy', [])
    val_loss = record.get('val_loss', [])

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(train_acc) + 1)

    # One panel per metric: (label, training series, validation series).
    panels = (
        ('Accuracy', train_acc, val_acc),
        ('Loss', train_loss, val_loss),
    )

    plt.figure(figsize=(12, 5))
    for position, (label, train_curve, val_curve) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, label=f'Train {label}')
        if val_curve:
            plt.plot(epochs, val_curve, label=f'Val {label}')
        plt.title(f'Model {label}')
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np

# Score the model on the held-out test split; evaluate() yields the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=Y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [None]:
def predict_hanzi(input_text):
    """Predict the hanzi string for one typed input.

    Args:
        input_text: The typed sequence (key digits, pinyin letters, or a mix).

    Returns:
        The predicted characters joined into one string, with surrounding
        whitespace stripped. Predicted ids that have no entry in the output
        vocabulary (such as the padding id 0) contribute nothing.
    """
    seq = input_tokenizer.texts_to_sequences([input_text])
    # Pad to the width the training data was padded to (max_seq_len), which
    # can be larger than max_input_len.
    seq_pad = pad_sequences(seq, maxlen=max_seq_len, padding='post')
    # verbose=0: no progress bar for each single-sample prediction.
    pred = model.predict(seq_pad, verbose=0)[0]
    # Most likely output id at every time step.
    pred_indices = np.argmax(pred, axis=-1)
    hanzi_pred = [output_tokenizer.index_word.get(i, '') for i in pred_indices]
    return ''.join(hanzi_pred).strip()

In [None]:
# A few sample inputs: pure key digits and one mixed letter/digit string.
examples = ["9265426", "326", "d26", "5268"]

for sample in examples:
    predicted = predict_hanzi(sample)
    print(f"Input: {sample} => Predicted: {predicted}")