In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re

In [10]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().split('\n')
    data = [line.split('\t') for line in lines if line]
    return data

# 加载数据
file_path = 'deu.txt'  # 替换为你的文件路径
data = load_data(file_path)

# 提取德语句子和英语句子
de_sentences = [pair[1] for pair in data]
en_sentences = [pair[0] for pair in data]

In [11]:
def preprocess_sentence(sentence):
    # 转换为小写
    sentence = sentence.lower()
    # 去掉除空格外的所有标点符号
    sentence = re.sub(r'[^\w\s]', '', sentence)
    return sentence

# 对德语和英语句子进行预处理
de_sentences = [preprocess_sentence(sentence) for sentence in de_sentences]
en_sentences = [preprocess_sentence(sentence) for sentence in en_sentences]

In [4]:
# 初始化字符级分词器
de_tokenizer = Tokenizer(char_level=True)
en_tokenizer = Tokenizer(char_level=True)

# 拟合分词器
de_tokenizer.fit_on_texts(de_sentences)
en_tokenizer.fit_on_texts(en_sentences)

# 获取字符索引
de_char_index = de_tokenizer.word_index
en_char_index = en_tokenizer.word_index

# 打印字符索引
print("德语字符索引：", de_char_index)
print("英语字符索引：", en_char_index)

德语字符索引： {' ': 1, 'e': 2, 't': 3, 'o': 4, 'a': 5, 'i': 6, 'n': 7, 's': 8, 'h': 9, 'r': 10, 'd': 11, 'l': 12, 'm': 13, '.': 14, 'y': 15, 'u': 16, 'w': 17, 'g': 18, 'c': 19, 'f': 20, 'p': 21, "'": 22, 'b': 23, 'k': 24, 'v': 25, '?': 26, ',': 27, 'j': 28, 'x': 29, 'q': 30, 'z': 31, '"': 32, '0': 33, '-': 34, '!': 35, '3': 36, '1': 37, '2': 38, ':': 39, '5': 40, '9': 41, '6': 42, '8': 43, '4': 44, '7': 45, '$': 46, '%': 47, '’': 48, 'é': 49, ';': 50, '/': 51, '\xa0': 52, '₂': 53, '€': 54, '“': 55, '”': 56, '+': 57, '°': 58, '\xad': 59, 'ü': 60, '—': 61, '(': 62, ')': 63, 'ï': 64, 'ñ': 65, 'ō': 66, '‘': 67, 'à': 68, '\u200b': 69, 'ú': 70, 'â': 71, 'ç': 72, 'ê': 73, 'ã': 74, '@': 75, 'á': 76, '–': 77}
英语字符索引： {' ': 1, 'e': 2, 'i': 3, 'n': 4, 't': 5, 's': 6, 'h': 7, 'r': 8, 'a': 9, 'c': 10, 'd': 11, 'm': 12, 'u': 13, 'l': 14, 'o': 15, '.': 16, 'g': 17, 'w': 18, 'b': 19, 'f': 20, 'k': 21, 'z': 22, ',': 23, 'v': 24, 'ü': 25, '?': 26, 'p': 27, 'ä': 28, 'ö': 29, 'ß': 30, 'j': 31, '!': 32, 'y': 33,

In [15]:
# 将句子转换为字符序列
de_sequences = de_tokenizer.texts_to_sequences(de_sentences)
en_sequences = en_tokenizer.texts_to_sequences(en_sentences)

# 确定最大序列长度
max_de_seq_length = max(len(seq) for seq in de_sequences)
max_en_seq_length = max(len(seq) for seq in en_sequences)
print("最大德语长度:{}".format(max_de_seq_length))

# 填充序列
de_padded_sequences = pad_sequences(de_sequences, maxlen=max_de_seq_length, padding='post')
en_padded_sequences = pad_sequences(en_sequences, maxlen=max_en_seq_length, padding='post')

# 获取字符总数
num_de_chars = len(de_char_index) + 1  # 加1是为了包含padding字符
num_en_chars = len(en_char_index) + 1

# 转换为one-hot向量
de_one_hot = to_categorical(de_padded_sequences, num_classes=num_de_chars)
en_one_hot = to_categorical(en_padded_sequences, num_classes=num_en_chars)

最大德语长度:472


MemoryError: Unable to allocate 76.2 GiB for an array with shape (131164552, 78) and data type float64

~~暂时不知道为什么这么大，~~不过似乎暂时没法继续了。
我猜是因为句子太多了，转换为的矩阵非常大；
在机器翻译时，我们的输入实际上是三维向量（句子数（每60词分隔为一个句子），字符数（60），单词表）

最大德语长度是472，然而据我所知，应该都是短句才对 。

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

In [None]:
# 定义模型参数
latent_dim = 256  # LSTM单元数

# 编码器
encoder_inputs = Input(shape=(max_de_seq_length,))
encoder_embedding = Embedding(input_dim=num_de_chars, output_dim=latent_dim, input_length=max_de_seq_length)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# 解码器
decoder_inputs = Input(shape=(max_en_seq_length,))
decoder_embedding = Embedding(input_dim=num_en_chars, output_dim=latent_dim, input_length=max_en_seq_length)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_en_chars, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 定义模型
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 打印模型结构
model.summary()