In [2]:
import torch
import torch.nn as nn

In [6]:
# Lookup table with 10 rows (num_embeddings), each a 20-dimensional vector (embedding_dim).
emb = nn.Embedding(10, 20)


In [13]:
# Shape of the embedding weight matrix: (num_embeddings, embedding_dim).
emb.weight.data.shape

torch.Size([10, 20])

In [35]:
# Character-level vocabulary: the letters 'a'..'j' mapped to indices 0..9.
sets = [chr(code) for code in range(ord("a"), ord("a") + 10)]
vocab = dict(zip(sets, range(len(sets))))
print(vocab)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9}


In [38]:
# Input words, each encoded as the list of vocabulary indices of its characters.
words = ["bed", "ice", "bee"]
token_index = [list(map(vocab.__getitem__, word)) for word in words]
token_index

[[1, 4, 3], [8, 2, 4], [1, 4, 4]]

In [40]:
# Turn the nested index lists (three words of three characters each) into a 2-D tensor.
input_data = torch.tensor(token_index)
print(input_data)

tensor([[1, 4, 3],
        [8, 2, 4],
        [1, 4, 4]])


In [41]:
# Overwrite row i of the embedding table with the constant value i so that each
# lookup is easy to recognise in the printed output.
# The write is done under torch.no_grad() rather than through `.data`, and the
# row count is read from the layer instead of being hard-coded to 10.
with torch.no_grad():
    for i in range(emb.num_embeddings):
        emb.weight[i] = i
print(emb.weight.detach())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2.],
        [3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3.],
        [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4.],
        [5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
         5., 5.],
        [6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6.,
         6., 6.],
        [7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
         7., 7.],
        [8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8.,
         8., 8.],
        [9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9.,
         9., 9.]])


In [43]:
# Replace every token index in the batch with its row of the embedding table.
result = emb(input_data)
result,result.shape # (batch_size, seq_len, embedding_size)

(tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
           1., 1., 1.],
          [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4.],
          [3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
           3., 3., 3.]],
 
         [[8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8., 8.,
           8., 8., 8.],
          [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
           2., 2., 2.],
          [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4.]],
 
         [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
           1., 1., 1.],
          [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4.],
          [4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
           4., 4., 4.]]], grad_fn=<EmbeddingBackward0>),
 torch.Size([3, 3, 20]))

In [None]:
# nn.Embedding needs both num_embeddings and embedding_dim; leaving out the
# second argument raises a TypeError. Use 20-dimensional vectors, matching `emb`.
emb2 = nn.Embedding(10, 20)

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
import os

# 1. Data preparation
# Small inline training corpus (a passage about Alice and the White Rabbit).
text = """
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: 
once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 
'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid),
whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, 
when suddenly a White Rabbit with pink eyes ran close by her.
"""


def tokenize(text):
    r"""Split ``text`` into lowercase word tokens.

    The text is lowercased and every maximal run of ``\w`` characters becomes
    one token, so whitespace, punctuation and hyphens act as separators
    (``"daisy-chain"`` yields ``["daisy", "chain"]``).

    Returns:
        list[str]: the tokens in order of appearance.
    """
    # Deliberately simple; swap in a more sophisticated tokenizer if needed.
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())

def build_vocab(tokens, min_freq=1):
    """Build word<->index lookup tables from a token sequence.

    Index 0 is reserved for ``'<pad>'`` and index 1 for ``'<unk>'``; the
    remaining indices are assigned to the kept words in sorted order.

    Args:
        tokens: iterable of hashable, mutually sortable tokens (strings here).
        min_freq: a word is kept only if it occurs at least this many times.

    Returns:
        tuple[dict, dict]: ``(word_to_ix, ix_to_word)``. The indices are
        contiguous ``0 .. len(word_to_ix) - 1``.
    """
    special_tokens = ['<pad>', '<unk>']
    counter = Counter(tokens)
    # A corpus token that happens to equal a special token must not be added a
    # second time: the duplicate would move the special token off its reserved
    # index and leave a gap in ix_to_word, so len(word_to_ix) would no longer
    # be a valid embedding-table size.
    kept_words = sorted(
        word for word, freq in counter.items()
        if freq >= min_freq and word not in special_tokens
    )
    # Add the special tokens in front of the regular words.
    vocab = special_tokens + kept_words
    word_to_ix = {word: idx for idx, word in enumerate(vocab)}
    ix_to_word = {idx: word for word, idx in word_to_ix.items()}
    return word_to_ix, ix_to_word

def encode_tokens(tokens, word_to_ix):
    """Map each token to its vocabulary index.

    Tokens missing from ``word_to_ix`` are encoded as the index stored under
    the ``'<unk>'`` key.

    Returns:
        list: one index per input token, in the same order.
    """
    encoded = []
    for token in tokens:
        encoded.append(word_to_ix.get(token, word_to_ix['<unk>']))
    return encoded

# Load and preprocess the data
# text = load_text('data.txt')
tokens = tokenize(text)

# Build the lookup tables, then turn the token stream into a list of indices.
word_to_ix, ix_to_word = build_vocab(tokens, min_freq=1)
encoded_tokens = encode_tokens(tokens, word_to_ix)

# Vocabulary size includes the two special tokens added by build_vocab.
vocab_size = len(word_to_ix)
print(f"词汇表大小: {vocab_size}")

# 3. Create input/target pairs
def create_context_targets(encoded_tokens, context_size=2):
    """Slide a window over the token indices to build next-token samples.

    Sample ``k`` uses the ``context_size`` consecutive tokens starting at
    position ``k`` as its input and the token right after them as its target.

    Returns:
        tuple[list, list]: ``(inputs, targets)`` of equal length, where each
        input is a slice of ``encoded_tokens`` and each target a single token.
    """
    n_samples = len(encoded_tokens) - context_size
    inputs = [encoded_tokens[start:start + context_size] for start in range(n_samples)]
    targets = [encoded_tokens[start + context_size] for start in range(n_samples)]
    return inputs, targets

# Number of preceding tokens used to predict the next one; predict() also reads
# this module-level name.
context_size = 2
inputs, targets = create_context_targets(encoded_tokens, context_size)
print(f"样本数量: {len(inputs)}")

# 4. Define the word-matrix representation
embedding_dim_matrix = (5, 5)  # each word is represented as a 5x5 matrix

class WordMatrixEmbedding(nn.Module):
    """Embedding layer that yields a small matrix, not a flat vector, per token.

    Each index is looked up in a table of ``matrix_size[0] * matrix_size[1]``
    values and the result is viewed with trailing dimensions ``matrix_size``.
    """

    def __init__(self, vocab_size, matrix_size):
        super().__init__()
        n_rows, n_cols = matrix_size[0], matrix_size[1]
        self.matrix_size = matrix_size
        self.embedding = nn.Embedding(vocab_size, n_rows * n_cols)

    def forward(self, x):
        """Embed ``x`` of shape (batch_size, seq_length).

        Returns a tensor of shape (batch_size, seq_length, *matrix_size).
        """
        seq_length = x.size(1)
        flat_vectors = self.embedding(x)  # (batch_size, seq_length, rows * cols)
        return flat_vectors.view(-1, seq_length, *self.matrix_size)

# 5. Define the custom dataset class
class LanguageModelingDataset(Dataset):
    """Dataset of (context, next-token) pairs held as parallel sequences.

    ``inputs[i]`` is the context for sample ``i`` and ``targets[i]`` the token
    that follows it; both are converted to ``torch.long`` tensors on access.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # Return the input sequence and the target word as long tensors.
        context = torch.tensor(self.inputs[idx], dtype=torch.long)
        target = torch.tensor(self.targets[idx], dtype=torch.long)
        return context, target

# 6. Create the data loader
batch_size = 16
dataset = LanguageModelingDataset(inputs, targets)
# shuffle=True: samples are drawn in a new random order on each pass.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 7. Define the RNN model with a convolutional layer
class CNN_RNN_Model(nn.Module):
    """Next-token model: word matrices -> Conv2d + ReLU -> LSTM -> linear layer.

    Every token is embedded as a small matrix, convolved as a one-channel
    image, flattened, and fed to an LSTM; the LSTM output at the last time step
    is projected to ``output_size`` scores.
    """

    def __init__(self, vocab_size, matrix_size, conv_out_channels, conv_kernel_size, hidden_size, output_size):
        super().__init__()
        self.word_matrix_embedding = WordMatrixEmbedding(vocab_size, matrix_size)
        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=conv_out_channels,
            kernel_size=conv_kernel_size,
        )
        # Size of the convolution output: each spatial side shrinks by
        # (kernel_size - 1).
        shrink = conv_kernel_size - 1
        conv_height = matrix_size[0] - shrink
        conv_width = matrix_size[1] - shrink
        self.flatten_size = conv_out_channels * conv_height * conv_width
        self.lstm = nn.LSTM(input_size=self.flatten_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Score the next token for each sequence in ``x``.

        x: (batch_size, seq_length) tensor of token indices.
        Returns a (batch_size, output_size) tensor of unnormalised scores.
        """
        batch_size, seq_length = x.size()
        matrix_shape = self.word_matrix_embedding.matrix_size
        # Word-matrix embedding: (batch_size, seq_length, rows, cols)
        matrices = self.word_matrix_embedding(x)
        # Insert a channel dimension and fold time into the batch dimension so
        # Conv2d sees (batch_size * seq_length, 1, rows, cols).
        conv_in = matrices.unsqueeze(2).view(batch_size * seq_length, 1, *matrix_shape)
        # Convolution + ReLU: (batch_size * seq_length, conv_out_channels, H_out, W_out)
        features = torch.relu(self.conv(conv_in))
        # Flatten per token: (batch_size, seq_length, flatten_size)
        sequence = features.view(batch_size, seq_length, -1)
        # LSTM: (batch_size, seq_length, hidden_size)
        lstm_out, _ = self.lstm(sequence)
        # Keep only the last time step, then apply the fully connected layer.
        last_step = lstm_out[:, -1, :]  # (batch_size, hidden_size)
        return self.fc(last_step)  # (batch_size, output_size)

# 8. Initialise the model, loss function and optimizer
# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation and the DataLoader's shuffling order can be repeated on a
# fresh Restart & Run All (GPU kernels may still add small nondeterminism).
SEED = 42
torch.manual_seed(SEED)

conv_out_channels = 16
conv_kernel_size = 3
hidden_size = 128
output_size = vocab_size  # one score per vocabulary entry

model = CNN_RNN_Model(
    vocab_size=vocab_size,
    matrix_size=embedding_dim_matrix,
    conv_out_channels=conv_out_channels,
    conv_kernel_size=conv_kernel_size,
    hidden_size=hidden_size,
    output_size=output_size
)

# Move the model to the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 9. Train the model
num_epochs = 20  # adjust to the dataset size and available compute

for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    total_loss = 0
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        # Forward pass
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        
        # Backward pass and optimisation step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# 10. Test the model
def predict(model, word_to_ix, ix_to_word, context, device, top_k=5):
    """Return the most probable next words for a text ``context``.

    The context is tokenized with ``tokenize`` and reduced to exactly
    ``context_size`` indices (a module-level name): shorter contexts are
    left-padded with ``'<pad>'``, longer ones keep only their last tokens.
    Unknown words are mapped to ``'<unk>'``.

    Args:
        model: module mapping a (1, context_size) index tensor to a
            (1, n_classes) score tensor. It is switched to eval mode.
        word_to_ix: word -> index table containing ``'<pad>'`` and ``'<unk>'``.
        ix_to_word: index -> word table.
        context: raw text to condition on.
        device: device the input tensor is moved to.
        top_k: maximum number of predictions to return.

    Returns:
        list[tuple]: ``(word, probability)`` pairs, most probable first. At
        most ``top_k`` pairs are returned; fewer if the model has fewer classes.
    """
    model.eval()
    with torch.no_grad():
        tokens = tokenize(context)
        indices = [word_to_ix.get(token, word_to_ix['<unk>']) for token in tokens]
        if len(indices) < context_size:
            # Left-pad so the most recent words stay at the end of the window.
            indices = [word_to_ix['<pad>']] * (context_size - len(indices)) + indices
        else:
            indices = indices[-context_size:]
        input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)  # (1, context_size)
        output = model(input_tensor)
        probabilities = torch.softmax(output, dim=1)
        # torch.topk raises when k exceeds the size of the dimension, which
        # happens for small vocabularies; never ask for more than exist.
        k = min(top_k, probabilities.size(1))
        top_probs, top_idxs = torch.topk(probabilities, k)
        top_probs = top_probs.cpu().numpy().flatten()
        top_idxs = top_idxs.cpu().numpy().flatten()
        predictions = [(ix_to_word[idx], prob) for idx, prob in zip(top_idxs, top_probs)]
        return predictions

# Example contexts
contexts = [
    "alice was",
    "she had",
    "the book",
    "what is",
    "a book",
    "to get very"  # three words: predict() keeps only the last context_size tokens
]

# Print the top predictions for each context as "word (probability)".
for context in contexts:
    predictions = predict(model, word_to_ix, ix_to_word, context, device)
    predicted_words = ", ".join([f"{word} ({prob:.2f})" for word, prob in predictions])
    print(f'Context: "{context}" -> Predicted Next Words: {predicted_words}')


['alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', 'thought', 'alice', 'without', 'pictures', 'or', 'conversation', 'so', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', 'as', 'well', 'as', 'she', 'could', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy', 'chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', 'when', 'suddenly', 'a', 'white', 'rabbit', 'with', 'pink', 'eyes', 'ran', 'close', 'by', 'her']


RuntimeError: No active exception to reraise