In [1]:
pip install nltk

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
# 导入所需的库
import json
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
import random
import math
import time
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# 定义数据集类，用于加载和处理数据
class ConcodeDataset(Dataset):
    def __init__(self, data_file):
        with open(data_file, 'r') as f:
            self.data = [json.loads(line) for line in f]  # 读取并解析JSON数据
        self.build_vocab()  # 构建词汇表

    def build_vocab(self):
        # 初始化输入和输出词汇表，并添加特殊标记
        self.input_vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2}
        self.output_vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2}
        input_idx = len(self.input_vocab)
        output_idx = len(self.output_vocab)
        
        # 遍历数据集，构建输入和输出的词汇表
        for item in self.data:
            for word in item['nl'].split():
                if word not in self.input_vocab:
                    self.input_vocab[word] = input_idx
                    input_idx += 1
            for word in item['code'].split():
                if word not in self.output_vocab:
                    self.output_vocab[word] = output_idx
                    output_idx += 1
        self.rev_output_vocab = {idx: word for word, idx in self.output_vocab.items()}  # 构建反向词汇表

    def __len__(self):
        return len(self.data)  # 返回数据集的长度

    def __getitem__(self, index):
        # 根据索引获取输入和输出序列，并转换为ID表示
        input_seq = self.data[index]['nl'].split()
        output_seq = self.data[index]['code'].split()
        input_ids = [self.input_vocab['<SOS>']] + [self.input_vocab[word] for word in input_seq] + [self.input_vocab['<EOS>']]
        output_ids = [self.output_vocab['<SOS>']] + [self.output_vocab[word] for word in output_seq] + [self.output_vocab['<EOS>']]
        return torch.tensor(input_ids), torch.tensor(output_ids), self.data[index]['nl'], self.data[index]['code']

# 定义编码器类
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)  # 嵌入层，将词汇ID转换为向量表示
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=True)  # 双向GRU层
        self.dropout = nn.Dropout(dropout)  # Dropout层
        self.fc = nn.Linear(hid_dim * 2, hid_dim)  # 全连接层，用于将双向GRU的输出维度减少一半

    def forward(self, src, src_lengths):
        embedded = self.dropout(self.embedding(src))  # 将输入序列嵌入并应用Dropout
        packed_embedded = pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)  # 打包嵌入序列
        packed_outputs, hidden = self.rnn(packed_embedded)  # 通过GRU层
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)  # 解包GRU层的输出
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))  # 处理双向GRU的隐藏状态
        hidden = hidden.unsqueeze(0).repeat(self.rnn.num_layers, 1, 1)  # 调整隐藏状态的形状
        return outputs, hidden

# 定义注意力机制类
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 3, hid_dim)  # 注意力权重计算层
        self.v = nn.Linear(hid_dim, 1, bias=False)  # 注意力得分计算层

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]  # 获取源序列长度
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)  # 扩展隐藏状态
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # 计算注意力能量
        attention = self.v(energy).squeeze(2)  # 计算注意力得分
        return F.softmax(attention, dim=1)  # 计算注意力权重

# 定义解码器类
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention  # 引入注意力机制
        self.embedding = nn.Embedding(output_dim, emb_dim)  # 嵌入层
        self.rnn = nn.GRU(emb_dim + hid_dim * 2, hid_dim, n_layers, dropout=dropout)  # 带注意力的GRU层
        self.fc_out = nn.Linear(emb_dim + hid_dim * 3, output_dim)  # 输出层
        self.dropout = nn.Dropout(dropout)  # Dropout层

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)  # 扩展输入序列的维度
        embedded = self.dropout(self.embedding(input))  # 嵌入并应用Dropout
        a = self.attention(hidden[-1], encoder_outputs).unsqueeze(1)  # 计算注意力权重
        weighted = torch.bmm(a, encoder_outputs)  # 计算加权编码器输出
        rnn_input = torch.cat((embedded, weighted.transpose(0, 1)), dim=2)  # 拼接嵌入和加权编码器输出
        output, hidden = self.rnn(rnn_input, hidden)  # 通过GRU层
        hidden = hidden.squeeze(0)
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(1)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))  # 生成预测结果
        return prediction, hidden

# 定义序列到序列模型类
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_lengths, trg=None, teacher_forcing_ratio=0.75, max_len=100):
        trg_len = trg.shape[1] if trg is not None else max_len
        batch_size = src.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)  # 初始化输出张量
        encoder_outputs, hidden = self.encoder(src, src_lengths)  # 编码器处理输入序列
        input = trg[:, 0] if trg is not None else torch.tensor([self.decoder.output_dim-2]*batch_size).to(self.device)  # 初始化解码器输入
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)  # 解码器处理输入并生成输出
            outputs[:, t] = output
            top1 = output.argmax(1)  # 获取当前时间步的预测结果
            input = trg[:, t] if trg is not None and random.random() < teacher_forcing_ratio else top1  # 决定是否使用教师强制
            if (top1 == self.decoder.output_dim-1).all():  # 如果预测到结束标记，则停止解码
                break
        return outputs

# 训练模型函数
def train(model, iterator, optimizer, criterion, clip):
    model.train()  # 设置模型为训练模式
    epoch_loss = 0
    for i, batch in enumerate(tqdm(iterator, desc="Training")):
        src, src_lengths, trg, _, codes = batch
        src = src.to(model.device)
        src_lengths = src_lengths.to(model.device)
        trg = trg.to(model.device)
        optimizer.zero_grad()  # 清除梯度
        output = model(src, src_lengths, trg)  # 前向传播
        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)  # 调整输出张量的形状
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)  # 计算损失
        loss.backward()  # 反向传播计算梯度
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # 梯度裁剪，防止梯度爆炸
        optimizer.step()  # 更新模型参数
        epoch_loss += loss.item()  # 累加损失
    return epoch_loss / len(iterator)  # 返回平均损失

# 评估模型函数
def evaluate(model, iterator, criterion, concode_dataset, epoch):
    model.eval()  # 设置模型为评估模式
    epoch_loss = 0
    all_references = []
    all_candidates = []
    with torch.no_grad():  # 禁用梯度计算
        for i, batch in enumerate(tqdm(iterator, desc="Evaluating")):
            src, src_lengths, trg, trg_texts, codes_texts = batch
            src = src.to(model.device)
            trg = trg.to(model.device)
            src_lengths = src_lengths.to(model.device)
            output = model(src, src_lengths, max_len=trg.shape[1])  # 前向传播
            output_dim = output.shape[-1]
            output_convert = output[:, 1:].contiguous().view(-1, output_dim)  # 调整输出张量的形状
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output_convert, trg)  # 计算损失
            epoch_loss += loss.item()  # 累加损失
            predictions = torch.argmax(output, dim=2)  # 获取预测结果
            references = [text.split() for text in codes_texts]  # 参考译文
            candidates = [[concode_dataset.rev_output_vocab[idx.item()] for idx in pred if idx.item() in concode_dataset.rev_output_vocab and idx.item() not in {0, 1, 2}] for pred in predictions]  # 生成译文
            all_references.extend(references)
            all_candidates.extend(candidates)
    references_filename = f'codebleu/references_{epoch}.txt'
    candidates_filename = f'codebleu/hypothesis_{epoch}.txt'
    with open(references_filename, 'w') as ref_file:
        for ref in all_references:
            ref_file.write(' '.join(ref) + '\n')
    with open(candidates_filename, 'w') as cand_file:
        for cand in all_candidates:
            cand_file.write(' '.join(cand) + '\n')
    smooth_func = SmoothingFunction().method4
    all_references = [[ref] for ref in all_references]
    bleu4 = corpus_bleu(all_references, all_candidates, smoothing_function=smooth_func)  # 计算BLEU-4分数
    return epoch_loss / len(iterator), bleu4  # 返回平均损失和BLEU-4分数

# 加载数据函数
def load_data(data_file, batch_size, collate_fn):
    dataset = ConcodeDataset(data_file)  # 创建数据集对象
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)  # 创建数据加载器

# 数据对齐函数，用于处理变长序列
def collate_fn(batch):
    src, trg, nls, codes = zip(*batch)
    src_lengths = torch.tensor([len(s) for s in src], dtype=torch.int64)
    src = pad_sequence(src, padding_value=0, batch_first=True)  # 填充输入序列
    trg = pad_sequence(trg, padding_value=0, batch_first=True)  # 填充输出序列
    return src, src_lengths, trg, nls, codes

# 主函数，定义并训练模型
def main():
    data_file = 'train.json'
    concode_dataset = ConcodeDataset(data_file)  # 创建数据集对象
    INPUT_DIM = len(concode_dataset.input_vocab) + 1  # 输入词汇表大小
    OUTPUT_DIM = len(concode_dataset.output_vocab) + 1  # 输出词汇表大小
    ENC_EMB_DIM = 128  # 编码器嵌入维度
    DEC_EMB_DIM = 128  # 解码器嵌入维度
    HID_DIM = 256  # 隐藏层维度
    N_LAYERS = 2  # RNN层数
    ENC_DROPOUT = 0.2  # 编码器Dropout率
    DEC_DROPOUT = 0.2  # 解码器Dropout率
    BATCH_SIZE = 32  # 批量大小
    N_EPOCHS = 10  # 训练轮数
    CLIP = 1  # 梯度裁剪值
    LEARNING_RATE = 0.001  # 学习率
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # 选择设备
    train_data = load_data('train.json', BATCH_SIZE, collate_fn)  # 加载训练数据
    valid_data = load_data('dev.json', BATCH_SIZE, collate_fn)  # 加载验证数据
    attention = Attention(HID_DIM).to(device)  # 创建注意力机制对象
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)  # 创建编码器对象
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)  # 创建解码器对象
    model = Seq2Seq(enc, dec, device).to(device)  # 创建序列到序列模型对象
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)  # 创建优化器
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # 定义损失函数
    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_data, optimizer, criterion, CLIP)  # 训练模型
        valid_loss, bleu4 = evaluate(model, valid_data, criterion, concode_dataset, epoch)  # 评估模型
        print(f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}, BLEU-4: {bleu4:.3f}')
    torch.save(model.state_dict(), 'codebleu_model/seq2seq_gru_model.pt')  # 保存模型

    # 测试模型
    test_data = load_data('test.json', BATCH_SIZE, collate_fn)
    model.eval()  # 设置模型为评估模式
    results = []
    with torch.no_grad():
        for batch in tqdm(test_data, desc="Testing"):
            src, src_lengths, _, nls, _ = batch
            src = src.to(device)
            src_lengths = src_lengths.to(device)
            output = model(src, src_lengths, max_len=100)  # 前向传播
            predictions = output.argmax(2).transpose(0, 1)  # 获取预测结果
            for pred, nl in zip(predictions.cpu().numpy().tolist(), nls):
                result = {
                    "code": " ".join([concode_dataset.rev_output_vocab[idx] for idx in pred if idx in concode_dataset.rev_output_vocab and idx != 2]),  # 跳过<EOS>标记
                    "nl": nl
                }
                results.append(result)
    with open('results_1120213587_周圣威.json', 'w') as f:
        json.dump(results, f, indent=4)  # 保存测试结果为JSON文件

if __name__ == '__main__':
    main()


Training: 100%|██████████| 3125/3125 [45:20<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:27<00:00,  2.33it/s]


Epoch: 01, Train Loss: 2.440, Val. Loss: 9.493, BLEU-4: 0.054


Training: 100%|██████████| 3125/3125 [45:10<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:27<00:00,  2.33it/s]


Epoch: 02, Train Loss: 1.662, Val. Loss: 10.000, BLEU-4: 0.047


Training: 100%|██████████| 3125/3125 [45:22<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]


Epoch: 03, Train Loss: 1.318, Val. Loss: 10.190, BLEU-4: 0.053


Training: 100%|██████████| 3125/3125 [45:09<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]


Epoch: 04, Train Loss: 1.124, Val. Loss: 10.396, BLEU-4: 0.053


Training: 100%|██████████| 3125/3125 [45:09<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:26<00:00,  2.35it/s]


Epoch: 05, Train Loss: 1.041, Val. Loss: 10.675, BLEU-4: 0.053


Training: 100%|██████████| 3125/3125 [45:10<00:00,  1.15it/s]
Evaluating: 100%|██████████| 63/63 [00:26<00:00,  2.37it/s]


Epoch: 06, Train Loss: 0.991, Val. Loss: 10.840, BLEU-4: 0.058


Training:  19%|█▊        | 584/3125 [08:24<40:24,  1.05it/s]