In [6]:
import os
import re
import gensim
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize

In [None]:
def index2vector(word2index_file, index2vec_out, dim, scale, seed=0):
    """Build a randomly initialised embedding matrix and pickle it.

    Args:
        word2index_file: Path of a pickle file holding a ``word -> index`` dict.
        index2vec_out: Path the resulting matrix is pickled to.
        dim: Length of every embedding vector.
        scale: Each component is drawn with
            ``np.random.uniform(-scale, scale, dim)``.
        seed: Value passed to ``np.random.seed`` before any vector is drawn.

    The matrix is a ``float32`` array with ``len(word2index) + 1`` rows.
    Row 0 is left as the zero vector and represents padding. Every index
    found in the mapping must therefore be a valid row number; the
    preprocessing functions of this notebook number words from 1 upwards.
    """
    print("index2vec", word2index_file)
    # NOTE(review): pickle.load can execute arbitrary code - only feed it
    # files produced by this pipeline.
    with open(word2index_file, "rb") as mapping_file:
        word2index = pickle.load(mapping_file)
    vocab_size = len(word2index)
    # One extra row: index 0 is the padding vector and stays all-zero.
    index2vec = np.zeros((vocab_size + 1, dim), dtype="float32")
    # Seed the global NumPy generator so repeated runs give the same matrix.
    np.random.seed(seed)
    # Draw one vector per word, in the iteration order of the mapping.
    for index in word2index.values():
        index2vec[index] = np.random.uniform(-scale, scale, dim)
    with open(index2vec_out, "wb") as matrix_file:
        pickle.dump(index2vec, matrix_file)

In [None]:
def index2vector_pretained(word2index_file, index2vec_out, dim, scale, seed=0,
                           pretrained_path=None):
    """Build an embedding matrix from pre-trained word2vec vectors and pickle it.

    Words found in the pre-trained model receive its vector; every other word
    receives a random vector drawn with
    ``np.random.uniform(-scale, scale, dim)``. The vocabulary size and the
    number of words missing from the model are printed.

    Args:
        word2index_file: Path of a pickle file holding a ``word -> index`` dict.
        index2vec_out: Path the resulting matrix is pickled to.
        dim: Length of every row of the matrix. It is expected to equal the
            vector size of the pre-trained model, since model vectors are
            assigned directly into the rows.
        scale: Half-width of the uniform range used for unknown words.
        seed: Value passed to ``np.random.seed`` before any vector is drawn.
        pretrained_path: Location of a binary word2vec file. When omitted, the
            GoogleNews path that used to be hard-coded here is used.

    The matrix is a ``float32`` array with ``len(word2index) + 1`` rows.
    Row 0 is left as the zero vector and represents padding.
    """
    print("index2vector_pretained", word2index_file)
    if pretrained_path is None:
        # Historical default - an absolute path on the original author's machine.
        pretrained_path = "D:\\BaiduNetdiskDownload\\GoogleNews-vectors-negative300\\GoogleNews-vectors-negative300.bin"
    word_emb = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
    # NOTE(review): pickle.load can execute arbitrary code - only feed it
    # files produced by this pipeline.
    with open(word2index_file, "rb") as mapping_file:
        word2index = pickle.load(mapping_file)
    vocab_size = len(word2index)
    # One extra row: index 0 is the padding vector and stays all-zero.
    index2vec = np.zeros((vocab_size + 1, dim), dtype="float32")
    # Seed the global NumPy generator so the random fallbacks are reproducible.
    np.random.seed(seed)
    unknown_words = 0
    for word, index in word2index.items():
        try:
            cur_vec = word_emb.get_vector(word)
        except KeyError:
            # The word is not in the pre-trained vocabulary: fall back to a
            # random vector and count it.
            cur_vec = np.random.uniform(-scale, scale, dim)
            unknown_words += 1
        index2vec[index] = cur_vec
    print("total words : ", vocab_size)
    print("unknown words : ", unknown_words)
    with open(index2vec_out, "wb") as matrix_file:
        pickle.dump(index2vec, matrix_file)


In [None]:
def pad_forward(in_file, out_file, data_length):
    """Force every index sequence to ``data_length`` and pickle the result.

    Args:
        in_file: Path of a pickle file holding a sequence of records. Element
            1 of each record is the index sequence (a list); elements 0 and 2
            are copied through unchanged.
        out_file: Path the list of ``[record[0], sequence, record[2]]`` items
            is pickled to.
        data_length: Target length of every sequence.

    Sequences of ``data_length`` items or more keep only their first
    ``data_length`` items. Shorter ones are padded with zeros at the front.
    """
    print("pad_forward", in_file)
    # NOTE(review): pickle.load can execute arbitrary code - only feed it
    # files produced by this pipeline.
    with open(in_file, 'rb') as source:
        records = pickle.load(source)
    nn_data = []
    for data in records:
        sent = data[1]
        if len(sent) >= data_length:
            # Truncate: keep the leading data_length indices.
            sent = sent[:data_length]
        else:
            # Pad in front, so the real tokens end up at the tail.
            sent = [0] * (data_length - len(sent)) + sent
        nn_data.append([data[0], sent, data[2]])
    with open(out_file, 'wb') as sink:
        pickle.dump(nn_data, sink)

In [None]:
def preprocess_project(project_name):
    """Run the whole preprocessing pipeline for one project.

    Inputs (relative to the working directory):
        * ``dataset/<project_name>.pkl`` - pickled issue records; each record
          provides ``['key']``, ``['fields']['summary']`` and
          ``['fields']['description']``.
        * ``dataset/<project_name>_classification_vs_type.csv`` - must have a
          ``CLASSIFIED`` column; rows equal to ``BUG`` become label 1, all
          others label 0.

    Outputs, all written into the directory ``<project_name>/``:
        * ``word2index.pkl`` - token -> index dict, indices start at 1
          (0 is reserved for padding).
        * ``index_label.pkl`` - list of ``[key, index sequence, label]``.
        * ``index2vec.pkl`` and ``index2vec_pt.pkl`` - embedding matrices
          written by ``index2vector`` and ``index2vector_pretained``.
        * ``index_label_nn.pkl`` - sequences forced to length 100 by
          ``pad_forward``.
        * ``train_nn.pkl``, ``valid_nn.pkl``, ``test_nn.pkl`` - the data split.

    Args:
        project_name: Name used both for the input files and the output
            directory.
    """
    os.makedirs(project_name, exist_ok=True)

    # NOTE(review): pickle.load can execute arbitrary code - only feed it
    # trusted dataset files.
    with open("dataset/" + project_name + ".pkl", "rb") as input_file:
        info = pickle.load(input_file)

    # Collect the fields of interest. The numeric id is whatever follows the
    # first "-" of the issue key.
    key = []
    key_id = []
    summary = []
    description = []
    for issue in info:
        issue_key = issue['key']
        key.append(issue_key)
        key_id.append(int(issue_key[issue_key.find("-") + 1:]))
        summary.append(issue['fields']['summary'])
        description.append(issue['fields']['description'])

    # Order every per-issue collection by ascending numeric id.
    indicies = np.argsort(np.array(key_id))
    key = np.array(key)[indicies]
    summary = [summary[i] for i in indicies]
    description = [description[i] for i in indicies]

    # Tokenise and lower-case summary + description. The token lists are kept
    # as plain Python lists: they have different lengths, and NumPy >= 1.24
    # refuses to build an array from ragged nested sequences.
    summary_descriptions = []
    for cur_summary, cur_description in zip(summary, description):
        summary_words = [word.lower() for word in word_tokenize(cur_summary)]
        if isinstance(cur_description, str):
            description_words = [word.lower() for word in word_tokenize(cur_description)]
        else:
            # A non-string description (e.g. None) contributes no tokens.
            description_words = []
        summary_descriptions.append(summary_words + description_words)

    # Replace tokens made only of digits and dots with the <NUM> marker.
    pattern_1 = re.compile(r"^[0-9\.]+$")
    processed_summary_descriptions = [
        ["<NUM>" if pattern_1.match(word) is not None else word for word in words]
        for words in summary_descriptions
    ]

    # Read the labels as 1/0 and apply the same permutation as above.
    # NOTE(review): this assumes the CSV rows are in the same order as the
    # records of the pickle file - confirm against the dataset.
    label_table = pd.read_csv("dataset/" + project_name + "_classification_vs_type.csv")
    label = np.array(list((label_table['CLASSIFIED'] == 'BUG').astype(int)))
    label = label[indicies]

    # Turn every document into a sequence of word indices.
    word2index = {}  # token -> index
    index_label = []  # one [key, index sequence, label] entry per document
    index = 1  # 0 is reserved for padding
    for i, words in enumerate(processed_summary_descriptions):
        sent_index = []
        for word in words:
            if word not in word2index:
                word2index[word] = index
                index += 1
            sent_index.append(word2index[word])
        index_label.append([key[i], sent_index, label[i]])

    with open(project_name + "/word2index.pkl", 'wb') as output_file:
        pickle.dump(word2index, output_file)
    with open(project_name + "/index_label.pkl", 'wb') as output_file:
        pickle.dump(index_label, output_file)

    # Build the embedding matrices and pad the index sequences.
    index2vector(project_name + "/word2index.pkl", project_name + "/index2vec.pkl", dim=100, scale=0.1)
    index2vector_pretained(project_name + "/word2index.pkl", project_name + "/index2vec_pt.pkl", dim=300, scale=0.1)
    pad_forward(project_name + "/index_label.pkl", project_name + "/index_label_nn.pkl", data_length=100)

    with open(project_name + "/index_label_nn.pkl", "rb") as input_file:
        data = pickle.load(input_file)

    # Split: the first 90% (by issue id) is shuffled and divided 90/10 into
    # train (81% of all data) and validation (9%); the last 10% is the test set.
    test_start = int(len(data) * 0.9)
    train_valid_data = shuffle(data[:test_start], random_state=0)
    valid_start = int(len(train_valid_data) * 0.9)
    train_data = train_valid_data[:valid_start]
    valid_data = train_valid_data[valid_start:]
    test_data = data[test_start:]

    with open(project_name + "/train_nn.pkl", 'wb') as output_file:
        pickle.dump(train_data, output_file)
    with open(project_name + "/valid_nn.pkl", 'wb') as output_file:
        pickle.dump(valid_data, output_file)
    with open(project_name + "/test_nn.pkl", 'wb') as output_file:
        pickle.dump(test_data, output_file)

In [None]:
def preprocess_all(project_names=("jackrabbit", "lucene", "httpclient")):
    """Run the preprocessing pipeline over several projects combined.

    For every project, ``dataset/<name>.pkl`` (issue records providing
    ``['id']``, ``['key']``, ``['fields']['summary']`` and
    ``['fields']['description']``) and
    ``dataset/<name>_classification_vs_type.csv`` (``CLASSIFIED`` column,
    ``BUG`` -> 1, anything else -> 0) are read. All projects share one
    vocabulary.

    Outputs, all written into the directory ``all/``:
        * ``word2index.pkl`` - token -> index dict, indices start at 1
          (0 is reserved for padding).
        * ``index_label.pkl`` - list of ``[key, index sequence, label]``.
        * ``index2vec.pkl`` and ``index2vec_pt.pkl`` - embedding matrices
          written by ``index2vector`` and ``index2vector_pretained``.
        * ``index_label_nn.pkl`` - sequences forced to length 100 by
          ``pad_forward``.
        * ``train_nn.pkl``, ``valid_nn.pkl``, ``test_nn.pkl`` - the data split.

    Args:
        project_names: Projects to combine, processed in the given order.
            Defaults to the three projects this notebook was written for.
    """
    os.makedirs("all", exist_ok=True)

    word2index = {}  # token -> index, shared by all projects
    index_label = []  # one [key, index sequence, label] entry per document
    key_id = []  # the 'id' field of every record, in the order of index_label
    index = 1  # 0 is reserved for padding
    # Compiled once; matches tokens made only of digits and dots.
    pattern1 = re.compile(r"^[0-9\.]+$")

    for project_name in project_names:
        # NOTE(review): pickle.load can execute arbitrary code - only feed it
        # trusted dataset files.
        with open("dataset/" + project_name + ".pkl", "rb") as input_file:
            info = pickle.load(input_file)

        key = []
        summary = []
        description = []
        for issue in info:
            key.append(issue['key'])
            key_id.append(issue['id'])
            summary.append(issue['fields']['summary'])
            description.append(issue['fields']['description'])

        # Tokenise and lower-case summary + description. The token lists are
        # kept as plain Python lists: they have different lengths, and NumPy
        # >= 1.24 refuses to build an array from ragged nested sequences.
        summary_descriptions = []
        for cur_summary, cur_description in zip(summary, description):
            summary_words = [word.lower() for word in word_tokenize(cur_summary)]
            if isinstance(cur_description, str):
                description_words = [word.lower() for word in word_tokenize(cur_description)]
            else:
                # A non-string description (e.g. None) contributes no tokens.
                description_words = []
            summary_descriptions.append(summary_words + description_words)

        # Replace purely numeric tokens with the <NUM> marker.
        processed_summary_descriptions = [
            ["<NUM>" if pattern1.match(word) is not None else word for word in words]
            for words in summary_descriptions
        ]

        # Labels as 1/0.
        # NOTE(review): this assumes the CSV rows are in the same order as the
        # records of the pickle file - confirm against the dataset.
        label_table = pd.read_csv("dataset/" + project_name + "_classification_vs_type.csv")
        label = list((label_table['CLASSIFIED'] == "BUG").astype(int))

        # Map every token to its index, extending the shared vocabulary on
        # first sight, and append the document to the combined list.
        for i, words in enumerate(processed_summary_descriptions):
            sent_index = []
            for word in words:
                if word not in word2index:
                    word2index[word] = index
                    index += 1
                sent_index.append(word2index[word])
            index_label.append([key[i], sent_index, label[i]])

    with open("all/word2index.pkl", 'wb') as output_file:
        pickle.dump(word2index, output_file)
    with open("all/index_label.pkl", 'wb') as output_file:
        pickle.dump(index_label, output_file)

    # Build the embedding matrices and pad the index sequences.
    index2vector("all/word2index.pkl", "all/index2vec.pkl", dim=100, scale=0.1)
    index2vector_pretained("all/word2index.pkl", "all/index2vec_pt.pkl", dim=300, scale=0.1)
    pad_forward("all/index_label.pkl", "all/index_label_nn.pkl", data_length=100)

    with open("all/index_label_nn.pkl", "rb") as input_file:
        data = pickle.load(input_file)

    # Order all documents by ascending record id. The ids were gathered in the
    # loop above, in the same order as the documents, so no file is re-read.
    indicies = np.argsort(np.array(key_id).astype(int))

    # Split: the 90% of documents with the lowest ids are shuffled and divided
    # 95/5 into train (85.5% of all data) and validation (4.5%); the 10% with
    # the highest ids form the test set.
    test_start = int(len(indicies) * 0.9)
    train_valid_data = [data[i] for i in indicies[:test_start]]
    test_data = [data[i] for i in indicies[test_start:]]
    train_valid_data = shuffle(train_valid_data, random_state=0)
    valid_start = int(len(train_valid_data) * 0.95)
    train_data = train_valid_data[:valid_start]
    valid_data = train_valid_data[valid_start:]

    with open("all/train_nn.pkl", 'wb') as output_file:
        pickle.dump(train_data, output_file)
    with open("all/valid_nn.pkl", 'wb') as output_file:
        pickle.dump(valid_data, output_file)
    with open("all/test_nn.pkl", 'wb') as output_file:
        pickle.dump(test_data, output_file)

In [None]:
if __name__ == "__main__":
    # Preprocess each project on its own first, then all of them combined.
    for single_project in ("jackrabbit", "lucene", "httpclient"):
        print("preprocess " + single_project)
        preprocess_project(single_project)
    print("preprocess all")
    preprocess_all()