In [1]:
import os
import re
import gensim
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
# Build separate vocabularies (word -> index) for the summary and the description.

In [2]:
# Root directory under which the preprocess_* functions create their output folders and pickles.
output_dir = 'Model Input'
# Text file with one stop word per line; passed to remove_stop_words().
stop_words_path = 'dataset/english'

In [3]:
def remove_stop_words(inputs, stop_words_path):
    '''
    Remove stop words from a tokenized text.

    :param inputs: iterable of word tokens, e.g. [word1, word2, ...]
    :param stop_words_path: path to a UTF-8 text file holding one stop word
        per line (surrounding whitespace of each line is stripped)
    :return: list of the tokens from ``inputs`` that are not stop words,
        in their original order (duplicates are kept)
    '''
    with open(stop_words_path, "r", encoding="utf-8") as fr:
        # A set keeps every membership test O(1) instead of scanning a list
        # of all stop words once per token.
        stop_words = {line.strip() for line in fr}

    return [word for word in inputs if word not in stop_words]

In [4]:
def stem_words(inputs):
    '''
    Reduce every token to its stem with the English Snowball stemmer.

    :param inputs: iterable of word tokens, e.g. [word1, word2, ...]
    :return: list holding the stem of each token, in input order
    '''
    # PorterStemmer is the alternative stemmer that was tried here.
    to_stem = SnowballStemmer('english').stem
    return list(map(to_stem, inputs))

In [5]:
def pad_forward(in_file, out_file, summary_len, description_len):
    '''
    Truncate or left-pad the index sequences of every record to fixed lengths.

    ``in_file`` must be a pickle holding a sequence of records laid out as
    ``[key, summary_ids, description_ids, priority, label]``. For each record
    the two id lists are cut to their first ``*_len`` items, or padded with
    leading zeros up to ``*_len``; the other three fields are copied as-is.
    The resulting list of records is pickled to ``out_file``.

    :param in_file: path of the pickle to read
    :param out_file: path of the pickle to write
    :param summary_len: target length of every summary id list
    :param description_len: target length of every description id list
    :return: None
    '''
    def fit(sequence, length):
        """Keep the first ``length`` items, or left-pad with 0 up to ``length``."""
        if len(sequence) >= length:
            return sequence[:length]
        return [0] * (length - len(sequence)) + sequence

    print("pad_forward", in_file)

    # NOTE: pickle.load can execute arbitrary code; only feed it trusted files.
    with open(in_file, 'rb') as fin:
        records = pickle.load(fin)

    nn_data = [
        [
            record[0],
            fit(record[1], summary_len),
            fit(record[2], description_len),
            record[3],
            record[4],
        ]
        for record in records
    ]

    # The context manager guarantees the output is flushed and closed.
    with open(out_file, 'wb') as fout:
        pickle.dump(nn_data, fout)

In [20]:
def preprocess_project(project_name):
    '''
    Build and save the model-input pickles for a single project.

    Steps:
      1. Load ``dataset/<project_name>.pkl`` and order the records by the
         integer that follows the first "-" in their ``key``
         (e.g. 'HTTPCLIENT-569' -> 569).
      2. Tokenize, lowercase, stop-word-filter and stem every summary and
         description. A description that is not a string yields an empty
         token list.
      3. Read ``dataset/<project_name>_classification_vs_type.csv``; the label
         is 1 where column ``CLASSIFIED`` equals "BUG", otherwise 0. The same
         sort permutation is applied to the labels, so the CSV rows are
         assumed to be in the same order as the pickle records
         (TODO confirm against the dataset).
      4. Build one vocabulary per field (index 0 is reserved for padding) and
         write ``word2index_summary.pkl``, ``word2index_description.pkl`` and
         ``index_label.pkl`` into ``<output_dir>/<project_name>``.
      5. Pad/truncate via ``pad_forward`` (summary 50, description 100) into
         ``index_label_nn.pkl``.
      6. Split: the first 90 % of the sorted records get the vectors from
         ``LDA theme/<project_name>_theme.pkl`` inserted at position 4, are
         shuffled (``random_state=0``) and split 90/10 into ``train_nn.pkl``
         and ``valid_nn.pkl``. The remaining 10 % get the vectors from
         ``LDA theme/<project_name>_theme_test.pkl`` and go to ``test_nn.pkl``.

    Final record layout:
    ``[key, summary_ids, description_ids, priority, theme, label]``.

    :param project_name: dataset name used to build all input/output paths
    :return: None
    '''
    def normalize(text):
        """Tokenize, lowercase, drop stop words and stem one text."""
        words = [word.lower() for word in word_tokenize(text)]
        return stem_words(remove_stop_words(words, stop_words_path))

    def encode(words, word2index):
        """Map words to indices, giving each unseen word the next free index (from 1)."""
        ids = []
        for word in words:
            if word not in word2index:
                word2index[word] = len(word2index) + 1  # 0 is used for padding
            ids.append(word2index[word])
        return ids

    def dump(obj, file_name):
        """Pickle ``obj`` into the project directory and close the file."""
        with open(os.path.join(project_dir, file_name), 'wb') as fw:
            pickle.dump(obj, fw)

    # os.path.join instead of a literal backslash so the folder is nested
    # correctly on every platform.
    project_dir = os.path.join(output_dir, project_name)
    os.makedirs(project_dir, exist_ok=True)

    # Read the raw issue records.
    # NOTE: pickle.load can execute arbitrary code; only feed it trusted files.
    with open("dataset/" + project_name + ".pkl", "rb") as input_file:
        info = pickle.load(input_file)

    # Sort every field ascending by the numeric part of the issue key.
    raw_keys = [record['key'] for record in info]
    key_id = np.array([int(k[k.find("-") + 1:]) for k in raw_keys])
    indicies = np.argsort(key_id)
    key = np.array(raw_keys)[indicies]
    priority = np.array([record['fields']['priority']['id'] for record in info])[indicies]
    summary = [info[i]['fields']['summary'] for i in indicies]
    description = [info[i]['fields']['description'] for i in indicies]

    # Token lists have different lengths, so they are kept as plain lists:
    # np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
    summary_processed = [normalize(text) for text in summary]
    description_processed = [
        normalize(text) if isinstance(text, str) else []  # e.g. missing description
        for text in description
    ]

    # Labels: 1 when CLASSIFIED is "BUG", otherwise 0.
    classification = pd.read_csv("dataset/" + project_name + "_classification_vs_type.csv")
    label = np.array(list((classification['CLASSIFIED'] == "BUG").astype(int)))
    label = label[indicies]

    word2index_summary = {}      # word -> index
    word2index_description = {}  # word -> index
    # Each entry: [key, [summary_index...], [description_index...], priority, label]
    index_label = [
        [
            key[i],
            encode(summary_processed[i], word2index_summary),
            encode(description_processed[i], word2index_description),
            priority[i],
            label[i],
        ]
        for i in range(len(summary_processed))
    ]

    print(len(word2index_summary))
    print(len(word2index_description))
    print(len(index_label))

    # Persist the vocabularies and the indexed records.
    dump(word2index_summary, "word2index_summary.pkl")
    dump(word2index_description, "word2index_description.pkl")
    dump(index_label, "index_label.pkl")

    # Truncate or pad the index sequences to fixed lengths.
    pad_forward(os.path.join(project_dir, "index_label.pkl"),
                os.path.join(project_dir, "index_label_nn.pkl"),
                summary_len=50, description_len=100)

    with open(os.path.join(project_dir, "index_label_nn.pkl"), "rb") as input_file:
        data = pickle.load(input_file)

    with open("LDA theme/" + project_name + "_theme.pkl", "rb") as theme_file:
        theme_data = pickle.load(theme_file)
    print(len(theme_data))

    with open("LDA theme/" + project_name + "_theme_test.pkl", "rb") as theme_test_file:
        theme_test_data = pickle.load(theme_test_file)
    print(len(theme_test_data))

    # First 90 % (in key order) -> train + validation, last 10 % -> test.
    split_at = int(len(data) * 0.9)
    train_valid_data = data[:split_at]
    test_data = data[split_at:]

    print(len(train_valid_data))
    print(len(theme_data))

    # Theme vectors are matched to records by position; theme_data must hold
    # at least one entry per train/validation record (IndexError otherwise).
    if train_valid_data:
        print(train_valid_data[0], theme_data[0])
    for i, record in enumerate(train_valid_data):
        record.insert(4, theme_data[i])

    train_valid_data = shuffle(train_valid_data, random_state=0)
    valid_at = int(len(train_valid_data) * 0.9)
    train_data = train_valid_data[:valid_at]
    valid_data = train_valid_data[valid_at:]

    for i, record in enumerate(test_data):
        record.insert(4, theme_test_data[i])

    print(train_valid_data[0])

    print('train:', len(train_data))
    print('validation', len(valid_data))
    print('test', len(test_data))
    dump(train_data, "train_nn.pkl")
    dump(valid_data, "valid_nn.pkl")
    dump(test_data, "test_nn.pkl")



In [21]:
def preprocess_all():
    '''
    Build and save the model-input pickles for all projects combined.

    The projects "jackrabbit", "lucene" and "httpclient" are processed in that
    order and share one summary vocabulary and one description vocabulary
    (index 0 is reserved for padding). For each project the records come from
    ``dataset/<project>.pkl`` and the labels from
    ``dataset/<project>_classification_vs_type.csv`` (1 where ``CLASSIFIED``
    equals "BUG", otherwise 0); CSV rows are matched to records by position.
    A description that is not a string yields an empty token list.

    Files written to ``<output_dir>/all``: ``word2index_summary.pkl``,
    ``word2index_description.pkl``, ``index_label.pkl``,
    ``index_label_nn.pkl`` (padded to summary 50 / description 100),
    ``train_nn.pkl``, ``valid_nn.pkl`` and ``test_nn.pkl``.

    Split: records are ordered by the integer value of their ``id`` field; the
    first 90 % are shuffled (``random_state=0``) and split 95/5 into train and
    validation, the last 10 % form the test set. No LDA theme vectors are
    attached here, so records keep the layout
    ``[key, summary_ids, description_ids, priority, label]``.

    :return: None
    '''
    def normalize(text):
        """Tokenize, lowercase, drop stop words and stem one text."""
        words = [word.lower() for word in word_tokenize(text)]
        return stem_words(remove_stop_words(words, stop_words_path))

    def encode(words, word2index):
        """Map words to indices, giving each unseen word the next free index (from 1)."""
        ids = []
        for word in words:
            if word not in word2index:
                word2index[word] = len(word2index) + 1  # 0 is used for padding
            ids.append(word2index[word])
        return ids

    def dump(obj, file_name):
        """Pickle ``obj`` into the output directory and close the file."""
        with open(os.path.join(all_dir, file_name), 'wb') as fw:
            pickle.dump(obj, fw)

    # os.path.join instead of a literal backslash so the folder is nested
    # correctly on every platform.
    all_dir = os.path.join(output_dir, 'all')
    os.makedirs(all_dir, exist_ok=True)

    word2index_summary = {}      # word -> index, shared by all projects
    word2index_description = {}  # word -> index, shared by all projects
    index_label = []
    issue_ids = []  # 'id' of every record, in the same order as index_label
    print('t1')

    for project_name in ["jackrabbit", "lucene", "httpclient"]:
        print('t2')
        # NOTE: pickle.load can execute arbitrary code; only feed it trusted files.
        with open("dataset/" + project_name + ".pkl", "rb") as input_file:
            info = pickle.load(input_file)

        key = [record['key'] for record in info]
        summary = [record['fields']['summary'] for record in info]
        description = [record['fields']['description'] for record in info]
        priority = [record['fields']['priority']['id'] for record in info]
        # Collected here so the dataset pickles need not be loaded a second
        # time just to compute the split order.
        issue_ids.extend(record['id'] for record in info)

        # Token lists have different lengths, so they are kept as plain lists:
        # np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
        summary_processed = [normalize(text) for text in summary]
        description_processed = [
            normalize(text) if isinstance(text, str) else []  # e.g. missing description
            for text in description
        ]

        classification = pd.read_csv("dataset/" + project_name + "_classification_vs_type.csv")
        label = list((classification['CLASSIFIED'] == "BUG").astype(int))

        for i in range(len(summary_processed)):
            index_label.append([
                key[i],
                encode(summary_processed[i], word2index_summary),
                encode(description_processed[i], word2index_description),
                priority[i],
                label[i],
            ])

    # Persist the vocabularies and the indexed records.
    dump(word2index_summary, "word2index_summary.pkl")
    dump(word2index_description, "word2index_description.pkl")
    dump(index_label, "index_label.pkl")

    # Truncate or pad the index sequences to fixed lengths.
    pad_forward(os.path.join(all_dir, "index_label.pkl"),
                os.path.join(all_dir, "index_label_nn.pkl"),
                summary_len=50, description_len=100)

    with open(os.path.join(all_dir, "index_label_nn.pkl"), "rb") as input_file:
        data = pickle.load(input_file)

    # Order all records by numeric issue id: first 90 % -> train + validation,
    # last 10 % -> test.
    indicies = np.argsort(np.array(issue_ids).astype(int))
    split_at = int(len(indicies) * 0.9)
    train_valid_data = [data[i] for i in indicies[:split_at]]
    test_data = [data[i] for i in indicies[split_at:]]

    # NOTE(review): validation share is 5 % here but 10 % in
    # preprocess_project - confirm this difference is intended.
    train_valid_data = shuffle(train_valid_data, random_state=0)
    valid_at = int(len(train_valid_data) * 0.95)
    train_data = train_valid_data[:valid_at]
    valid_data = train_valid_data[valid_at:]

    dump(train_data, "train_nn.pkl")
    dump(valid_data, "valid_nn.pkl")
    dump(test_data, "test_nn.pkl")


In [22]:
if __name__ == "__main__":
    # Preprocess every project on its own, in this fixed order.
    for project in ("jackrabbit", "lucene", "httpclient"):
        print("preprocess " + project)
        preprocess_project(project)
    # The combined run over all projects is currently disabled:
    # print("preprocess all")
    # preprocess_all()

preprocess jackrabbit
3038
17336
2402
pad_forward Model Input\jackrabbit/index_label.pkl
2161
241
2161
2161
['JCR-2', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 7, 19, 20, 21, 22, 23, 16, 1, 24, 25, 11, 21, 26, 27, 28, 29, 28, 30, 28, 31, 28, 32, 28, 33, 28, 34, 35, 7, 36, 21, 37, 30, 38, 29, 39, 40, 41, 42, 7, 43, 23, 44, 45, 46, 47, 26, 48, 49, 1, 50, 51, 52, 7, 53, 28, 23, 54, 55, 56, 57, 51, 58, 35, 59, 60, 61, 62, 26, 63, 12, 64, 1, 56, 7, 65, 23, 66, 1, 67, 68, 69, 70], '4', 0] [0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]
['JCR-210', [0, 0, 0, 0, 0, 0, 0, 0, 