In [1]:
import os
import re
import gensim
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
# Build separate vocabularies for the summary and the description

In [2]:
# Root directory for the generated LDA input data.
output_dir = "LDA-Input"
# Stop-word list file passed to remove_stop_words().
stop_words_path = "dataset/english"

In [3]:
def remove_stop_words(inputs, stop_words_path):
    '''
    Remove stop words from a sequence of tokens.

    The stop-word file is read as UTF-8, one stop word per line; leading and
    trailing whitespace on each line is stripped. Matching is exact
    (case-sensitive) string equality.

    :param inputs: iterable of tokens, e.g. [word1, word2, ...]
    :param stop_words_path: path of the stop-word file
    :return: new list holding the tokens of ``inputs`` that are not stop
             words, in their original order
    '''
    with open(stop_words_path, "r", encoding="utf-8") as fr:
        # A set makes each membership test O(1) instead of scanning a list
        # of every stop word for every token.
        stop_words = {line.strip() for line in fr}

    return [word for word in inputs if word not in stop_words]

In [4]:
def stem_words(inputs):
    '''
    Stem every token with the English Snowball stemmer.

    :param inputs: iterable of tokens, e.g. [word1, word2, ...]
    :return: list of stemmed tokens, in the same order as ``inputs``
    '''
    # PorterStemmer() was the alternative tried here; Snowball is the one in use.
    snowball = SnowballStemmer('english')
    return list(map(snowball.stem, inputs))

In [69]:
# Stand-alone punctuation tokens to discard (single characters plus a few
# multi-character tokens such as `` and '').
_PUNCTUATION_TOKENS = frozenset([
    '!', '"', '#', '$', "'",
    '%', '&', '(', ')', '*',
    '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>',
    '?', '@', '[', '\\', ']',
    '^', '_', '`', '{', '|',
    '}', '~', "``", "--", "...",
    "''",
])

# Tokens made up only of '=', '/', '*', '.' and whitespace (e.g. "====", "/*").
# Raw string: the previous non-raw literal relied on the invalid escape "\s".
_PUNCTUATION_RUN = re.compile(r"^[=/*\.\s]+$")


def remove_punctuation(input):
    """
    Drop punctuation-only tokens from a token sequence.

    A token is removed when it is one of the known punctuation tokens or when
    it consists solely of '=', '/', '*', '.' and whitespace characters. Every
    other token is kept unchanged, including tokens that merely contain
    punctuation (e.g. 'a.b').

    :param input: iterable of string tokens (the name shadows the builtin but
                  is kept for backward compatibility with keyword callers)
    :return: new list with the remaining tokens in their original order; the
             argument itself is not modified
    """
    return [
        word for word in input
        if word not in _PUNCTUATION_TOKENS
        and _PUNCTUATION_RUN.match(word) is None
    ]

In [80]:
def _issue_number(issue_key):
    """Return the numeric suffix of an issue key, e.g. 'HTTPCLIENT-569' -> 569."""
    return int(issue_key.rpartition("-")[2])


def _tokenize_text(text):
    """Tokenize one text, then lower-case, drop stop words, stem and strip punctuation."""
    words = [word.lower() for word in word_tokenize(text)]
    words = remove_stop_words(words, stop_words_path)
    words = stem_words(words)
    return remove_punctuation(words)


def _write_corpus(path, documents):
    """Write the document count on line one, then one space-joined document per line."""
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(str(len(documents)) + "\n")
        # No trailing newline after the last document, matching the
        # previously generated files.
        fp.write("\n".join(" ".join(document) for document in documents))


def preprocess_project(project_name, train_ratio=0.9):
    """
    Build the LDA train/test corpus files for one project.

    Reads ``dataset/<project_name>.pkl``, which must hold an indexable
    collection of report records. Each record provides ``record['key']``
    (e.g. 'HTTPCLIENT-569'), ``record['fields']['summary']`` and
    ``record['fields']['description']``.

    Reports are ordered by the numeric part of their key. Summary and
    description are each tokenized, lower-cased, stripped of stop words,
    stemmed and stripped of punctuation, then concatenated into one document
    per report. The first ``train_ratio`` share of the ordered documents is
    written to ``<output_dir>/<project_name>-train.txt`` and the rest to
    ``<output_dir>/<project_name>-test.txt``.

    :param project_name: base name of the pickle file and of the output files
    :param train_ratio: fraction (0..1) of the ordered reports that goes into
                        the train file; defaults to 0.9
    :return: None; results are written to disk and counts are printed
    :raises ValueError: if ``train_ratio`` is outside [0, 1]
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1")

    # os.path.join keeps the paths valid on every platform; a literal '\\'
    # only acts as a separator on Windows.
    project_dir = os.path.join(output_dir, project_name)
    os.makedirs(project_dir, exist_ok=True)

    # Load the pickled reports.
    # NOTE(review): pickle.load can execute arbitrary code -- only use it on
    # trusted dataset files.
    with open("dataset/" + project_name + ".pkl", "rb") as input_file:
        info = pickle.load(input_file)

    # Sort all reports by the numeric part of their key, ascending.
    reports = [info[i] for i in range(len(info))]
    reports.sort(key=lambda report: _issue_number(report['key']))

    # Tokenize summary and description separately. Plain lists are used on
    # purpose: the token lists have different lengths, and building a NumPy
    # array from such ragged lists fails on current NumPy versions.
    summary_processed = []
    description_processed = []
    for report in reports:
        fields = report['fields']
        cur_summary = fields['summary']
        cur_description = fields['description']

        summary_processed.append(_tokenize_text(cur_summary))
        # A missing (non-string) description contributes no tokens.
        if isinstance(cur_description, str):
            description_processed.append(_tokenize_text(cur_description))
        else:
            description_processed.append([])

    print(len(summary_processed))
    print(len(description_processed))

    # One document per report: summary tokens followed by description tokens.
    all_report = []
    for i, (summary_words, description_words) in enumerate(
            zip(summary_processed, description_processed)):
        if i == 0:
            print(summary_words)  # sanity check: first summary on its own
        report_words = summary_words + description_words
        all_report.append(report_words)
        if i == 0:
            print(report_words)  # sanity check: first combined document

    print(len(all_report))

    split_at = int(len(all_report) * train_ratio)
    train_valid_report = all_report[:split_at]
    test_report = all_report[split_at:]

    _write_corpus(os.path.join(output_dir, project_name + '-train.txt'),
                  train_valid_report)
    _write_corpus(os.path.join(output_dir, project_name + '-test.txt'),
                  test_report)

#     #读取label并整理 classified为bug的值为1，否则为0
#     info = pd.read_csv("dataset/" + project_name + "_classification_vs_type.csv")
#     label = list((info['CLASSIFIED'] == "BUG").astype(int))
#     label = np.array(label)
#     label = label[indicies]

#     word2index_summary = {} # word: index
#     word2index_description = {} # 单词到索引的映射
    
#     index_label = [] #[key,[summary_index...],[description_index...],label]
#     index_summary = 1  # 0 used for padding
#     index_description = 1
#     for i in range(len(summary_processed)):
#         summary_index = []
#         description_index = []

#         for word in summary_processed[i]:
#             if word not in word2index_summary:
#                 word2index_summary[word] = index_summary
#                 summary_index.append(index_summary)
#                 index_summary += 1
#             else:
#                 summary_index.append(word2index_summary[word])

#         for word in description_processed[i]:
#             if word not in word2index_description:
#                 word2index_description[word] = index_description
#                 description_index.append(index_description)
#                 index_description += 1
#             else:
#                 description_index.append(word2index_description[word])

#         index_label.append([key[i], summary_index, description_index, priority[i], label[i]])
#     for key in word2index_description:
#         print(key,":", word2index_description[key])

#     print(len(word2index_summary))
#     print(len(word2index_description))
#     print(len(index_label))
    #将word2index、index_label写入文件
#     pickle.dump(word2index_summary, open(project_dir + "/word2index_summary.pkl", 'wb'))
#     pickle.dump(word2index_description, open(project_dir + "/word2index_description.pkl", 'wb'))
#     pickle.dump(index_label, open(project_dir + "/index_label.pkl", 'wb'))



In [81]:
if __name__ == "__main__":
    # Run the preprocessing for the 'lucene' project.
    preprocess_project("lucene")

2443
2443
['document.field', 'return', 'store', 'field']
['document.field', 'return', 'store', 'field', 'document.field', 'return', 'store', 'field', 'index', 'store', 'confus', "'s", 'isstor', 'method', "n't", 'make', 'much', 'sens', 'actual', 'field', 'return', 'field', 'document.add', 'new', 'field', 'even', 'one', 'store', 'sound', 'confus', 'll', 'attach', 'small', 'program', 'demonstr', 'either', 'fix', 'field', 'alway', 'return', 'document']
2443
