In [1]:
#Importing libraries

import pickle
import re
import string
import inflection
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from datasets import DATASET
from parsers import Parser
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read via stopwords.words), the 'punkt' tokenizer models (used by
# nltk.word_tokenize) and the perceptron POS tagger (used by nltk.pos_tag).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Dataset(name='codec', root=WindowsPath('d:/IRProj/bug-localization-master/buglocalizer/../data/CODEC'), src=WindowsPath('d:/IRProj/bug-localization-master/buglocalizer/../data/CODEC/gitrepo'), bug_repo=WindowsPath('d:/IRProj/bug-localization-master/buglocalizer/../data/CODEC/bugrepo/repository.xml'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# English stop words from the NLTK corpus, kept in a set so the
# "token not in stop_words" filters below are constant-time lookups.
stop_words = {*stopwords.words('english')}

# Java reserved words and literals; tokens equal to any of these are dropped
# by the remove_stopwords_keywords steps.
java_keywords = {
    'abstract', 'assert',
    'boolean', 'break', 'byte',
    'case', 'catch', 'char', 'class', 'const', 'continue',
    'default', 'do', 'double',
    'else', 'enum', 'extends',
    'false', 'final', 'finally', 'float', 'for',
    'goto',
    'if', 'implements', 'import', 'instanceof', 'int', 'interface',
    'long',
    'native', 'new', 'null',
    'package', 'private', 'protected', 'public',
    'return',
    'short', 'static', 'strictfp', 'super', 'switch', 'synchronized',
    'this', 'throw', 'throws', 'transient', 'true', 'try',
    'void', 'volatile',
    'while',
}


In [3]:
#Class for Preprocessing bug reports

class ReportPreprocessing:
    """Preprocess bug reports in place.

    ``bug_reports`` is a mapping whose values are report objects exposing
    ``summary`` and ``description`` attributes (plain strings before
    preprocessing). The steps rewrite those attributes and add
    ``stack_traces``, ``pos_tagged_summary`` and ``pos_tagged_description``.
    After :meth:`preprocess` every text field is a dict of the form
    ``{'stemmed': [...], 'unstemmed': [...]}``.
    """

    __slots__ = ['bug_reports']

    # Tags kept by pos_tagging(); the match is exact, so other tags
    # (e.g. 'NNS', 'VBD') are not retained.
    _POS_TAGS = ('NN', 'VB')

    # Candidate stack frames: " at <location>(<source>)".
    _STACK_FRAME_RE = re.compile(r' at (.*?)\((.*?)\)')

    # A candidate is accepted only if its parenthesised part contains one of these.
    _STACK_TRACE_SIGNS = ('Unknown Source', '.java', 'Native Method')

    # Runs of punctuation. re.escape keeps every character literal inside the
    # class (an unescaped "\]" would otherwise swallow the backslash).
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']+')

    # Token-list attributes handled identically by the later pipeline steps.
    _TOKEN_FIELDS = ('summary', 'description',
                     'pos_tagged_summary', 'pos_tagged_description')

    def __init__(self, bug_reports):
        self.bug_reports = bug_reports

    def stack_traces_extract(self):
        """Store stack-trace frames found in each description as ``stack_traces``.

        Each frame is a ``(location, source)`` tuple as captured by the two
        regex groups. Must run while ``description`` is still a string.
        """
        for report in self.bug_reports.values():
            candidates = self._STACK_FRAME_RE.findall(report.description)
            report.stack_traces = [
                frame for frame in candidates
                if any(sign in frame[1] for sign in self._STACK_TRACE_SIGNS)
            ]

    def tokenize(self):
        """Replace summary and description strings with wordpunct token lists."""
        for report in self.bug_reports.values():
            report.summary = nltk.wordpunct_tokenize(report.summary)
            report.description = nltk.wordpunct_tokenize(report.description)

    def _tokens_with_kept_tags(self, text):
        """Return the word tokens of ``text`` whose POS tag is in ``_POS_TAGS``."""
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        return [token for token, tag in tagged if tag in self._POS_TAGS]

    def pos_tagging(self):
        """Add ``pos_tagged_summary`` / ``pos_tagged_description`` to each report.

        Must run before :meth:`tokenize`, because it tokenizes the raw strings.
        """
        for report in self.bug_reports.values():
            report.pos_tagged_summary = self._tokens_with_kept_tags(report.summary)
            report.pos_tagged_description = self._tokens_with_kept_tags(report.description)

    def _split_camelcase(self, tokens):
        """Return a new token list with punctuation and camelCase splits applied.

        * A token containing punctuation is removed and replaced (at the end
          of the list) by its punctuation-separated parts; each part that is
          camelCase additionally contributes its lower-cased words.
        * A token without punctuation is kept, and its camelCase words (if
          more than one) are appended.

        The input list is not modified: mutating it while iterating would
        skip tokens and re-process the appended ones.
        """
        result = list(tokens)

        for token in tokens:
            parts = self._PUNCT_RE.split(token)

            if len(parts) > 1:
                result.remove(token)
                for part in parts:
                    result.append(part)
                    words = inflection.underscore(part).split('_')
                    if len(words) > 1:
                        result += words
            else:
                words = inflection.underscore(token).split('_')
                if len(words) > 1:
                    result += words

        return result

    def split_camelcase_apply(self):
        """Apply :meth:`_split_camelcase` to every token field of every report."""
        for report in self.bug_reports.values():
            for field in self._TOKEN_FIELDS:
                setattr(report, field, self._split_camelcase(getattr(report, field)))

    def clean(self):
        """Strip punctuation and digits, drop emptied tokens, and lowercase."""
        # Translation table deleting every punctuation character and digit.
        punctnum_table = str.maketrans(
            {c: None for c in string.punctuation + string.digits})

        for report in self.bug_reports.values():
            for field in self._TOKEN_FIELDS:
                stripped = (token.translate(punctnum_table)
                            for token in getattr(report, field))
                setattr(report, field, [token.lower() for token in stripped if token])

    def remove_stopwords_keywords(self):
        """Drop English stop words and Java keywords from every token field.

        Relies on the module-level ``stop_words`` and ``java_keywords`` sets.
        """
        for report in self.bug_reports.values():
            for field in self._TOKEN_FIELDS:
                setattr(report, field, [
                    token for token in getattr(report, field)
                    if token not in stop_words and token not in java_keywords
                ])

    def stem(self):
        """Replace each token field with ``{'stemmed': [...], 'unstemmed': [...]}``.

        ``'unstemmed'`` holds the token list as it was before stemming and
        ``'stemmed'`` the Porter-stemmed form of the same tokens.
        """
        stemmer = PorterStemmer()

        for report in self.bug_reports.values():
            for field in self._TOKEN_FIELDS:
                tokens = getattr(report, field)
                setattr(report, field, {
                    'stemmed': [stemmer.stem(token) for token in tokens],
                    'unstemmed': tokens,
                })

    def preprocess(self):
        """Run the full pipeline.

        Stack-trace extraction and POS tagging come first because they need
        the raw strings that :meth:`tokenize` replaces with token lists.
        """
        self.stack_traces_extract()
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase_apply()
        self.clean()
        self.remove_stopwords_keywords()
        self.stem()

In [4]:
# POS tags to keep when filtering nltk.pos_tag output (matched exactly).
ps = ['NN', 'VB']

In [5]:
#Class for preprocessing source code

class SrcPreprocessing:
    """Preprocess parsed source files in place.

    ``src_files`` is a mapping whose values are source-file objects exposing
    ``all_content``, ``comments``, ``class_names``, ``attributes``,
    ``method_names``, ``variables`` and ``file_name``. The steps rewrite
    those attributes and add ``pos_tagged_comments``. After
    :meth:`preprocess` every one of these fields is a dict of the form
    ``{'stemmed': [...], 'unstemmed': [...]}``.
    """

    __slots__ = ['src_files']

    # Tags kept by pos_tagging(); the match is exact, so other tags
    # (e.g. 'NNS', 'VBD') are not retained. Held on the class so the method
    # does not depend on a notebook-level variable from another cell.
    _POS_TAGS = ('NN', 'VB')

    # Runs of punctuation. re.escape keeps every character literal inside the
    # class; interpolating string.punctuation unescaped turns "\]" into an
    # escaped "]" and silently leaves the backslash out of the set.
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']+')

    # Token-list attributes handled identically by the later pipeline steps.
    _TOKEN_FIELDS = ('all_content', 'comments', 'class_names', 'attributes',
                     'method_names', 'variables', 'file_name',
                     'pos_tagged_comments')

    def __init__(self, src_files):
        self.src_files = src_files

    def pos_tagging(self):
        """Add ``pos_tagged_comments``: comment words tagged with a kept POS tag.

        Must run before :meth:`tokenize`, because it tokenizes the raw
        ``comments`` string.
        """
        for src in self.src_files.values():
            tagged = nltk.pos_tag(nltk.word_tokenize(src.comments))
            src.pos_tagged_comments = [
                token for token, tag in tagged if tag in self._POS_TAGS
            ]

    def tokenize(self):
        """Replace ``all_content`` and ``comments`` with wordpunct token lists."""
        for src in self.src_files.values():
            src.all_content = nltk.wordpunct_tokenize(src.all_content)
            src.comments = nltk.wordpunct_tokenize(src.comments)

    def _split_camelcase(self, tokens):
        """Return a new token list with punctuation and camelCase splits applied.

        * A token containing punctuation is removed and replaced (at the end
          of the list) by its punctuation-separated parts; each part that is
          camelCase additionally contributes its lower-cased words.
        * A token without punctuation is kept, and its camelCase words (if
          more than one) are appended.

        The input is copied first, so the caller's list is left untouched.
        """
        result = tokens[:]

        for token in tokens:
            parts = self._PUNCT_RE.split(token)

            if len(parts) > 1:
                result.remove(token)
                for part in parts:
                    result.append(part)
                    words = inflection.underscore(part).split('_')
                    if len(words) > 1:
                        result += words
            else:
                words = inflection.underscore(token).split('_')
                if len(words) > 1:
                    result += words

        return result

    def split_camelcase_apply(self):
        """Apply :meth:`_split_camelcase` to every token field of every file."""
        for src in self.src_files.values():
            for field in self._TOKEN_FIELDS:
                setattr(src, field, self._split_camelcase(getattr(src, field)))

    def clean(self):
        """Strip punctuation and digits, drop emptied tokens, and lowercase."""
        # Translation table deleting every punctuation character and digit.
        punctnum_table = str.maketrans(
            {c: None for c in string.punctuation + string.digits})

        for src in self.src_files.values():
            for field in self._TOKEN_FIELDS:
                stripped = (token.translate(punctnum_table)
                            for token in getattr(src, field))
                setattr(src, field, [token.lower() for token in stripped if token])

    def remove_stopwords_keywords(self):
        """Drop English stop words and Java keywords from every token field.

        Relies on the module-level ``stop_words`` and ``java_keywords`` sets.
        """
        for src in self.src_files.values():
            for field in self._TOKEN_FIELDS:
                setattr(src, field, [
                    token for token in getattr(src, field)
                    if token not in stop_words and token not in java_keywords
                ])

    def stem(self):
        """Replace each token field with ``{'stemmed': [...], 'unstemmed': [...]}``.

        ``'unstemmed'`` holds the token list as it was before stemming and
        ``'stemmed'`` the Porter-stemmed form of the same tokens.
        """
        stemmer = PorterStemmer()

        for src in self.src_files.values():
            for field in self._TOKEN_FIELDS:
                tokens = getattr(src, field)
                setattr(src, field, {
                    'stemmed': [stemmer.stem(token) for token in tokens],
                    'unstemmed': tokens,
                })

    def preprocess(self):
        """Run the full pipeline.

        POS tagging comes first because it needs the raw ``comments`` string
        that :meth:`tokenize` replaces with a token list.
        """
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase_apply()
        self.clean()
        self.remove_stopwords_keywords()
        self.stem()

In [6]:
def main():
    """Preprocess source files and bug reports and pickle both results.

    Writes ``preprocessed_src.pickle`` and ``preprocessed_reports.pickle``
    under ``DATASET.root``, and prints the preprocessed source-file mapping
    in between.
    """
    parser = Parser(DATASET)
    protocol = pickle.HIGHEST_PROTOCOL

    # Source files: preprocess, persist, then echo the mapping.
    src_prep = SrcPreprocessing(parser.src_parser())
    src_prep.preprocess()
    src_pickle_path = DATASET.root / 'preprocessed_src.pickle'
    with open(src_pickle_path, 'wb') as out:
        pickle.dump(src_prep.src_files, out, protocol=protocol)
    print(src_prep.src_files)

    # Bug reports: preprocess and persist.
    report_prep = ReportPreprocessing(parser.report_parser())
    report_prep.preprocess()
    reports_pickle_path = DATASET.root / 'preprocessed_reports.pickle'
    with open(reports_pickle_path, 'wb') as out:
        pickle.dump(report_prep.bug_reports, out, protocol=protocol)

In [7]:
# Run the whole preprocessing pipeline; this writes both pickle files under
# DATASET.root and prints the preprocessed source-file mapping.
main()

OrderedDict([('org.apache.commons.codec.BinaryDecoder.java', <parsers.SourceFile object at 0x000002D46A6A46D0>), ('org.apache.commons.codec.BinaryEncoder.java', <parsers.SourceFile object at 0x000002D46A6A4660>), ('org.apache.commons.codec.CharEncoding.java', <parsers.SourceFile object at 0x000002D46A6A4740>), ('org.apache.commons.codec.Charsets.java', <parsers.SourceFile object at 0x000002D46A6A47B0>), ('org.apache.commons.codec.Decoder.java', <parsers.SourceFile object at 0x000002D46A6A4820>), ('org.apache.commons.codec.DecoderException.java', <parsers.SourceFile object at 0x000002D46A6A4BA0>), ('org.apache.commons.codec.Encoder.java', <parsers.SourceFile object at 0x000002D46A6A4C80>), ('org.apache.commons.codec.EncoderException.java', <parsers.SourceFile object at 0x000002D46A6A4890>), ('org.apache.commons.codec.StringDecoder.java', <parsers.SourceFile object at 0x000002D46A6A4900>), ('org.apache.commons.codec.StringEncoder.java', <parsers.SourceFile object at 0x000002D46A6A4970>),