In [8]:
# Running it only for one specific bug and test file to check the behaviour
bug_report_file = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/bug-reports/commons-cli/commons-cli-CLI-133.json"
defects4j_id = "Cli-5"
buggy_commit = "2b0a94aee899d9e7d855c402ad40eb4e318f46e7"
repo_folder = "/Users/lorenapacheco/Concordia/Masters/open_source_repos_being_studied/commons-cli"
test_file = "/Users/lorenapacheco/Concordia/Masters/open_source_repos_being_studied/commons-cli/src/test/org/apache/commons/cli/ValuesTest.java"

## NLKT

In [28]:
import subprocess
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import *

def preprocess(file):
    return subprocess.check_output(['java', '-jar', '../lib/preprocessing.jar', file]).decode("utf-8")

def stop_words_removal(voc):
    stop_words = set(stopwords.words('english'))
    tokens = [w.lower() for w in voc if not w.lower() in stop_words and not is_symbol(w)]
    return tokens

def is_symbol(word):
    return re.search('\w+', word) == None

def load_java_keywords():
    file = os.path.join(os.path.abspath(''), '/Users/lorenapacheco/Concordia/Masters/BugReportsMining/textualSimilarity/java_keywords.txt')
    f = open(file, "r")
    if f.mode == "r":
        contents = f.read()
        return contents.split('\n')
    return None

def java_keywords_removal(voc):
    java_keywords = load_java_keywords()
    if java_keywords == None:
        print('java_keywords.txt unavailable')
        return
    return [w for w in voc if not w in java_keywords]

def stemming(voc):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in voc]

def tokenize(source):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(source)

def code_to_corpus(source):
    package = source.splitlines()[-1]
    voc = tokenize(source.replace('_', ' '))
    voc = camel_case_split(voc)
    voc = java_keywords_removal(voc)
    voc = stop_words_removal(voc)
    voc = [w for w in voc if not w.isdigit()]
    voc = stemming(voc)
    return voc, package

def camel_case_split(voc):
    new_voc = []
    for word in voc:
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        new_voc.extend([m.group(0) for m in matches])
    return new_voc

def bug_to_corpus(content):
    voc = content.split(' ')
    voc = tokenize(content.replace('_', ' '))
    voc = camel_case_split(voc)
    voc = java_keywords_removal(voc)
    voc = stop_words_removal(voc)
    voc = [w for w in voc if not w.isdigit()]
    voc = stemming(voc)
    return voc

## SKlearn

In [35]:

import numpy as np

from itertools import islice
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def take(n, iterable):
    # Return first n items of the iterable as a list
    l = list(islice(iterable, n))
    new_dict = {}
    for item in l:
        new_dict[item[0]] = item[1]
    return new_dict

def sort_scores(docs_dic, sim_scores):
    n = len(docs_dic)
    sort_a = sim_scores.argsort()[-n:][::-1]
    return sort_a

def get_cosine_sim(docs):
    vectors = [t for t in get_vectors(docs)]
    return cosine_similarity(vectors)

def get_vectors(text):
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def compute_rvsm(chunk_size, bug_voc, code_voc):
    bug_str = ' '.join(bug_voc)
    docs = [bug_str]
    docs_dic = {0: 'self'}
    for corpus in code_voc:
        if len(corpus.get_voc()) >= chunk_size:
            voc = corpus.get_voc()
            while len(voc) >= chunk_size:
                text = ' '.join(voc[0:chunk_size+1])
                voc = voc[chunk_size+1:]
                docs.append(text)
                docs_dic[len(docs_dic)] = corpus.get_file()
        else:
            text = ' '.join(corpus.get_voc())
            docs.append(text)
            docs_dic[len(docs_dic)] = corpus.get_file()

    sim_scores = get_cosine_sim(docs)[0]
    sorted_index = sort_scores(docs_dic, sim_scores)

    rvsm_rank = {}
    for i in sorted_index:
        if docs_dic[i] in rvsm_rank:
            rvsm_rank[docs_dic[i]] = sim_scores[i] if sim_scores[i] > rvsm_rank[docs_dic[i]] else rvsm_rank[docs_dic[i]]
            continue
        if i == 0:
            continue
        rvsm_rank[docs_dic[i]] = sim_scores[i]

    return rvsm_rank

## Testing for the bug CLI-5

In [41]:
import json
import os

def json_file_to_dict(file):
    data = {}
    with open(file, 'r') as fp:
        data = json.load(fp)
    fp.close()
    return data

def read_file(file):
    file_content = ""
    with open(test_file, 'r') as file:
        file_content = file.read()
    file.close()
    return file_content

# Getting the bug report textual content
bug_report_content = json_file_to_dict(bug_report_file)
title = bug_report_content['title'] if bug_report_content['title'] else ""
description = bug_report_content['body'] if bug_report_content['body'] else ""
bug_text_content = title + '\n' + description + '\n' + '\n'.join([comment['body'] for comment in bug_report_content['comments_content'] if 'body' in comment and comment['body']])
bug_corpus = bug_to_corpus(bug_text_content)

# Getting the test textual content
os.chdir(repo_folder)
checkout_command = "git checkout  --quiet " + buggy_commit
os.system(checkout_command)
test_file_content = read_file(test_file)
code_corpus = code_to_corpus(test_file_content)

compute_rvsm(800, bug_corpus, code_corpus) # segment size = 800 tokens in Pathidea

AttributeError: 'list' object has no attribute 'get_voc'