In [1]:
import json
import pickle

import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import DATASET

Dataset(name='codec', root=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC'), src=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC/gitrepo'), bug_repo=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC/bugrepo/repository.xml'))


In [2]:

class Similarity:
    """Revised-VSM similarity between source files and bug reports."""

    __slots__ = ['src_files', 'src_strings']

    def __init__(self, src_files):
        """Keep the source files and build one token string per file.

        Each string is the space-joined concatenation of the file's stemmed
        file-name, class-name, method-name, POS-tagged-comment and attribute
        tokens, in that order.
        """
        self.src_files = src_files

        documents = []
        for source in src_files.values():
            tokens = (source.file_name['stemmed']
                      + source.class_names['stemmed']
                      + source.method_names['stemmed']
                      + source.pos_tagged_comments['stemmed']
                      + source.attributes['stemmed'])
            documents.append(' '.join(tokens))
        self.src_strings = documents

    def calculate_similarity(self, src_tfidf, reports_tfidf):
        """Score every source file against every bug report.

        The cosine similarity of each (source, report) pair is weighted by a
        logistic function of the min-max normalized source length (token
        count of the source string); the weighted scores of each report are
        then min-max normalized again.

        Returns a list with one entry per row of ``reports_tfidf``; each
        entry is a flat list of scores following the row order of
        ``src_tfidf``.
        """
        scaler = preprocessing.MinMaxScaler()

        # Length of every source document, as a single-column float matrix.
        token_counts = np.array(
            [len(text.split()) for text in self.src_strings], dtype=float
        ).reshape(-1, 1)
        scaled_lengths = scaler.fit_transform(token_counts)

        # Logistic length function.
        length_weight = 1 / (1 + np.exp(-12 * scaled_lengths))

        def score_report(report_vector):
            # Revised VSM score: cosine similarity weighted by source length.
            weighted = cosine_similarity(src_tfidf, report_vector) * length_weight
            # fit_transform re-fits the scaler on this report's scores.
            return scaler.fit_transform(weighted).ravel().tolist()

        return [score_report(report_vector) for report_vector in reports_tfidf]

    def find_similars(self, bug_reports):
        """Vectorize sources and reports with tf-idf and score them.

        The vectorizer is fitted on the source strings only; the bug reports
        (stemmed summary followed by stemmed description) are projected into
        that vocabulary before the similarities are calculated.
        """
        report_texts = []
        for report in bug_reports.values():
            report_tokens = report.summary['stemmed'] + report.description['stemmed']
            report_texts.append(' '.join(report_tokens))

        vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf=False)
        source_matrix = vectorizer.fit_transform(self.src_strings)
        report_matrix = vectorizer.transform(report_texts)

        return self.calculate_similarity(source_matrix, report_matrix)

In [3]:
# Shared buffer for the per-report similarity score lists; main() stores its
# results here and the evaluation cell reads them back.
listOfList = list()

In [5]:
def main():
    """Compute the VSM similarities for the configured dataset and save them.

    Loads the preprocessed source files and bug reports pickled under
    ``DATASET.root``, scores every source file against every bug report with
    ``Similarity``, stores the per-report score lists in the module-level
    ``listOfList``, prints how many reports were scored and writes the scores
    to ``vsm_similarity.json`` under ``DATASET.root``.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only load
    # pickles that were produced by this project's own preprocessing step.
    with open(DATASET.root / 'preprocessed_src.pickle', 'rb') as file:
        src_files = pickle.load(file)
    with open(DATASET.root / 'preprocessed_reports.pickle', 'rb') as file:
        bug_reports = pickle.load(file)

    sm = Similarity(src_files)
    simis = sm.find_similars(bug_reports)

    # Replace the buffer's contents in place instead of appending, so calling
    # main() again (e.g. for another dataset) cannot leave stale or duplicated
    # score lists behind for the evaluation cell.
    listOfList[:] = simis
    print(len(simis))

    # Saving similarities in a json file
    with open(DATASET.root / 'vsm_similarity.json', 'w') as file:
        json.dump(simis, file)


In [6]:
# Run the pipeline: fills `listOfList` and writes vsm_similarity.json.
main()

42


In [None]:
# Evaluation section: the cells below read `listOfList`, which main() fills in.

In [2]:
# path = os.getcwd()
# path = path.replace('\\buglocalizer','')
# path+="\\table.csv"
# path

'd:\\IRProj\\bug-localization-master\\master\\table.csv'

In [8]:
# Metric accumulators filled by the evaluation loop.
mrr, mean_avgp = [], []
# df = pd.read_csv(path)

In [14]:
# Evaluate the ranking produced for every bug report.
#
# NOTE(review): no ground-truth "fixed files" are available in this cell, so a
# source file is treated as relevant whenever its score is non-zero, and the
# value used as its "rank" is its 1-based index in the score list rather than
# its position in the sorted ranking. The numbers below are therefore only a
# proxy for true MRR/MAP -- confirm the intended definition.

# Start from empty accumulators so re-running this cell (e.g. after main() was
# run again) cannot mix in values from an earlier run.
mrr = []
mean_avgp = []

for the in listOfList:
    # (file index, score) pairs, best score first. The name `another` is kept
    # because the final cell displays it for the last report processed.
    another = sorted(
        ((i + 1, j) for i, j in enumerate(the)),
        key=lambda x: x[1],
        reverse=True,
    )

    # Indices of the files with a non-zero score, in descending-score order.
    relevent_ranks = [rank for rank, score in another if score != 0]

    if not relevent_ranks:
        # Nothing scored above zero for this report.
        mrr.append(0)
        mean_avgp.append(0)
        continue

    # Average precision: the precision at each relevant item, averaged per
    # report, so every report contributes exactly one value to the mean.
    someAnother = sorted(relevent_ranks)
    precisions = [(j + 1) / rank for j, rank in enumerate(someAnother)]
    mean_avgp.append(np.mean(precisions))

    # Reciprocal of the best-scoring file's index (indices start at 1, so the
    # division is always defined).
    mrr.append(1 / relevent_ranks[0])

some = ['VSM_similarity_score', DATASET.name, np.mean(mrr), np.mean(mean_avgp)]
# df.loc[len(df)] = some

In [None]:
# df.drop_duplicates(subset = "Dataset" ,keep = "first", inplace = True)
# df.to_csv(path,index=None)
# df

In [18]:
# `another` still holds the (file index, score) ranking of the last report
# processed by the evaluation loop.
print("top 10 retrieved files:")
another[:10]

top 10 retrieved files:


[(100, 1.0),
 (13, 0.8319460700351919),
 (16, 0.795660965076778),
 (49, 0.7040062788441495),
 (72, 0.6534156001092718),
 (44, 0.6400586654702703),
 (69, 0.6391384860174552),
 (14, 0.5856902418212637),
 (67, 0.5736021304115166),
 (17, 0.5635557689349642)]