In [1]:
import json
import pickle

import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
import pandas as pd
from datasets import DATASET

Dataset(name='codec', root=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC'), src=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC/gitrepo'), bug_repo=WindowsPath('d:/IRProj/bug-localization-master/master/../data/CODEC/bugrepo/repository.xml'))


In [2]:
# Selects which text field of a bug report feeds each branch of the pipeline.
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that turns each bug report into a single text string.

    Parameters
    ----------
    key : str
        Which text to extract from every report:

        * ``'summary'``   -- ``report.summary['stemmed']``
        * ``'postagged'`` -- ``report.pos_tagged_summary['stemmed']`` followed
          by ``report.pos_tagged_description['stemmed']``

        In both cases the entries are joined with single spaces
        (presumably they are lists of token strings -- verify against the
        preprocessing step that builds the reports).
    """

    def __init__(self, key):
        # Stored as-is; the value is checked when transform() runs.
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, data):
        """Return a list holding one space-joined string per report in ``data``.

        Raises
        ------
        ValueError
            If ``self.key`` is neither ``'summary'`` nor ``'postagged'``.
        """
        if self.key == 'summary':
            return [' '.join(r.summary['stemmed']) for r in data]

        if self.key == 'postagged':
            return [
                ' '.join(r.pos_tagged_summary['stemmed'] + r.pos_tagged_description['stemmed'])
                for r in data
            ]

        # Fail loudly here instead of handing None to the next pipeline step.
        raise ValueError(
            "Unknown FeatureSelector key %r; expected 'summary' or 'postagged'" % (self.key,)
        )

# multi-label classification using multinomial naive bayes
def multilabel_clf(train_set, test_set, src_keys):
    """Score every source file for one bug report with a one-vs-rest naive Bayes model.

    Parameters
    ----------
    train_set : sequence of bug reports
        Reports used for fitting; each must expose ``fixed_files`` plus the
        text attributes read by ``FeatureSelector``.
    test_set : sequence of bug reports
        Reports to score. Only the probabilities of the first one are used.
    src_keys : sized iterable
        Source-file identifiers that define the order of the returned scores.

    Returns
    -------
    list
        One value per entry of ``src_keys``: the predicted probability for
        that file, or ``0`` when the file never appears as a label in
        ``train_set``. All zeros when ``train_set`` has at most one report.
    """
    # With zero or one training report nothing is fitted; score everything 0.
    if len(train_set) <= 1:
        return [0] * len(src_keys)

    # Classes need to be binarized for the classifier
    binarizer = MultiLabelBinarizer()
    train_labels = binarizer.fit_transform([report.fixed_files for report in train_set])

    summary_branch = Pipeline([
        ('summary', FeatureSelector('summary')),
        ('summ_tfidf', TfidfVectorizer(sublinear_tf=True, lowercase=False)),
    ])
    postagged_branch = Pipeline([
        ('postagged', FeatureSelector('postagged')),
        ('summ_desc_tfidf', TfidfVectorizer(sublinear_tf=True, lowercase=False)),
    ])
    classifier = Pipeline([
        ('feats', FeatureUnion([
            ('summ', summary_branch),
            ('summ_desc', postagged_branch),
        ])),
        ('clf', OneVsRestClassifier(MultinomialNB())),
    ])
    classifier.fit(train_set, train_labels)

    # Probabilities for the first test report, keyed by the label they belong to.
    first_report_probas = classifier.predict_proba(test_set)[0]
    proba_by_file = dict(zip(binarizer.classes_, first_report_probas))

    # Re-order to follow src_keys; files unseen during training get 0.
    return [proba_by_file.get(name, 0) for name in src_keys]


In [3]:
# preparing the train and test sets based on previously fixed bugs
def prepare_clf(bug_reports):
    """Compute min-max normalised source-file scores for every bug report.

    Each report is scored by ``multilabel_clf`` fitted on the reports that
    precede it in ``bug_reports`` (so the model is refitted once per report).

    Parameters
    ----------
    bug_reports : mapping
        Its ``values()`` are the bug reports, taken in iteration order.

    Returns
    -------
    list of list of float
        One inner list per report, with one normalised score per key of the
        source-file mapping loaded from ``preprocessed_src.pickle``.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(DATASET.root / 'preprocessed_src.pickle', 'rb') as src_pickle:
        src_files = pickle.load(src_pickle)

    reports = list(bug_reports.values())
    scaler = MinMaxScaler()

    probabilities = []
    for index in range(len(reports)):
        # Fit on everything before this report, score this report only.
        raw_scores = multilabel_clf(reports[:index], [reports[index]], src_files.keys())

        # MinMaxScaler expects a 2-D column; flatten it again afterwards.
        column = np.array([float(score) for score in raw_scores]).reshape(-1, 1)
        scaled = scaler.fit_transform(column)
        probabilities.append(np.concatenate(scaled).tolist())

    return probabilities



In [4]:
# Module-level store for the per-report score lists; main() populates it and
# the evaluation cell further down iterates over it.
listoflist = list()

In [5]:

def main():
    """Score all bug reports, publish the scores and save them as JSON.

    Reads ``preprocessed_reports.pickle`` from ``DATASET.root``, runs
    ``prepare_clf`` on it, stores the resulting per-report score lists in the
    module-level ``listoflist`` and writes them to ``fixed_bug_reports.json``
    in the same directory. Returns ``None``.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(DATASET.root / 'preprocessed_reports.pickle', 'rb') as file:
        bug_reports = pickle.load(file)

    probabilities = prepare_clf(bug_reports)

    # Replace the contents instead of appending so that re-running this cell
    # does not leave duplicated reports behind. Slice assignment mutates the
    # existing list object, so no `global` statement is needed.
    listoflist[:] = probabilities

    with open(DATASET.root / 'fixed_bug_reports.json', 'w') as file:
        json.dump(probabilities, file)


In [6]:
# Run the scoring pass: fills `listoflist` and writes fixed_bug_reports.json.
main()

In [7]:
# Location of the results table in the current working directory.
# os.path.join picks the right separator for the host OS; a hard-coded "\\"
# only produces a valid path on Windows.
path = os.path.join(os.getcwd(), "table.csv")
# path

In [8]:
# Accumulators for the evaluation loop: one reciprocal-rank value per report
# in `mrr`, average-precision values in `mean_avgp`.
mrr, mean_avgp = [], []

# Load the CSV found at `path`; the only later uses of `df` visible in this
# notebook are commented out.
df = pd.read_csv(path)

In [9]:
# Evaluate every report's score list and aggregate the results.
#
# NOTE(review): no ground truth is used in this cell. A source file counts as
# "relevant" when its normalised score is non-zero, and the value used as its
# "rank" is its 1-based index in the score list, not its position in the
# score-sorted ranking. A conventional MRR/MAP would compare the ranking with
# each report's fixed_files -- confirm which metric is intended.

# Re-create the accumulators here so re-running this cell never double-counts.
mrr = []
mean_avgp = []
# Bound up front so the name exists even when listoflist is empty; after the
# loop it holds the ranking of the last report processed.
another = []

for scores in listoflist:
    # (1-based file index, score) pairs, highest score first (stable for ties).
    another = sorted(enumerate(scores, start=1), key=lambda pair: pair[1], reverse=True)
    relevant_ranks = sorted(index for index, score in another if score != 0)

    if not relevant_ranks:
        # Nothing scored above zero: the report contributes 0 to both metrics.
        mrr.append(0)
        mean_avgp.append(0)
        continue

    # Indices start at 1, so the division is always defined.
    mrr.append(1 / relevant_ranks[0])

    # One average-precision value per report (mean of hits-so-far / rank), so
    # that the final mean weighs every report equally.
    precisions = [hits / rank for hits, rank in enumerate(relevant_ranks, start=1)]
    mean_avgp.append(float(np.mean(precisions)))

# Guard the empty case: np.mean([]) would be NaN and emit a RuntimeWarning.
some = [
    'Fixed_bug_reports_score',
    DATASET.name,
    float(np.mean(mrr)) if mrr else 0.0,
    float(np.mean(mean_avgp)) if mean_avgp else 0.0,
]
# df.loc[len(df)] = some

In [10]:
# # df.drop_duplicates(subset = "Dataset" ,keep = "first", inplace = True)
# df.to_csv(path,index=None)
# df

In [11]:
# `another` is whatever the evaluation loop left behind, i.e. the
# (file index, score) ranking of the last report it processed.
print("top 10 retrieved files:")
another[:10]

top 10 retrieved files:


[(69, 1.0),
 (15, 0.8846477327502872),
 (16, 0.31933867909554614),
 (72, 0.2829622017106717),
 (71, 0.267583431590545),
 (26, 0.17662314845338462),
 (12, 0.1705708315829426),
 (20, 0.15321705059345345),
 (17, 0.14837180456720125),
 (19, 0.13213286309189892)]