# Introduction

In this notebook we demonstrate the use of the **Word Embeddings (Word2Vec)** technique in Information Retrieval to recover trace links between Test Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together compose a single query.
* The content of each test case is treated as a single document that must be returned for the query made.

## Import Libraries

In [9]:
import pandas as pd
import numpy as np
import spacy

from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, pairwise_distances, pairwise
from sklearn.externals.joblib import Parallel, delayed

from utils import plots
from utils import firefox_dataset as fd
from utils import tokenizers as tok
from utils import aux_functions
from utils import model_evaluator as m_eval

from models.wordvec import WordVec_BasedModel
from models.model_hyperps import WordVec_Model_Hyperp

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [10]:
# Read the test cases and the bug reports through the project dataset helpers.
test_cases_df = fd.read_testcases_df()
bug_reports_df = fd.read_bugreports_df()

# Description columns act as corpus / query; name columns identify each entry.
corpus, test_cases_names = test_cases_df.tc_desc, test_cases_df.tc_name
query, bug_reports_names = bug_reports_df.br_desc, bug_reports_df.br_name

# Trace data returned by fd.read_trace_df(), indexed by test case name.
orc = fd.read_trace_df().set_index('tc_name', drop=True)

# Shapes are reported in the order: bug reports, test cases, trace data.
print(bug_reports_df.shape, test_cases_df.shape, orc.shape, sep='\n')

(35314, 12)
(207, 10)
(207, 35314)


### Select Subset

In [6]:
# Sample 15 bug reports whose Version is '48 Branch' or '60 Branch' (fixed seed).
bugreports_subset_df = (
    bug_reports_df[bug_reports_df.Version.isin(['48 Branch', '60 Branch'])]
    .sample(15, random_state=42)
)

# Sample 10 test cases whose TestDay contains 20161014 or 20161028 (fixed seed).
testcases_subset_df = (
    test_cases_df[(test_cases_df.TestDay.str.contains('20161014'))
                  | (test_cases_df.TestDay.str.contains('20161028'))]
    .sample(10, random_state=1000)
)

selected_testcases = ['TC_{}_TRG'.format(tc_num) for tc_num in [13, 14, 15, 16, 17, 18]]  # should link with 48 Branch
aux_tc = test_cases_df[test_cases_df.tc_name.isin(selected_testcases)]

# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent. drop_duplicates() is chained rather than applied in place
# so that the frame is built in a single expression.
tc_subset_df = pd.concat([testcases_subset_df, aux_tc]).drop_duplicates()

corpus_subset = tc_subset_df.tc_desc
query_subset = bugreports_subset_df.br_desc
testcases_names_subset = tc_subset_df.tc_name
bug_reports_names_subset = bugreports_subset_df.br_name

# Restrict the trace data to the selected test cases (rows) and bug reports (columns).
orc_subset_df = orc.loc[testcases_names_subset, bug_reports_names_subset]

print('TestCases Subset Shape: {}'.format(tc_subset_df.shape))
print('BugReports Subset Shape: {}'.format(bugreports_subset_df.shape))
print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

TestCases Subset Shape: (14, 10)
BugReports Subset Shape: (15, 12)
Oracle Subset Shape: (14, 15)


## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (Word2Vec), we use common metrics applied in the field of IR:

* Precision
* Recall
* F1-score

### Analysis with Default Values of WordVec Model

In [6]:
# Run the model with its default hyperparameters on the full corpus and query set.
# `test_cases_names` (from the "Load Dataset" cell) replaces `use_cases_names`,
# which is never defined in this notebook and raises NameError on a fresh kernel.
best_model = WordVec_BasedModel()
best_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

# `orc` is passed directly, the same way run_wordvec_model hands its oracle frame
# to ModelEvaluator.
# NOTE(review): the previous `orc.oracle` presumably targeted an older wrapper
# object; confirm that ModelEvaluator expects the trace DataFrame itself.
evaluator = m_eval.ModelEvaluator(orc, best_model)
evaluator.evaluate_model(verbose=True)

{'Measures': {'Mean FScore of WordVec': 0.2714285714285714,
              'Mean Precision of WordVec': 0.19047619047619047,
              'Mean Recall of WordVec': 0.5},
 'Setup': [{'Name': 'WordVec'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 3},
           {'Tokenizer': <utils.tokenizers.WordNetBased_LemmaTokenizer object at 0x7f619f22b240>}]}


## Running WordVec_Based Model with Different Types of Oracles

### Strong and Weak Links Datasets

In [11]:
# Load the three BR/TC link files (strong, weak and mixed) in a single pass.
br_tc_strong_df, br_tc_weak_df, br_tc_mix_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Strong.csv',
        '../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Weak.csv',
        '../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Mix.csv',
    )
)

# One shape per line, in the order strong, weak, mix.
print(br_tc_strong_df.shape, br_tc_weak_df.shape, br_tc_mix_df.shape, sep='\n')

(10, 4)
(10, 4)
(10, 4)


### Define **run_wordvec_model()** Function

In [12]:
def run_wordvec_model(selected_tcs, selected_brs,
                      sim_measure_min_threshold=('cosine', .80),
                      top=5,
                      tokenizer=None,
                      model_name='WordVec_Model_0'):
    """Run a WordVec-based model on selected test cases / bug reports and show the results.

    The function reads the notebook-level frames ``test_cases_df``,
    ``bug_reports_df`` and ``orc``. It prints the subset shapes, recovers the
    links, prints the evaluation and displays both the recovered trace-links
    matrix and the matching slice of the oracle.

    Parameters
    ----------
    selected_tcs : list-like of str
        Test case names matched against ``test_cases_df.tc_name``.
    selected_brs : list-like of str
        Bug report names matched against ``bug_reports_df.br_name``.
    sim_measure_min_threshold : tuple, optional
        Value passed as the ``SIM_MEASURE_MIN_THRESHOLD`` hyperparameter.
        Defaults to ``('cosine', .80)``.
    top : int, optional
        Value passed as the ``TOP`` hyperparameter. Defaults to ``5``.
    tokenizer : object, optional
        Value passed as the ``TOKENIZER`` hyperparameter. When ``None``, a new
        ``tok.PorterStemmerBased_Tokenizer()`` is created for the call.
    model_name : str, optional
        Name assigned to the model through ``set_name``.

    Returns
    -------
    None
        Results are only printed / displayed.
    """
    # Build the default tokenizer per call rather than sharing one instance
    # through a default argument value.
    if tokenizer is None:
        tokenizer = tok.PorterStemmerBased_Tokenizer()

    tcs_df = test_cases_df[test_cases_df.tc_name.isin(selected_tcs)]
    brs_df = bug_reports_df[bug_reports_df.br_name.isin(selected_brs)]

    corpus_subset = tcs_df.tc_desc
    query_subset = brs_df.br_desc
    testcases_names_subset = tcs_df.tc_name
    bug_reports_names_subset = brs_df.br_name
    # Oracle rows are test cases and columns are bug reports.
    orc_subset_df = orc.loc[testcases_names_subset, bug_reports_names_subset]

    print('TestCases Subset Shape: {}'.format(tcs_df.shape))
    print('BugReports Subset Shape: {}'.format(brs_df.shape))
    print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

    wv_hyperp = {
        WordVec_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : sim_measure_min_threshold,
        WordVec_Model_Hyperp.TOP.value : top,
        WordVec_Model_Hyperp.TOKENIZER.value : tokenizer
    }

    wv_model = WordVec_BasedModel(**wv_hyperp)
    wv_model.set_name(model_name)
    wv_model.recover_links(corpus_subset, query_subset, testcases_names_subset, bug_reports_names_subset)

    print("\nModel Evaluation -------------------------------------------")
    evaluator = m_eval.ModelEvaluator(orc_subset_df, wv_model)
    evaluator.evaluate_model(verbose=True)

    print("\n\nTraceLinks Matrix --------------------------------------")
    display(aux_functions.highlight_df(wv_model.get_trace_links_df()))

    print("\n\nOracle -----------------------------------------")
    display(aux_functions.highlight_df(orc_subset_df))

### Oracle with Strong Links Only

In [17]:
# Turn the TC / BR numbers listed in the strong-links file into full names.
selected_tcs = list(map('TC_{}_TRG'.format, br_tc_strong_df.TC.values))
selected_brs = list(map('BR_{}_SRC'.format, br_tc_strong_df.BR.values))

run_wordvec_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (6, 12)
Oracle Subset Shape: (10, 6)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of WordVec_Model_0': 0.5277777777777777,
              'Mean Precision of WordVec_Model_0': 0.6,
              'Mean Recall of WordVec_Model_0': 0.4920634920634921},
 'Setup': [{'Name': 'WordVec_Model_0'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 5},
           {'Tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object at 0x7fa27ed01c50>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1298575_SRC,BR_1313805_SRC,BR_1320658_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TC_118_TRG,0,0,0,0,0,0
TC_120_TRG,1,0,0,1,1,0
TC_121_TRG,1,0,0,1,1,0
TC_143_TRG,0,1,0,0,0,1
TC_155_TRG,0,1,0,0,0,1
TC_172_TRG,0,0,0,1,0,0
TC_181_TRG,0,0,0,1,0,1
TC_183_TRG,1,1,0,1,1,1
TC_196_TRG,1,1,0,0,1,1
TC_197_TRG,1,1,0,0,1,0




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1298575_SRC,BR_1313805_SRC,BR_1320658_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TC_118_TRG,1,0,0,0,0,0
TC_120_TRG,1,0,0,0,0,0
TC_121_TRG,1,0,0,0,0,0
TC_143_TRG,0,1,1,1,1,1
TC_155_TRG,0,1,1,1,1,1
TC_172_TRG,0,1,1,1,1,1
TC_181_TRG,0,1,1,1,1,1
TC_183_TRG,0,1,1,1,1,1
TC_196_TRG,0,1,1,1,1,1
TC_197_TRG,0,1,1,1,1,1


### Oracle with Weak Links Only

In [16]:
# Turn the TC / BR numbers listed in the weak-links file into full names.
selected_tcs = list(map('TC_{}_TRG'.format, br_tc_weak_df.TC.values))
selected_brs = list(map('BR_{}_SRC'.format, br_tc_weak_df.BR.values))

run_wordvec_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (5, 12)
Oracle Subset Shape: (10, 5)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of WordVec_Model_0': 0.44761904761904764,
              'Mean Precision of WordVec_Model_0': 0.4800000000000001,
              'Mean Recall of WordVec_Model_0': 0.48571428571428565},
 'Setup': [{'Name': 'WordVec_Model_0'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 5},
           {'Tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object at 0x7fa27ed01048>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,0,1,0,0,1
TC_75_TRG,1,1,0,1,0
TC_105_TRG,1,1,1,1,1
TC_154_TRG,0,0,0,0,0
TC_155_TRG,1,0,1,0,1
TC_174_TRG,1,0,1,1,0
TC_196_TRG,0,1,1,1,0
TC_197_TRG,1,1,1,1,0
TC_200_TRG,0,0,0,0,1
TC_207_TRG,0,0,0,0,1




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,0,0,0
TC_75_TRG,0,1,0,0,0
TC_105_TRG,0,1,0,0,0
TC_154_TRG,0,0,1,1,1
TC_155_TRG,0,0,1,1,1
TC_174_TRG,0,0,1,1,1
TC_196_TRG,0,0,1,1,1
TC_197_TRG,0,0,1,1,1
TC_200_TRG,0,0,1,1,1
TC_207_TRG,0,0,1,1,1


### Oracle with Mixed Links (Strong and Weak)

In [15]:
# Turn the TC / BR numbers listed in the mixed-links file into full names.
selected_tcs = list(map('TC_{}_TRG'.format, br_tc_mix_df.TC.values))
selected_brs = list(map('BR_{}_SRC'.format, br_tc_mix_df.BR.values))

run_wordvec_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (5, 12)
Oracle Subset Shape: (10, 5)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of WordVec_Model_0': 0.4045454545454545,
              'Mean Precision of WordVec_Model_0': 0.4,
              'Mean Recall of WordVec_Model_0': 0.4333333333333333},
 'Setup': [{'Name': 'WordVec_Model_0'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 5},
           {'Tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object at 0x7fa29c88d4a8>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,0,1,0,0,1
TC_105_TRG,1,1,1,1,1
TC_118_TRG,1,1,1,0,0
TC_121_TRG,1,1,1,1,1
TC_143_TRG,0,0,0,0,1
TC_154_TRG,0,0,0,0,0
TC_155_TRG,0,0,0,0,1
TC_172_TRG,1,0,1,1,0
TC_174_TRG,1,0,1,1,0
TC_197_TRG,0,1,0,1,0




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,0,0,0
TC_105_TRG,0,1,0,0,0
TC_118_TRG,0,1,0,0,0
TC_121_TRG,0,1,0,0,0
TC_143_TRG,0,0,1,1,1
TC_154_TRG,0,0,1,1,1
TC_155_TRG,0,0,1,1,1
TC_172_TRG,0,0,1,1,1
TC_174_TRG,0,0,1,1,1
TC_197_TRG,0,0,1,1,1


### General Test

In [8]:
# Sample 15 bug reports from the '48 Branch' / '60 Branch' versions (fixed seed).
bugreports_subset_df = (
    bug_reports_df
    .loc[lambda brs: (brs.Version == '48 Branch') | (brs.Version == '60 Branch')]
    .sample(15, random_state=42)
)
# Sample 10 test cases whose TestDay contains 20161014 or 20161028 (fixed seed).
testcases_subset_df = (
    test_cases_df
    .loc[lambda tcs: (tcs.TestDay.str.contains('20161014')) | (tcs.TestDay.str.contains('20161028'))]
    .sample(10, random_state=1000)
)

selected_testcases = list(map('TC_{}_TRG'.format, [13, 14, 15, 16, 17, 18]))  # should link with 48 Branch
aux_tc = test_cases_df.loc[test_cases_df.tc_name.isin(selected_testcases)]

# NOTE(review): testcases_subset_df and aux_tc are not used by the call below;
# only the six hand-picked test cases and the sampled bug reports are evaluated.
selected_bugreports = bugreports_subset_df.br_name
run_wordvec_model(selected_testcases, selected_bugreports)


TestCases Subset Shape: (6, 10)
BugReports Subset Shape: (15, 12)
Oracle Subset Shape: (6, 15)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of WordVec_Model_0': 0.218989898989899,
              'Mean Precision of WordVec_Model_0': 0.26666666666666666,
              'Mean Recall of WordVec_Model_0': 0.1888888888888889},
 'Setup': [{'Name': 'WordVec_Model_0'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 5},
           {'Tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object at 0x7fa29b7d8208>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1268934_SRC,BR_1282551_SRC,BR_1291175_SRC,BR_1299787_SRC,BR_1418983_SRC,BR_1432520_SRC,BR_1436749_SRC,BR_1443632_SRC,BR_1443754_SRC,BR_1450216_SRC,BR_1461828_SRC,BR_1463274_SRC,BR_1463735_SRC,BR_1497738_SRC,BR_1513270_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_13_TRG,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
TC_14_TRG,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
TC_15_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_16_TRG,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1
TC_17_TRG,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1
TC_18_TRG,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1268934_SRC,BR_1282551_SRC,BR_1291175_SRC,BR_1299787_SRC,BR_1418983_SRC,BR_1432520_SRC,BR_1436749_SRC,BR_1443632_SRC,BR_1443754_SRC,BR_1450216_SRC,BR_1461828_SRC,BR_1463274_SRC,BR_1463735_SRC,BR_1497738_SRC,BR_1513270_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_13_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_14_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_15_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_16_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_17_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_18_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
