In [None]:
# default_exp experiment.mining.ir.unsupervised.w2v

# Neural Unsupervised Approaches for SE Traceability [approach w2v]

> Just Paper. Full Experimentation. This module is dedicated to evaluating word2vec/doc2vec or any other neural unsupervised approach on traceability datasets. Consider copying the entire notebook for a new and separate empirical evaluation.
> Implementing mutual information analysis
> Author: @danaderp April 2020
> Author: @danielrc Nov 2020

This copy is for Cisco purposes. It was adapted to process private GitHub data from Cisco.

In [None]:
from ds4se.mining.ir import *

In [None]:
from prg import prg

In [None]:
import ds4se as ds

In [None]:
import numpy as np

In [None]:
#??
import gensim
import pandas as pd
from itertools import product 
from random import sample 
import functools 
import os
from enum import Enum, unique, auto

In [None]:
#export
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from pandas.plotting import lag_plot
import math as m
import random as r
import collections
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [None]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
#export
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#export
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Artifacts Similarity with BasicSequenceVectorization

We test different similarity measures based on [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

# Experiments Set-up

In [None]:
# Root of the dataset checkout. The other paths in this notebook are built by
# string concatenation (`path_data + '...'`), so the trailing slash is required.
path_data = '../dvc-ds4se/' #dataset path

In [None]:

# Trained word2vec model referenced by libest_params() and libest_params_bpe();
# those functions look this global up when they are called, not when they are defined.
path_to_trained_model = path_data+'models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model'
# Alternative model (path_data is a str, so it must be joined with `+`, not `/`):
#path_to_trained_model = path_data+'models/wv/bpe128k/[word2vec-Java-Py-Wiki-SK-500-20E-128k[15]-1595189771.501188].model'

In [None]:
# Prefix of the BPE model stored under models/bpe/sentencepiece; libest_params()
# passes it on as "model_prefix".
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'

In [None]:
#Experiment 1 with Libest Conv preprocessing
def libest_params():
    """Return the parameter dictionary for Experiment 1 (LibEST, conventional preprocessing).

    The module-level `path_data`, `path_to_trained_model` and
    `path_model_prefix` variables are read each time the function is called,
    so the result reflects their values at call time.

    Returns:
        dict: the settings this notebook passes as `params=`.
    """
    # How the corpus CSV is located and read.
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }

    params = {}
    params["vectorizationType"] = VectorizationType.word2vec
    params["linkType"] = LinkType.req2src
    params["system"] = 'libest'
    params["path_to_trained_model"] = path_to_trained_model
    params["source_type"] = SoftwareArtifacts.REQ.value
    params["target_type"] = SoftwareArtifacts.TC.value
    params["system_path_config"] = corpus_config
    # NOTE(review): no trailing slash here; the recorded logs show result files
    # named 'traceability[libest-...].csv' -- confirm that is intended.
    params["saving_path"] = path_data + 'se-benchmarking/traceability'
    params["names"] = ['Source','Target','Linked?']
    params["model_prefix"] = path_model_prefix #For BPE Analysis
    params["path_mappings"] = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'
    return params

In [None]:
#Experiment 2 with Libest BPE preprocessing
def libest_params_bpe():
    """Return the parameter dictionary for Experiment 2 (LibEST, BPE preprocessing).

    Reads the module-level `path_data` and `path_to_trained_model` variables
    at call time. Unlike libest_params(), no "path_mappings" entry is set.

    Returns:
        dict: the settings this notebook passes as `params=`.
    """
    corpus_csv = path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv'
    results_dir = path_data + 'se-benchmarking/traceability'
    bpe_prefix = path_data + 'models/bpe/sentencepiece/wiki_py_java_bpe_8k'

    return dict(
        vectorizationType=VectorizationType.word2vec,
        linkType=LinkType.req2src,
        system='libest',
        path_to_trained_model=path_to_trained_model,
        source_type='req', #TODO Standardize the artifacts
        target_type='tc',
        #"path_mappings": 'cisco/libest_data/sacp-pr-mappings.csv',
        system_path_config=dict(
            system_path=corpus_csv,
            sep='~',
            names=['ids','bpe128k'],
            prep=Preprocessing.bpe,
        ),
        saving_path=results_dir,
        names=['Source','Target','Linked?'],
        model_prefix=bpe_prefix, #For BPE Analysis
    )

In [None]:
#CISCO GitHub Parameters
def sacp_params():
    """Return the parameter dictionary for the CISCO GitHub (sacp-python-common) experiment.

    Uses conventional preprocessing and a fixed word2vec model path built from
    the module-level `path_data` string, which is read at call time.

    Returns:
        dict: the settings this notebook passes as `params=`.
    """
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_data + 'models/wv/conv/[word2vec-Py-Java-Wiki-SK-500-20E[0]-1592979270.711115].model',
        "source_type": 'pr', #TODO Standardize the artifacts 
        "target_type": 'py',
        "path_mappings": '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
        "system_path_config": {
            "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1596383717.992744].csv', #MUST have bpe8k <----
            "sep": '~',
            "names": ['ids','conv'],
            "prep": Preprocessing.conv
        },
        # path_data is a plain str, so it must be joined with `+`;
        # `path_data/'...'` raises TypeError (str does not support `/`).
        "saving_path": path_data + 'se-benchmarking/traceability/cisco/sacp',
        "names": ['Source','Target','Linked?']
    }

In [None]:
path_to_trained_model = path_data + 'models/wv/bpe8k/[word2vec-Java-Py-Wiki-SK-500-20E-8k[12]-1594546477.788739].model'

In [None]:
def sacp_params_bpe():
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_to_trained_model,
        "source_type": 'pr', #TODO Standardize the artifacts 
        "target_type": 'py',
        "path_mappings": '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
        "system_path_config": {
            "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1596383717.992744].csv',
            "sep": '~',
            "names": ['ids','bpe8k'],
            "prep": Preprocessing.bpe
        },
        "saving_path": path_data + 'se-benchmarking/traceability/cisco/sacp',
        "names": ['Source','Target','Linked?'],
        "model_prefix":path_data + 'models/bpe/sentencepiece/wiki_py_java_bpe_8k' #For BPE Analysis
    }

In [None]:
# Select the experiment configuration used by the cells below (vectorizer
# construction and LoadLinks); keep exactly one assignment uncommented.
#parameters = default_params()
parameters = libest_params()
#parameters = _params()
#parameters = sacp_params_bpe()
#parameters = libest_params_bpe()
# Display the chosen configuration.
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.req2src: 2>,
 'system': 'libest',
 'path_to_trained_model': '../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model',
 'source_type': 'req',
 'target_type': 'tc',
 'system_path_config': {'system_path': '../dvc-ds4se/se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
  'sep': '~',
  'names': ['ids', 'conv'],
  'prep': <Preprocessing.conv: 1>},
 'saving_path': '../dvc-ds4se/se-benchmarking/traceability',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_8k',
 'path_mappings': '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'}

# Artifacts Similarity with Word2Vec

In [None]:
#[step 1]Creating the Vectorization Class
# The instance must be bound to `word2vec`: every later step
# (ComputeDistanceArtifacts, SaveLinks, MatchWithGroundTruth, ...) calls
# methods on that name, and on a fresh kernel nothing else defines it.
# The recorded logs for this cell load a Word2Vec model, so the word2vec
# vectorizer is used. NOTE(review): confirm the class name against ds4se.mining.ir.
word2vec = ds.mining.ir.Word2VecSeqVect( params = parameters )

2020-12-16 02:24:17,829 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:24:17,886 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)
2020-12-16 02:24:17,888 : INFO : conventional preprocessing documents, dictionary, and vocab for the test corpus
2020-12-16 02:24:17,889 : INFO : loading Word2Vec object from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model
2020-12-16 02:24:17,958 : INFO : loading wv recursively from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model.wv.* with mmap=None
2020-12-16 02:24:17,960 : INFO : loading vectors from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model.wv.vectors.npy with mmap=None
2020-12-16 02:24:17,986 : INFO : setting ignored attribute vectors_norm to None
2020-12-16 02:24:17,988 : INFO : loading vocabulary recursi

In [None]:
#[step 2]NonGroundTruth Computation
# Metrics requested for each evaluated (source, target) pair.
metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I,EntropyMetric.MI]
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]
# NOTE(review): sampling=True with samples=100 and no seed set anywhere visible in
# this notebook -- the sampled pairs presumably differ between runs; confirm and
# fix a seed if reproducibility matters.
word2vec.ComputeDistanceArtifacts( sampling=True, samples = 100, metric_list = metric_list )
# Preview the computed links (Source, Target and one column per metric in the recorded output).
word2vec.df_nonground_link.head()

2020-12-16 02:25:45,242 : INFO : Removed 60 and 1468 OOV words from document 1 and 2 (respectively).
2020-12-16 02:25:45,243 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:25:45,245 : INFO : built Dictionary(192 unique tokens: ['add', 'address', 'arc', 'attribut', 'author']...) from 2 documents (total 1459 corpus positions)
2020-12-16 02:25:45,471 : INFO : token count processed
2020-12-16 02:25:45,477 : INFO : frequencies processed
2020-12-16 02:25:46,095 : INFO : scalar_distribution processed
2020-12-16 02:25:46,096 : INFO : entropies processed
2020-12-16 02:25:46,097 : INFO : extropies processed
2020-12-16 02:25:46,100 : INFO : token count processed
2020-12-16 02:25:46,101 : INFO : alphabet_source #6957
2020-12-16 02:25:46,102 : INFO : alphabet_target #6957
2020-12-16 02:25:46,103 : INFO : vocab #6957
2020-12-16 02:25:46,105 : INFO : diff #set()
2020-12-16 02:25:47,354 : INFO : alphabet #6957
2020-12-16 02:25:47,974 : INFO : Computed distances or similar

2020-12-16 02:26:02,000 : INFO : token count processed
2020-12-16 02:26:02,002 : INFO : alphabet_source #6957
2020-12-16 02:26:02,003 : INFO : alphabet_target #6957
2020-12-16 02:26:02,004 : INFO : vocab #6957
2020-12-16 02:26:02,006 : INFO : diff #set()
2020-12-16 02:26:03,257 : INFO : alphabet #6957
2020-12-16 02:26:03,882 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ6.txt', 'test_data/LibEST_semeru_format/test/us1060.c')[[1.108496871070142, 0.4742715124317269], [0.8058314025402069, 0.1941686], [2.939884429263502, 1.3213548427323423], [7.625029353358799, 4.1011854841616815]]
2020-12-16 02:26:03,885 : INFO : Removed 19 and 664 OOV words from document 1 and 2 (respectively).
2020-12-16 02:26:03,886 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:26:03,888 : INFO : built Dictionary(146 unique tokens: ['base', 'connect', 'correspond', 'cover', 'data']...) from 2 documents (total 416 corpus positions)
2020-12-16 0

2020-12-16 02:26:20,186 : INFO : Removed 13 and 1349 OOV words from document 1 and 2 (respectively).
2020-12-16 02:26:20,187 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:26:20,190 : INFO : built Dictionary(138 unique tokens: ['access', 'afford', 'defin', 'est', 'full']...) from 2 documents (total 1282 corpus positions)
2020-12-16 02:26:20,239 : INFO : token count processed
2020-12-16 02:26:20,245 : INFO : frequencies processed
2020-12-16 02:26:20,876 : INFO : scalar_distribution processed
2020-12-16 02:26:20,877 : INFO : entropies processed
2020-12-16 02:26:20,880 : INFO : extropies processed
2020-12-16 02:26:20,883 : INFO : token count processed
2020-12-16 02:26:20,885 : INFO : alphabet_source #6957
2020-12-16 02:26:20,886 : INFO : alphabet_target #6957
2020-12-16 02:26:20,887 : INFO : vocab #6957
2020-12-16 02:26:20,889 : INFO : diff #set()
2020-12-16 02:26:22,172 : INFO : alphabet #6957
2020-12-16 02:26:22,808 : INFO : Computed distances or similariti

2020-12-16 02:26:36,961 : INFO : token count processed
2020-12-16 02:26:36,962 : INFO : alphabet_source #6957
2020-12-16 02:26:36,963 : INFO : alphabet_target #6957
2020-12-16 02:26:36,964 : INFO : vocab #6957
2020-12-16 02:26:36,966 : INFO : diff #set()
2020-12-16 02:26:38,223 : INFO : alphabet #6957
2020-12-16 02:26:38,844 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ14.txt', 'test_data/LibEST_semeru_format/test/us903.c')[[1.1013643084569777, 0.4758813100496104], [0.7730646878480911, 0.22693531], [4.005836087367967, 1.3705076892564731], [8.75622759695926, 5.937295746858414]]
2020-12-16 02:26:38,847 : INFO : Removed 27 and 664 OOV words from document 1 and 2 (respectively).
2020-12-16 02:26:38,848 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:26:38,850 : INFO : built Dictionary(137 unique tokens: ['accord', 'advis', 'allow', 'attribut', 'author']...) from 2 documents (total 411 corpus positions)
2020-12-16 0

2020-12-16 02:26:55,248 : INFO : Removed 30 and 664 OOV words from document 1 and 2 (respectively).
2020-12-16 02:26:55,249 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:26:55,250 : INFO : built Dictionary(136 unique tokens: ['author', 'band', 'base', 'client', 'est']...) from 2 documents (total 409 corpus positions)
2020-12-16 02:26:55,307 : INFO : token count processed
2020-12-16 02:26:55,320 : INFO : frequencies processed
2020-12-16 02:26:55,950 : INFO : scalar_distribution processed
2020-12-16 02:26:55,951 : INFO : entropies processed
2020-12-16 02:26:55,952 : INFO : extropies processed
2020-12-16 02:26:55,954 : INFO : token count processed
2020-12-16 02:26:55,955 : INFO : alphabet_source #6957
2020-12-16 02:26:55,957 : INFO : alphabet_target #6957
2020-12-16 02:26:55,957 : INFO : vocab #6957
2020-12-16 02:26:55,959 : INFO : diff #set()
2020-12-16 02:26:57,238 : INFO : alphabet #6957
2020-12-16 02:26:57,859 : INFO : Computed distances or similarities 

2020-12-16 02:27:12,327 : INFO : token count processed
2020-12-16 02:27:12,329 : INFO : alphabet_source #6957
2020-12-16 02:27:12,330 : INFO : alphabet_target #6957
2020-12-16 02:27:12,331 : INFO : vocab #6957
2020-12-16 02:27:12,332 : INFO : diff #set()
2020-12-16 02:27:13,590 : INFO : alphabet #6957
2020-12-16 02:27:14,318 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ32.txt', 'test_data/LibEST_semeru_format/test/us1864.c')[[1.0570176580637678, 0.4861406979565169], [0.723444014787674, 0.276556], [4.111698692943856, 1.390201917474914], [8.739508040451842, 5.897158532459386]]
2020-12-16 02:27:14,321 : INFO : Removed 59 and 577 OOV words from document 1 and 2 (respectively).
2020-12-16 02:27:14,322 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:27:14,323 : INFO : built Dictionary(152 unique tokens: ['anchor', 'author', 'avail', 'behind', 'browser']...) from 2 documents (total 650 corpus positions)
2020-12-16 02:

2020-12-16 02:27:30,277 : INFO : Removed 138 and 1468 OOV words from document 1 and 2 (respectively).
2020-12-16 02:27:30,278 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:27:30,281 : INFO : built Dictionary(196 unique tokens: ['132', 'access', 'address', 'attr', 'attribut']...) from 2 documents (total 1574 corpus positions)
2020-12-16 02:27:30,546 : INFO : token count processed
2020-12-16 02:27:30,556 : INFO : frequencies processed
2020-12-16 02:27:31,172 : INFO : scalar_distribution processed
2020-12-16 02:27:31,173 : INFO : entropies processed
2020-12-16 02:27:31,173 : INFO : extropies processed
2020-12-16 02:27:31,176 : INFO : token count processed
2020-12-16 02:27:31,178 : INFO : alphabet_source #6957
2020-12-16 02:27:31,179 : INFO : alphabet_target #6957
2020-12-16 02:27:31,180 : INFO : vocab #6957
2020-12-16 02:27:31,181 : INFO : diff #set()
2020-12-16 02:27:32,440 : INFO : alphabet #6957
2020-12-16 02:27:33,061 : INFO : Computed distances or simil

2020-12-16 02:27:47,040 : INFO : extropies processed
2020-12-16 02:27:47,042 : INFO : token count processed
2020-12-16 02:27:47,044 : INFO : alphabet_source #6957
2020-12-16 02:27:47,045 : INFO : alphabet_target #6957
2020-12-16 02:27:47,046 : INFO : vocab #6957
2020-12-16 02:27:47,047 : INFO : diff #set()
2020-12-16 02:27:48,302 : INFO : alphabet #6957
2020-12-16 02:27:48,922 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ23.txt', 'test_data/LibEST_semeru_format/test/us3496.c')[[1.150531849841959, 0.4650012507712868], [0.8678732961416245, 0.1321267], [2.972253928364927, 1.311764078962958], [7.444103259461646, 5.177130113905775]]
2020-12-16 02:27:48,925 : INFO : Removed 47 and 2029 OOV words from document 1 and 2 (respectively).
2020-12-16 02:27:48,926 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:27:48,929 : INFO : built Dictionary(190 unique tokens: ['altern', 'author', 'base', 'cbc', 'check']...) from 2 docu

2020-12-16 02:28:05,005 : INFO : Removed 59 and 2062 OOV words from document 1 and 2 (respectively).
2020-12-16 02:28:05,006 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:28:05,008 : INFO : built Dictionary(159 unique tokens: ['anchor', 'author', 'avail', 'behind', 'browser']...) from 2 documents (total 746 corpus positions)
2020-12-16 02:28:05,167 : INFO : token count processed
2020-12-16 02:28:05,173 : INFO : frequencies processed
2020-12-16 02:28:05,802 : INFO : scalar_distribution processed
2020-12-16 02:28:05,803 : INFO : entropies processed
2020-12-16 02:28:05,804 : INFO : extropies processed
2020-12-16 02:28:05,807 : INFO : token count processed
2020-12-16 02:28:05,808 : INFO : alphabet_source #6957
2020-12-16 02:28:05,809 : INFO : alphabet_target #6957
2020-12-16 02:28:05,810 : INFO : vocab #6957
2020-12-16 02:28:05,812 : INFO : diff #set()
2020-12-16 02:28:07,060 : INFO : alphabet #6957
2020-12-16 02:28:07,679 : INFO : Computed distances or simil

2020-12-16 02:28:22,012 : INFO : token count processed
2020-12-16 02:28:22,015 : INFO : alphabet_source #6957
2020-12-16 02:28:22,017 : INFO : alphabet_target #6957
2020-12-16 02:28:22,018 : INFO : vocab #6957
2020-12-16 02:28:22,020 : INFO : diff #set()
2020-12-16 02:28:23,270 : INFO : alphabet #6957
2020-12-16 02:28:23,889 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ14.txt', 'test_data/LibEST_semeru_format/test/us899.c')[[1.0634199184552562, 0.4846323286190979], [0.7152673006057739, 0.2847327], [4.101681489107106, 1.3817417127226523], [8.271135450796873, 5.953870679314438]]
2020-12-16 02:28:23,893 : INFO : Removed 33 and 1630 OOV words from document 1 and 2 (respectively).
2020-12-16 02:28:23,893 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:28:23,895 : INFO : built Dictionary(193 unique tokens: ['accept', 'access', 'act', 'author', 'avail']...) from 2 documents (total 891 corpus positions)
2020-12-16 02:2

2020-12-16 02:28:40,301 : INFO : Removed 145 and 1444 OOV words from document 1 and 2 (respectively).
2020-12-16 02:28:40,301 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:28:40,303 : INFO : built Dictionary(205 unique tokens: ['802', 'also', 'anchor', 'aspect', 'attribut']...) from 2 documents (total 1325 corpus positions)
2020-12-16 02:28:40,606 : INFO : token count processed
2020-12-16 02:28:40,616 : INFO : frequencies processed
2020-12-16 02:28:41,237 : INFO : scalar_distribution processed
2020-12-16 02:28:41,238 : INFO : entropies processed
2020-12-16 02:28:41,238 : INFO : extropies processed
2020-12-16 02:28:41,244 : INFO : token count processed
2020-12-16 02:28:41,246 : INFO : alphabet_source #6957
2020-12-16 02:28:41,248 : INFO : alphabet_target #6957
2020-12-16 02:28:41,249 : INFO : vocab #6957
2020-12-16 02:28:41,252 : INFO : diff #set()
2020-12-16 02:28:42,516 : INFO : alphabet #6957
2020-12-16 02:28:43,135 : INFO : Computed distances or simila

2020-12-16 02:28:57,518 : INFO : extropies processed
2020-12-16 02:28:57,523 : INFO : token count processed
2020-12-16 02:28:57,525 : INFO : alphabet_source #6957
2020-12-16 02:28:57,528 : INFO : alphabet_target #6957
2020-12-16 02:28:57,529 : INFO : vocab #6957
2020-12-16 02:28:57,530 : INFO : diff #set()
2020-12-16 02:28:58,780 : INFO : alphabet #6957
2020-12-16 02:28:59,424 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ49.txt', 'test_data/LibEST_semeru_format/test/us748.c')[[1.0330329954968793, 0.491875932272116], [0.7348989546298981, 0.26510105], [4.739485167910035, 1.406924120604358], [8.83056177220314, 6.325418002026327]]
2020-12-16 02:28:59,428 : INFO : Removed 19 and 1194 OOV words from document 1 and 2 (respectively).
2020-12-16 02:28:59,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:28:59,430 : INFO : built Dictionary(168 unique tokens: ['base', 'connect', 'correspond', 'cover', 'data']...) from 2

2020-12-16 02:29:15,800 : INFO : Removed 19 and 1630 OOV words from document 1 and 2 (respectively).
2020-12-16 02:29:15,801 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:29:15,802 : INFO : built Dictionary(185 unique tokens: ['base', 'connect', 'correspond', 'cover', 'data']...) from 2 documents (total 869 corpus positions)
2020-12-16 02:29:15,922 : INFO : token count processed
2020-12-16 02:29:15,928 : INFO : frequencies processed
2020-12-16 02:29:16,548 : INFO : scalar_distribution processed
2020-12-16 02:29:16,549 : INFO : entropies processed
2020-12-16 02:29:16,550 : INFO : extropies processed
2020-12-16 02:29:16,555 : INFO : token count processed
2020-12-16 02:29:16,557 : INFO : alphabet_source #6957
2020-12-16 02:29:16,560 : INFO : alphabet_target #6957
2020-12-16 02:29:16,561 : INFO : vocab #6957
2020-12-16 02:29:16,562 : INFO : diff #set()
2020-12-16 02:29:17,822 : INFO : alphabet #6957
2020-12-16 02:29:18,447 : INFO : Computed distances or simil

2020-12-16 02:29:32,488 : INFO : token count processed
2020-12-16 02:29:32,489 : INFO : alphabet_source #6957
2020-12-16 02:29:32,490 : INFO : alphabet_target #6957
2020-12-16 02:29:32,491 : INFO : vocab #6957
2020-12-16 02:29:32,493 : INFO : diff #set()
2020-12-16 02:29:33,749 : INFO : alphabet #6957
2020-12-16 02:29:34,376 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ57.txt', 'test_data/LibEST_semeru_format/test/us3612.c')[[1.0771117270418926, 0.481437751749707], [0.8207048326730728, 0.17929517], [4.003103797225431, 1.3871060048624124], [8.352713831612732, 7.141824602044302]]
2020-12-16 02:29:34,379 : INFO : Removed 27 and 1921 OOV words from document 1 and 2 (respectively).
2020-12-16 02:29:34,380 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:29:34,382 : INFO : built Dictionary(117 unique tokens: ['accord', 'advis', 'allow', 'attribut', 'author']...) from 2 documents (total 556 corpus positions)
2020-12-16

2020-12-16 02:29:50,132 : INFO : Removed 32 and 1161 OOV words from document 1 and 2 (respectively).
2020-12-16 02:29:50,133 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:29:50,135 : INFO : built Dictionary(147 unique tokens: ['accept', 'access', 'also', 'altern', 'author']...) from 2 documents (total 1019 corpus positions)
2020-12-16 02:29:50,260 : INFO : token count processed
2020-12-16 02:29:50,266 : INFO : frequencies processed
2020-12-16 02:29:50,888 : INFO : scalar_distribution processed
2020-12-16 02:29:50,889 : INFO : entropies processed
2020-12-16 02:29:50,890 : INFO : extropies processed
2020-12-16 02:29:50,893 : INFO : token count processed
2020-12-16 02:29:50,894 : INFO : alphabet_source #6957
2020-12-16 02:29:50,895 : INFO : alphabet_target #6957
2020-12-16 02:29:50,896 : INFO : vocab #6957
2020-12-16 02:29:50,898 : INFO : diff #set()
2020-12-16 02:29:52,147 : INFO : alphabet #6957
2020-12-16 02:29:52,772 : INFO : Computed distances or simila

2020-12-16 02:30:07,328 : INFO : token count processed
2020-12-16 02:30:07,329 : INFO : alphabet_source #6957
2020-12-16 02:30:07,331 : INFO : alphabet_target #6957
2020-12-16 02:30:07,331 : INFO : vocab #6957
2020-12-16 02:30:07,333 : INFO : diff #set()
2020-12-16 02:30:08,584 : INFO : alphabet #6957
2020-12-16 02:30:09,210 : INFO : Computed distances or similarities ('test_data/LibEST_semeru_format/requirements/RQ20.txt', 'test_data/LibEST_semeru_format/test/us900.c')[[1.1447020552638465, 0.46626523136192805], [0.9458929188549519, 0.05410708], [3.272812967681858, 1.3375600856964762], [7.2755988615781275, 5.3029932265755635]]
2020-12-16 02:30:09,213 : INFO : Removed 31 and 939 OOV words from document 1 and 2 (respectively).
2020-12-16 02:30:09,214 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 02:30:09,216 : INFO : built Dictionary(149 unique tokens: ['author', 'base', 'cbc', 'check', 'client']...) from 2 documents (total 718 corpus positions)
2020-12-16 02:

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us894.c,1.055692,0.486454,0.80341,0.19659,4.685972,1.410394,7.920854,6.15828
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us898.c,1.115621,0.472674,0.799947,0.200053,3.653757,1.376497,7.226829,4.598405
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.182004,0.458294,0.953313,0.046687,2.873141,1.32933,7.190003,4.580062
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.079827,0.480809,0.947533,0.052467,4.367133,1.395704,7.502431,5.776133
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us3496.c,1.050227,0.487751,0.791415,0.208585,3.520787,1.357829,7.457979,5.326814


In [None]:
# Preview the first rows of the computed non-ground-truth links.
word2vec.df_nonground_link.head()

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us894.c,1.055692,0.486454,0.80341,0.19659,4.685972,1.410394,7.920854,6.15828
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us898.c,1.115621,0.472674,0.799947,0.200053,3.653757,1.376497,7.226829,4.598405
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.182004,0.458294,0.953313,0.046687,2.873141,1.32933,7.190003,4.580062
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.079827,0.480809,0.947533,0.052467,4.367133,1.395704,7.502431,5.776133
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us3496.c,1.050227,0.487751,0.791415,0.208585,3.520787,1.357829,7.457979,5.326814


In [None]:
#[step 3]Saving Non-GroundTruth Links
# The log line emitted by this call shows the CSV name, which ends in a timestamp;
# that timestamp is what LoadLinks needs to read the file back.
word2vec.SaveLinks()

2020-12-16 02:30:30,466 : INFO : Saving in...../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-False-1608085830.45751].csv


In [None]:
#Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
# The hard-coded timestamp identifies one specific saved run; it must be updated
# after every new SaveLinks() call or this cell reloads the old file.
df_nonglinks = ds.mining.ir.LoadLinks(timestamp=1608085830.45751, params=parameters)
df_nonglinks.head()

2020-12-16 02:31:43,782 : INFO : Loading computed links from... ../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-False-1608085830.45751].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us894.c,1.055692,0.486454,0.80341,0.19659,4.685972,1.410394,7.920854,6.15828
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us898.c,1.115621,0.472674,0.799947,0.200053,3.653757,1.376497,7.226829,4.598405
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.182004,0.458294,0.953313,0.046687,2.873141,1.32933,7.190003,4.580062
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.079827,0.480809,0.947533,0.052467,4.367133,1.395704,7.502431,5.776133
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us3496.c,1.050227,0.487751,0.791415,0.208585,3.520787,1.357829,7.457979,5.326814


In [None]:
#[step 4]GroundTruthMatching Testing
#TODO change the path for a param
# NOTE(review): this literal is identical to the "path_mappings" entry returned by
# libest_params(); keep the two in sync until the TODO above is resolved.
path_to_ground_truth = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
# Display the matched links; in the recorded output this frame carries an extra 'Linked?' column.
word2vec.df_ground_link

2020-12-16 02:32:12,562 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,566 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,570 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,574 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,578 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,582 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,585 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,587 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,590 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,593 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,595 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,598 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,600 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,602 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,605 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,607 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:12,610 

2020-12-16 02:32:13,217 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,219 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,222 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,224 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,226 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,229 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,231 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,233 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,236 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,238 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,240 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,243 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,245 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,248 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,250 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,253 : INFO : findDistInDF: semeru_format
2020-12-16 02:32:13,255 

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us894.c,1.055692,0.486454,0.803410,0.196590,4.685972,1.410394,7.920854,6.158280,0.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us898.c,1.115621,0.472674,0.799947,0.200053,3.653757,1.376497,7.226829,4.598405,0.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.182004,0.458294,0.953313,0.046687,2.873141,1.329330,7.190003,4.580062,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.079827,0.480809,0.947533,0.052467,4.367133,1.395704,7.502431,5.776133,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us3496.c,1.050227,0.487751,0.791415,0.208585,3.520787,1.357829,7.457979,5.326814,1.0
...,...,...,...,...,...,...,...,...,...,...,...
95,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1864.c,0.956124,0.511215,0.688852,0.311148,4.401750,1.392002,8.618406,5.614617,1.0
96,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us748.c,1.033817,0.491686,0.792230,0.207770,4.682616,1.405861,8.772834,6.053077,0.0
97,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.144702,0.466265,0.945893,0.054107,3.272813,1.337560,7.275599,5.302993,1.0
98,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,1.021799,0.494609,0.795305,0.204695,4.068451,1.389074,7.686388,5.187485,0.0


In [None]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings
# Alternative to step 4 for the CISCO parameter sets; no path is passed here.
# NOTE(review): presumably the mappings file comes from parameters["path_mappings"] -- confirm.
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

In [None]:
#[optional]GroundTruth Direct Processing
# Requires `path_to_ground_truth` from the step 4 cell.
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
# Spot-check one entry; the recorded output is a (requirement file, test file) pair.
ground_links[141] # A tuple

('RQ33.txt', 'us894.c')

In [None]:
#[step 5]Saving GroundTruth Links
# As in step 3, the logged file name ends in a timestamp; it is needed to reload
# the file with LoadLinks(..., grtruth = True).
word2vec.SaveLinks(grtruth = True)

2020-12-16 02:32:42,445 : INFO : Saving in...../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-True-1608085962.438394].csv


In [None]:
#Loading GroundTruth Links (change the timestamp with the assigned in the previous step)
# grtruth = True selects the ground-truth file; the recorded output includes the 'Linked?' column.
df_glinks = ds.mining.ir.LoadLinks(timestamp=1608085962.438394, params=parameters,grtruth = True)
df_glinks.head()

2020-12-16 02:32:58,325 : INFO : Loading computed links from... ../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-True-1608085962.438394].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us894.c,1.055692,0.486454,0.80341,0.19659,4.685972,1.410394,7.920854,6.15828,0.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us898.c,1.115621,0.472674,0.799947,0.200053,3.653757,1.376497,7.226829,4.598405,0.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.182004,0.458294,0.953313,0.046687,2.873141,1.32933,7.190003,4.580062,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us900.c,1.079827,0.480809,0.947533,0.052467,4.367133,1.395704,7.502431,5.776133,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us3496.c,1.050227,0.487751,0.791415,0.208585,3.520787,1.357829,7.457979,5.326814,1.0
