In [1]:
# default_exp experiment.mining.ir.unsupervised.w2v

# Experimenting Neural Unsupervised Approaches for Software Information Retrieval [w2v]

> Just Paper. Full Experimentation. This module is dedicated to experimenting with word2vec. Consider copying the entire notebook for a new and separate empirical evaluation. 
> Implementing mutual information analysis
> Author: @danaderp April 2020
> Author: @danielrc Nov 2020

This copy is for Cisco purposes. It was adapted to process private GitHub data from Cisco. 

In [2]:
from ds4se.mining.ir import *

In [3]:
from prg import prg

In [4]:
import ds4se as ds

In [5]:
import numpy as np

In [6]:
import logging
# Root logger: timestamped records, threshold set to ERROR.
# NOTE(review): the saved cell outputs in this notebook contain INFO records, so they were
# presumably captured with level=logging.INFO; with ERROR those lines will not be reproduced.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Artifacts Similarity with BasicSequenceVectorization

We test different similarities based on [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

## Experiments Set-up

In [7]:
# Root of the dataset tree (relative path). The trailing slash matters: the model and
# metrics paths in the following cell are built from it by plain string concatenation.
path_data = '../dvc-ds4se/' #dataset path

In [16]:
#Experiments 1.1.2 <<-- word2vec
# Both paths are derived from path_data, which already ends with '/'; sub-paths are therefore
# appended without a leading slash (a leading one produced '../dvc-ds4se//models/...').
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
path_to_trained_model = path_data+'models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model'
def sacp_params():
    """Return the parameter dictionary for the 'sacp-python-common' word2vec experiment.

    A fresh dictionary is built on every call. It reads the module-level names
    `path_to_trained_model`, `path_data` and `path_model_prefix`, plus the
    `VectorizationType`, `LinkType`, `SoftwareArtifacts` and `Preprocessing`
    enumerations (presumably provided by the `ds4se.mining.ir` star import).

    Returns:
        dict: experiment settings; the key order is the same as in the
        displayed `parameters` output.
    """
    settings = dict(
        vectorizationType=VectorizationType.word2vec,
        linkType=LinkType.issue2src,
        system='sacp-python-common',
        path_to_trained_model=path_to_trained_model,
        # Artifact kinds are stored as the enum values, not the enum members.
        source_type=SoftwareArtifacts.PR.value,
        target_type=SoftwareArtifacts.PY.value,
        # Corpus file description: location, separator, column names, preprocessing.
        system_path_config=dict(
            system_path='/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
            sep='~',
            names=['ids','bpe32k'],
            prep=Preprocessing.bpe,
        ),
        path_mappings="/tf/data/cisco/sacp_data/sacp-pr-mappings.csv",
        saving_path=path_data + 'metrics/traceability/experiments1.1.x/',
        names=['Source','Target','Linked?'],
        model_prefix=path_model_prefix,
    )
    return settings

In [17]:
# Build the configuration once; later cells read it through `parameters`.
parameters = sacp_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.issue2src: 3>,
 'system': 'sacp-python-common',
 'path_to_trained_model': '../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model',
 'source_type': 'pr',
 'target_type': 'py',
 'system_path_config': {'system_path': '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
  'sep': '~',
  'names': ['ids', 'bpe32k'],
  'prep': <Preprocessing.bpe: 2>},
 'path_mappings': '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
 'saving_path': '../dvc-ds4se/metrics/traceability/experiments1.1.x/',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_32k'}

# Artifacts Similarity with Word2Vec

In [12]:
#[step 1]Creating the Vectorization Class
# The `logging` module itself is passed as the logger object.
word2vec = ds.mining.ir.Word2VecSeqVect( params = parameters, logging = logging )

2021-02-23 02:16:28,048 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-23 02:16:28,113 : INFO : built Dictionary(3010 unique tokens: ['28', '29', '3', '4)', '7']...) from 362 documents (total 171602 corpus positions)
2021-02-23 02:16:28,304 : INFO : Ignored vocab by BPE{'γ', '\r\n\r\n\r\n', '^', '\\', '\r\n\r\n', '\t', '@', '\r\n\r\n@', '\r\n', '```', '`'}
2021-02-23 02:16:28,305 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus
2021-02-23 02:16:28,306 : INFO : loading Word2Vec object from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model
2021-02-23 02:16:28,851 : INFO : loading wv recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model.wv.* with mmap=None
2021-02-23 02:16:28,852 : INFO : setting ignored attribute vectors_norm to None
2021-02-23 02:16:28,854 : INFO : loading vocabulary recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py

In [None]:
#[step 2]NonGroundTruth Computation
# Metrics requested for the run: two distance metrics (WMD, SCM) and two entropy metrics.
metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I,EntropyMetric.MI]
# Alternative kept for reference: entropy metrics only.
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]
# NOTE(review): `samples = 100` is presumably ignored while `sampling=False` -- confirm
# against ComputeDistanceArtifacts.
word2vec.ComputeDistanceArtifacts( sampling=False, samples = 100, metric_list = metric_list )
# Preview of the computed links.
word2vec.df_nonground_link.head()

In [None]:
# Preview the first rows of the non-ground-truth links.
word2vec.df_nonground_link.head()

In [None]:
# Show the complete 'Target' value of the row labelled 1 (the table preview truncates it).
word2vec.df_nonground_link['Target'][1]

In [None]:
#[step 3]Saving Non-GroundTruth Links
# The written file name carries a timestamp; that value is what LoadLinks expects as `timestamp`.
word2vec.SaveLinks()

In [10]:
#Loading Non-GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_nonglinks = ds.mining.ir.LoadLinks(timestamp=1614004624.212459, params=parameters, logging=logging)
df_nonglinks.head()

2021-02-23 02:16:19,184 : INFO : Loading computed links from... ../dvc-ds4se/metrics/traceability/experiments1.1.x/[sacp-python-common-VectorizationType.word2vec-LinkType.issue2src-False-1614004624.212459].csv
2021-02-23 02:16:19,251 : INFO : df_x.dtypesSource                        object
Target                        object
DistanceMetric.WMD           float64
SimilarityMetric.WMD_sim     float64
DistanceMetric.SCM           float64
SimilarityMetric.SCM_sim     float64
EntropyMetric.MSI_I          float64
EntropyMetric.MSI_X          float64
EntropyMetric.Entropy_src    float64
EntropyMetric.Entropy_tgt    float64
EntropyMetric.JI             float64
EntropyMetric.MI             float64
EntropyMetric.Loss           float64
EntropyMetric.Noise          float64
dtype: object


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.251702,0.444109,0.940516,0.059484,2.251629,1.266756,4.518397,6.905617,7.074863,4.349151,2.556466,0.169246
1,295,sacp-python-common/sacp_python_common/bandit/b...,1.260697,0.442341,0.948267,0.051733,2.251629,1.266756,4.518397,7.121928,7.36282,4.277506,2.844423,0.240891
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.253958,0.443664,0.944901,0.055099,2.5,1.296814,4.518397,6.41099,6.507248,4.422139,1.988851,0.096258
3,295,sacp-python-common/sacp_python_common/cave/cav...,1.224535,0.449532,0.93303,0.06697,1.921928,1.214807,4.518397,6.077867,6.289934,4.306329,1.771537,0.212068
4,295,sacp-python-common/sacp_python_common/cave/cav...,1.225771,0.449282,0.922901,0.077099,2.0,1.245112,4.518397,5.977547,6.207226,4.288719,1.688829,0.229678


In [13]:
# Injects the links loaded from file into the vectorizer, replacing whatever
# `df_nonground_link` held before.
word2vec.df_nonground_link = df_nonglinks # Only to load links from file

In [None]:
#[step 4]GroundTruthMatching Testing
#TODO change the path for a param
# The ground-truth file location comes from the 'path_mappings' entry of the configuration.
path_to_ground_truth =  parameters['path_mappings']
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
# Display the result of the matching.
word2vec.df_ground_link

In [20]:
# Keep only the rows whose 'Linked?' column equals 1.
word2vec.df_ground_link[word2vec.df_ground_link ['Linked?']==1]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise,Linked?
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.253958,0.443664,0.944901,0.055099,2.500000,1.296814,4.518397,6.410990,6.507248,4.422139,1.988851,0.096258,1.0
7,295,sacp-python-common/sacp_python_common/csbcicd_...,1.225169,0.449404,0.927214,0.072786,2.845351,1.321020,4.518397,6.907906,7.023516,4.402787,2.505119,0.115610,1.0
13,295,sacp-python-common/sacp_python_common/gosec/go...,1.252137,0.444023,0.936625,0.063375,2.725481,1.319220,4.518397,6.957797,7.116996,4.359198,2.598599,0.159199,1.0
17,295,sacp-python-common/sacp_python_common/psb_mapp...,1.255990,0.443264,0.942938,0.057062,2.321928,1.287712,4.518397,6.560342,6.768631,4.310108,2.250234,0.208289,1.0
19,295,sacp-python-common/sacp_python_common/security...,1.214594,0.451550,0.924579,0.075421,3.121928,1.351965,4.518397,7.009230,7.126023,4.401604,2.607626,0.116793,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18141,43,sacp-python-common/sacp_python_common/fireExce...,1.229768,0.448477,0.902336,0.097664,0.000000,0.000000,3.511085,5.176619,5.570019,3.117685,2.058933,0.393400,1.0
18215,44,sacp-python-common/sacp_python_common/fireExce...,1.265874,0.441331,0.941118,0.058882,0.000000,0.000000,3.180833,5.176619,5.498465,2.858987,2.317632,0.321846,1.0
18289,42,sacp-python-common/sacp_python_common/fireExce...,1.194914,0.455599,0.866706,0.133294,1.000000,1.000000,4.436605,5.176619,5.749707,3.863517,1.313102,0.573089,1.0
18955,33,sacp-python-common/sacp_python_common/fireExce...,1.200532,0.454436,0.848926,0.151074,1.000000,1.000000,2.807355,5.176619,5.377367,2.606607,2.570012,0.200748,1.0


In [None]:
# Show the 'Source' value of the row labelled 0.
word2vec.df_ground_link['Source'][0]

## 4.1 Only SACP

In [14]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings
# NOTE: at INFO level this call logs one 'processing from mappings SACP' line per processed
# item, which floods the cell output; clear the output (or raise the log level) before sharing.
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

2021-02-23 02:16:58,396 : INFO : ground truth from mappings
2021-02-23 02:16:58,397 : INFO : processing from mappings SACP
2021-02-23 02:16:58,405 : INFO : processing from mappings SACP
2021-02-23 02:16:58,413 : INFO : processing from mappings SACP
2021-02-23 02:16:58,420 : INFO : processing from mappings SACP
2021-02-23 02:16:58,427 : INFO : processing from mappings SACP
2021-02-23 02:16:58,433 : INFO : processing from mappings SACP
2021-02-23 02:16:58,440 : INFO : processing from mappings SACP
2021-02-23 02:16:58,448 : INFO : processing from mappings SACP
2021-02-23 02:16:58,461 : INFO : processing from mappings SACP
2021-02-23 02:16:58,469 : INFO : processing from mappings SACP
2021-02-23 02:16:58,475 : INFO : processing from mappings SACP
2021-02-23 02:16:58,482 : INFO : processing from mappings SACP
2021-02-23 02:16:58,490 : INFO : processing from mappings SACP
2021-02-23 02:16:58,497 : INFO : processing from mappings SACP
2021-02-23 02:16:58,504 : INFO : processing from mappings 

2021-02-23 02:16:59,345 : INFO : processing from mappings SACP
2021-02-23 02:16:59,352 : INFO : processing from mappings SACP
2021-02-23 02:16:59,359 : INFO : processing from mappings SACP
2021-02-23 02:16:59,366 : INFO : processing from mappings SACP
2021-02-23 02:16:59,373 : INFO : processing from mappings SACP
2021-02-23 02:16:59,380 : INFO : processing from mappings SACP
2021-02-23 02:16:59,387 : INFO : processing from mappings SACP
2021-02-23 02:16:59,394 : INFO : processing from mappings SACP
2021-02-23 02:16:59,401 : INFO : processing from mappings SACP
2021-02-23 02:16:59,408 : INFO : processing from mappings SACP
2021-02-23 02:16:59,414 : INFO : processing from mappings SACP
2021-02-23 02:16:59,421 : INFO : processing from mappings SACP
2021-02-23 02:16:59,429 : INFO : processing from mappings SACP
2021-02-23 02:16:59,436 : INFO : processing from mappings SACP
2021-02-23 02:16:59,443 : INFO : processing from mappings SACP
2021-02-23 02:16:59,450 : INFO : processing from mappin

2021-02-23 02:17:00,287 : INFO : processing from mappings SACP
2021-02-23 02:17:00,294 : INFO : processing from mappings SACP
2021-02-23 02:17:00,302 : INFO : processing from mappings SACP
2021-02-23 02:17:00,309 : INFO : processing from mappings SACP
2021-02-23 02:17:00,317 : INFO : processing from mappings SACP
2021-02-23 02:17:00,324 : INFO : processing from mappings SACP
2021-02-23 02:17:00,332 : INFO : processing from mappings SACP
2021-02-23 02:17:00,339 : INFO : processing from mappings SACP
2021-02-23 02:17:00,347 : INFO : processing from mappings SACP
2021-02-23 02:17:00,354 : INFO : processing from mappings SACP
2021-02-23 02:17:00,360 : INFO : processing from mappings SACP
2021-02-23 02:17:00,368 : INFO : processing from mappings SACP
2021-02-23 02:17:00,375 : INFO : processing from mappings SACP
2021-02-23 02:17:00,383 : INFO : processing from mappings SACP
2021-02-23 02:17:00,390 : INFO : processing from mappings SACP
2021-02-23 02:17:00,397 : INFO : processing from mappin

2021-02-23 02:17:01,238 : INFO : processing from mappings SACP
2021-02-23 02:17:01,246 : INFO : processing from mappings SACP
2021-02-23 02:17:01,253 : INFO : processing from mappings SACP
2021-02-23 02:17:01,260 : INFO : processing from mappings SACP
2021-02-23 02:17:01,267 : INFO : processing from mappings SACP
2021-02-23 02:17:01,274 : INFO : processing from mappings SACP
2021-02-23 02:17:01,281 : INFO : processing from mappings SACP
2021-02-23 02:17:01,288 : INFO : processing from mappings SACP
2021-02-23 02:17:01,295 : INFO : processing from mappings SACP
2021-02-23 02:17:01,302 : INFO : processing from mappings SACP
2021-02-23 02:17:01,309 : INFO : processing from mappings SACP
2021-02-23 02:17:01,316 : INFO : processing from mappings SACP
2021-02-23 02:17:01,323 : INFO : processing from mappings SACP
2021-02-23 02:17:01,330 : INFO : processing from mappings SACP
2021-02-23 02:17:01,337 : INFO : processing from mappings SACP
2021-02-23 02:17:01,343 : INFO : processing from mappin

2021-02-23 02:17:02,179 : INFO : processing from mappings SACP
2021-02-23 02:17:02,186 : INFO : processing from mappings SACP
2021-02-23 02:17:02,193 : INFO : processing from mappings SACP
2021-02-23 02:17:02,200 : INFO : processing from mappings SACP
2021-02-23 02:17:02,207 : INFO : processing from mappings SACP
2021-02-23 02:17:02,214 : INFO : processing from mappings SACP
2021-02-23 02:17:02,221 : INFO : processing from mappings SACP
2021-02-23 02:17:02,228 : INFO : processing from mappings SACP
2021-02-23 02:17:02,235 : INFO : processing from mappings SACP
2021-02-23 02:17:02,242 : INFO : processing from mappings SACP
2021-02-23 02:17:02,249 : INFO : processing from mappings SACP
2021-02-23 02:17:02,256 : INFO : processing from mappings SACP
2021-02-23 02:17:02,263 : INFO : processing from mappings SACP
2021-02-23 02:17:02,270 : INFO : processing from mappings SACP
2021-02-23 02:17:02,277 : INFO : processing from mappings SACP
2021-02-23 02:17:02,284 : INFO : processing from mappin

2021-02-23 02:17:03,112 : INFO : processing from mappings SACP
2021-02-23 02:17:03,118 : INFO : processing from mappings SACP
2021-02-23 02:17:03,125 : INFO : processing from mappings SACP
2021-02-23 02:17:03,132 : INFO : processing from mappings SACP
2021-02-23 02:17:03,138 : INFO : processing from mappings SACP
2021-02-23 02:17:03,145 : INFO : processing from mappings SACP
2021-02-23 02:17:03,152 : INFO : processing from mappings SACP
2021-02-23 02:17:03,159 : INFO : processing from mappings SACP
2021-02-23 02:17:03,166 : INFO : processing from mappings SACP
2021-02-23 02:17:03,173 : INFO : processing from mappings SACP
2021-02-23 02:17:03,180 : INFO : processing from mappings SACP
2021-02-23 02:17:03,193 : INFO : processing from mappings SACP
2021-02-23 02:17:03,199 : INFO : processing from mappings SACP
2021-02-23 02:17:03,206 : INFO : processing from mappings SACP
2021-02-23 02:17:03,213 : INFO : processing from mappings SACP
2021-02-23 02:17:03,220 : INFO : processing from mappin

2021-02-23 02:17:04,073 : INFO : processing from mappings SACP
2021-02-23 02:17:04,081 : INFO : processing from mappings SACP
2021-02-23 02:17:04,093 : INFO : processing from mappings SACP
2021-02-23 02:17:04,101 : INFO : processing from mappings SACP
2021-02-23 02:17:04,108 : INFO : processing from mappings SACP
2021-02-23 02:17:04,115 : INFO : processing from mappings SACP
2021-02-23 02:17:04,121 : INFO : processing from mappings SACP
2021-02-23 02:17:04,128 : INFO : processing from mappings SACP
2021-02-23 02:17:04,134 : INFO : processing from mappings SACP
2021-02-23 02:17:04,140 : INFO : processing from mappings SACP
2021-02-23 02:17:04,152 : INFO : processing from mappings SACP
2021-02-23 02:17:04,161 : INFO : processing from mappings SACP
2021-02-23 02:17:04,169 : INFO : processing from mappings SACP
2021-02-23 02:17:04,176 : INFO : processing from mappings SACP
2021-02-23 02:17:04,182 : INFO : processing from mappings SACP
2021-02-23 02:17:04,189 : INFO : processing from mappin

2021-02-23 02:17:05,009 : INFO : processing from mappings SACP
2021-02-23 02:17:05,016 : INFO : processing from mappings SACP
2021-02-23 02:17:05,022 : INFO : processing from mappings SACP
2021-02-23 02:17:05,029 : INFO : processing from mappings SACP
2021-02-23 02:17:05,036 : INFO : processing from mappings SACP
2021-02-23 02:17:05,043 : INFO : processing from mappings SACP
2021-02-23 02:17:05,050 : INFO : processing from mappings SACP
2021-02-23 02:17:05,056 : INFO : processing from mappings SACP
2021-02-23 02:17:05,063 : INFO : processing from mappings SACP
2021-02-23 02:17:05,071 : INFO : processing from mappings SACP
2021-02-23 02:17:05,090 : INFO : processing from mappings SACP
2021-02-23 02:17:05,097 : INFO : processing from mappings SACP
2021-02-23 02:17:05,107 : INFO : processing from mappings SACP
2021-02-23 02:17:05,114 : INFO : processing from mappings SACP
2021-02-23 02:17:05,121 : INFO : processing from mappings SACP
2021-02-23 02:17:05,128 : INFO : processing from mappin

2021-02-23 02:17:05,955 : INFO : processing from mappings SACP
2021-02-23 02:17:05,961 : INFO : processing from mappings SACP
2021-02-23 02:17:05,969 : INFO : processing from mappings SACP
2021-02-23 02:17:05,976 : INFO : processing from mappings SACP
2021-02-23 02:17:05,983 : INFO : processing from mappings SACP
2021-02-23 02:17:05,990 : INFO : processing from mappings SACP
2021-02-23 02:17:05,997 : INFO : processing from mappings SACP
2021-02-23 02:17:06,004 : INFO : processing from mappings SACP
2021-02-23 02:17:06,012 : INFO : processing from mappings SACP
2021-02-23 02:17:06,018 : INFO : processing from mappings SACP
2021-02-23 02:17:06,025 : INFO : processing from mappings SACP
2021-02-23 02:17:06,032 : INFO : processing from mappings SACP
2021-02-23 02:17:06,039 : INFO : processing from mappings SACP
2021-02-23 02:17:06,047 : INFO : processing from mappings SACP
2021-02-23 02:17:06,054 : INFO : processing from mappings SACP
2021-02-23 02:17:06,061 : INFO : processing from mappin

2021-02-23 02:17:06,916 : INFO : processing from mappings SACP
2021-02-23 02:17:06,923 : INFO : processing from mappings SACP
2021-02-23 02:17:06,931 : INFO : processing from mappings SACP
2021-02-23 02:17:06,938 : INFO : processing from mappings SACP
2021-02-23 02:17:06,945 : INFO : processing from mappings SACP
2021-02-23 02:17:06,952 : INFO : processing from mappings SACP
2021-02-23 02:17:06,960 : INFO : processing from mappings SACP
2021-02-23 02:17:06,967 : INFO : processing from mappings SACP
2021-02-23 02:17:06,980 : INFO : processing from mappings SACP
2021-02-23 02:17:06,992 : INFO : processing from mappings SACP
2021-02-23 02:17:06,999 : INFO : processing from mappings SACP
2021-02-23 02:17:07,006 : INFO : processing from mappings SACP
2021-02-23 02:17:07,012 : INFO : processing from mappings SACP
2021-02-23 02:17:07,019 : INFO : processing from mappings SACP
2021-02-23 02:17:07,026 : INFO : processing from mappings SACP
2021-02-23 02:17:07,033 : INFO : processing from mappin

2021-02-23 02:17:07,855 : INFO : processing from mappings SACP
2021-02-23 02:17:07,862 : INFO : processing from mappings SACP
2021-02-23 02:17:07,870 : INFO : processing from mappings SACP
2021-02-23 02:17:07,877 : INFO : processing from mappings SACP
2021-02-23 02:17:07,885 : INFO : processing from mappings SACP
2021-02-23 02:17:07,892 : INFO : processing from mappings SACP
2021-02-23 02:17:07,899 : INFO : processing from mappings SACP
2021-02-23 02:17:07,907 : INFO : processing from mappings SACP
2021-02-23 02:17:07,914 : INFO : processing from mappings SACP
2021-02-23 02:17:07,921 : INFO : processing from mappings SACP
2021-02-23 02:17:07,928 : INFO : processing from mappings SACP
2021-02-23 02:17:07,934 : INFO : processing from mappings SACP
2021-02-23 02:17:07,941 : INFO : processing from mappings SACP
2021-02-23 02:17:07,947 : INFO : processing from mappings SACP
2021-02-23 02:17:07,955 : INFO : processing from mappings SACP
2021-02-23 02:17:07,961 : INFO : processing from mappin

2021-02-23 02:17:08,789 : INFO : processing from mappings SACP
2021-02-23 02:17:08,796 : INFO : processing from mappings SACP
2021-02-23 02:17:08,803 : INFO : processing from mappings SACP
2021-02-23 02:17:08,810 : INFO : processing from mappings SACP
2021-02-23 02:17:08,816 : INFO : processing from mappings SACP
2021-02-23 02:17:08,827 : INFO : processing from mappings SACP
2021-02-23 02:17:08,836 : INFO : processing from mappings SACP
2021-02-23 02:17:08,844 : INFO : processing from mappings SACP
2021-02-23 02:17:08,852 : INFO : processing from mappings SACP
2021-02-23 02:17:08,865 : INFO : processing from mappings SACP
2021-02-23 02:17:08,873 : INFO : processing from mappings SACP
2021-02-23 02:17:08,880 : INFO : processing from mappings SACP
2021-02-23 02:17:08,887 : INFO : processing from mappings SACP
2021-02-23 02:17:08,894 : INFO : processing from mappings SACP
2021-02-23 02:17:08,902 : INFO : processing from mappings SACP
2021-02-23 02:17:08,909 : INFO : processing from mappin

2021-02-23 02:17:09,749 : INFO : processing from mappings SACP
2021-02-23 02:17:09,757 : INFO : processing from mappings SACP
2021-02-23 02:17:09,764 : INFO : processing from mappings SACP
2021-02-23 02:17:09,771 : INFO : processing from mappings SACP
2021-02-23 02:17:09,778 : INFO : processing from mappings SACP
2021-02-23 02:17:09,785 : INFO : processing from mappings SACP
2021-02-23 02:17:09,792 : INFO : processing from mappings SACP
2021-02-23 02:17:09,800 : INFO : processing from mappings SACP
2021-02-23 02:17:09,807 : INFO : processing from mappings SACP
2021-02-23 02:17:09,814 : INFO : processing from mappings SACP
2021-02-23 02:17:09,820 : INFO : processing from mappings SACP
2021-02-23 02:17:09,827 : INFO : processing from mappings SACP
2021-02-23 02:17:09,834 : INFO : processing from mappings SACP
2021-02-23 02:17:09,841 : INFO : processing from mappings SACP
2021-02-23 02:17:09,848 : INFO : processing from mappings SACP
2021-02-23 02:17:09,855 : INFO : processing from mappin

2021-02-23 02:17:10,692 : INFO : processing from mappings SACP
2021-02-23 02:17:10,698 : INFO : processing from mappings SACP
2021-02-23 02:17:10,706 : INFO : processing from mappings SACP
2021-02-23 02:17:10,713 : INFO : processing from mappings SACP
2021-02-23 02:17:10,720 : INFO : processing from mappings SACP
2021-02-23 02:17:10,726 : INFO : processing from mappings SACP
2021-02-23 02:17:10,733 : INFO : processing from mappings SACP
2021-02-23 02:17:10,740 : INFO : processing from mappings SACP
2021-02-23 02:17:10,747 : INFO : processing from mappings SACP
2021-02-23 02:17:10,754 : INFO : processing from mappings SACP
2021-02-23 02:17:10,762 : INFO : processing from mappings SACP
2021-02-23 02:17:10,769 : INFO : processing from mappings SACP
2021-02-23 02:17:10,776 : INFO : processing from mappings SACP
2021-02-23 02:17:10,784 : INFO : processing from mappings SACP
2021-02-23 02:17:10,797 : INFO : processing from mappings SACP
2021-02-23 02:17:10,805 : INFO : processing from mappin

2021-02-23 02:17:11,625 : INFO : processing from mappings SACP
2021-02-23 02:17:11,632 : INFO : processing from mappings SACP
2021-02-23 02:17:11,639 : INFO : processing from mappings SACP
2021-02-23 02:17:11,645 : INFO : processing from mappings SACP
2021-02-23 02:17:11,652 : INFO : processing from mappings SACP
2021-02-23 02:17:11,659 : INFO : processing from mappings SACP
2021-02-23 02:17:11,666 : INFO : processing from mappings SACP
2021-02-23 02:17:11,673 : INFO : processing from mappings SACP
2021-02-23 02:17:11,681 : INFO : processing from mappings SACP
2021-02-23 02:17:11,688 : INFO : processing from mappings SACP
2021-02-23 02:17:11,695 : INFO : processing from mappings SACP
2021-02-23 02:17:11,702 : INFO : processing from mappings SACP
2021-02-23 02:17:11,709 : INFO : processing from mappings SACP
2021-02-23 02:17:11,716 : INFO : processing from mappings SACP
2021-02-23 02:17:11,724 : INFO : processing from mappings SACP
2021-02-23 02:17:11,731 : INFO : processing from mappin

2021-02-23 02:17:12,554 : INFO : processing from mappings SACP
2021-02-23 02:17:12,562 : INFO : processing from mappings SACP
2021-02-23 02:17:12,569 : INFO : processing from mappings SACP
2021-02-23 02:17:12,576 : INFO : processing from mappings SACP
2021-02-23 02:17:12,584 : INFO : processing from mappings SACP
2021-02-23 02:17:12,591 : INFO : processing from mappings SACP
2021-02-23 02:17:12,598 : INFO : processing from mappings SACP
2021-02-23 02:17:12,605 : INFO : processing from mappings SACP
2021-02-23 02:17:12,613 : INFO : processing from mappings SACP
2021-02-23 02:17:12,620 : INFO : processing from mappings SACP
2021-02-23 02:17:12,627 : INFO : processing from mappings SACP
2021-02-23 02:17:12,634 : INFO : processing from mappings SACP
2021-02-23 02:17:12,641 : INFO : processing from mappings SACP
2021-02-23 02:17:12,648 : INFO : processing from mappings SACP
2021-02-23 02:17:12,655 : INFO : processing from mappings SACP
2021-02-23 02:17:12,661 : INFO : processing from mappin

2021-02-23 02:17:13,534 : INFO : processing from mappings SACP
2021-02-23 02:17:13,541 : INFO : processing from mappings SACP
2021-02-23 02:17:13,548 : INFO : processing from mappings SACP
2021-02-23 02:17:13,556 : INFO : processing from mappings SACP
2021-02-23 02:17:13,563 : INFO : processing from mappings SACP
2021-02-23 02:17:13,570 : INFO : processing from mappings SACP
2021-02-23 02:17:13,577 : INFO : processing from mappings SACP
2021-02-23 02:17:13,584 : INFO : processing from mappings SACP
2021-02-23 02:17:13,591 : INFO : processing from mappings SACP
2021-02-23 02:17:13,598 : INFO : processing from mappings SACP
2021-02-23 02:17:13,605 : INFO : processing from mappings SACP
2021-02-23 02:17:13,613 : INFO : processing from mappings SACP
2021-02-23 02:17:13,620 : INFO : processing from mappings SACP
2021-02-23 02:17:13,627 : INFO : processing from mappings SACP
2021-02-23 02:17:13,634 : INFO : processing from mappings SACP
2021-02-23 02:17:13,642 : INFO : processing from mappin

2021-02-23 02:17:14,493 : INFO : processing from mappings SACP
2021-02-23 02:17:14,500 : INFO : processing from mappings SACP
2021-02-23 02:17:14,508 : INFO : processing from mappings SACP
2021-02-23 02:17:14,515 : INFO : processing from mappings SACP
2021-02-23 02:17:14,522 : INFO : processing from mappings SACP
2021-02-23 02:17:14,530 : INFO : processing from mappings SACP
2021-02-23 02:17:14,537 : INFO : processing from mappings SACP
2021-02-23 02:17:14,544 : INFO : processing from mappings SACP
2021-02-23 02:17:14,551 : INFO : processing from mappings SACP
2021-02-23 02:17:14,559 : INFO : processing from mappings SACP
2021-02-23 02:17:14,566 : INFO : processing from mappings SACP
2021-02-23 02:17:14,573 : INFO : processing from mappings SACP
2021-02-23 02:17:14,581 : INFO : processing from mappings SACP
2021-02-23 02:17:14,588 : INFO : processing from mappings SACP
2021-02-23 02:17:14,595 : INFO : processing from mappings SACP
2021-02-23 02:17:14,602 : INFO : processing from mappin

2021-02-23 02:17:15,435 : INFO : processing from mappings SACP
2021-02-23 02:17:15,441 : INFO : processing from mappings SACP
2021-02-23 02:17:15,448 : INFO : processing from mappings SACP
2021-02-23 02:17:15,473 : INFO : processing from mappings SACP
2021-02-23 02:17:15,484 : INFO : processing from mappings SACP
2021-02-23 02:17:15,490 : INFO : processing from mappings SACP
2021-02-23 02:17:15,497 : INFO : processing from mappings SACP
2021-02-23 02:17:15,505 : INFO : processing from mappings SACP
2021-02-23 02:17:15,512 : INFO : processing from mappings SACP
2021-02-23 02:17:15,519 : INFO : processing from mappings SACP
2021-02-23 02:17:15,526 : INFO : processing from mappings SACP
2021-02-23 02:17:15,533 : INFO : processing from mappings SACP
2021-02-23 02:17:15,540 : INFO : processing from mappings SACP
2021-02-23 02:17:15,547 : INFO : processing from mappings SACP
2021-02-23 02:17:15,553 : INFO : processing from mappings SACP
2021-02-23 02:17:15,560 : INFO : processing from mappin

2021-02-23 02:17:16,375 : INFO : processing from mappings SACP
2021-02-23 02:17:16,384 : INFO : processing from mappings SACP
2021-02-23 02:17:16,392 : INFO : processing from mappings SACP
2021-02-23 02:17:16,397 : INFO : processing from mappings SACP
2021-02-23 02:17:16,408 : INFO : processing from mappings SACP
2021-02-23 02:17:16,418 : INFO : processing from mappings SACP
2021-02-23 02:17:16,424 : INFO : processing from mappings SACP
2021-02-23 02:17:16,430 : INFO : processing from mappings SACP
2021-02-23 02:17:16,436 : INFO : processing from mappings SACP
2021-02-23 02:17:16,442 : INFO : processing from mappings SACP
2021-02-23 02:17:16,449 : INFO : processing from mappings SACP
2021-02-23 02:17:16,456 : INFO : processing from mappings SACP
2021-02-23 02:17:16,463 : INFO : processing from mappings SACP
2021-02-23 02:17:16,470 : INFO : processing from mappings SACP
2021-02-23 02:17:16,477 : INFO : processing from mappings SACP
2021-02-23 02:17:16,484 : INFO : processing from mappin

2021-02-23 02:17:17,307 : INFO : processing from mappings SACP
2021-02-23 02:17:17,314 : INFO : processing from mappings SACP
2021-02-23 02:17:17,321 : INFO : processing from mappings SACP
2021-02-23 02:17:17,328 : INFO : processing from mappings SACP
2021-02-23 02:17:17,335 : INFO : processing from mappings SACP
2021-02-23 02:17:17,342 : INFO : processing from mappings SACP
2021-02-23 02:17:17,349 : INFO : processing from mappings SACP
2021-02-23 02:17:17,356 : INFO : processing from mappings SACP
2021-02-23 02:17:17,363 : INFO : processing from mappings SACP
2021-02-23 02:17:17,370 : INFO : processing from mappings SACP
2021-02-23 02:17:17,377 : INFO : processing from mappings SACP
2021-02-23 02:17:17,384 : INFO : processing from mappings SACP
2021-02-23 02:17:17,391 : INFO : processing from mappings SACP
2021-02-23 02:17:17,398 : INFO : processing from mappings SACP
2021-02-23 02:17:17,405 : INFO : processing from mappings SACP
2021-02-23 02:17:17,413 : INFO : processing from mappin

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise,Linked?
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.251702,0.444109,0.940516,0.059484,2.251629,1.266756,4.518397,6.905617,7.074863,4.349151,2.556466,0.169246,0.0
1,295,sacp-python-common/sacp_python_common/bandit/b...,1.260697,0.442341,0.948267,0.051733,2.251629,1.266756,4.518397,7.121928,7.362820,4.277506,2.844423,0.240891,0.0
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.253958,0.443664,0.944901,0.055099,2.500000,1.296814,4.518397,6.410990,6.507248,4.422139,1.988851,0.096258,1.0
3,295,sacp-python-common/sacp_python_common/cave/cav...,1.224535,0.449532,0.933030,0.066970,1.921928,1.214807,4.518397,6.077867,6.289934,4.306329,1.771537,0.212068,0.0
4,295,sacp-python-common/sacp_python_common/cave/cav...,1.225771,0.449282,0.922901,0.077099,2.000000,1.245112,4.518397,5.977547,6.207226,4.288719,1.688829,0.229678,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21307,2,sacp-python-common/test/python/third_party/tes...,1.241267,0.446176,0.923590,0.076410,1.000000,1.000000,4.518397,6.374522,6.640185,4.252734,2.121788,0.265663,0.0
21308,2,sacp-python-common/test/python/third_party/tes...,1.215125,0.451442,0.875721,0.124279,2.750000,1.322665,4.518397,6.731239,7.013031,4.236604,2.494634,0.281793,0.0
21309,2,sacp-python-common/test/python/third_party/tes...,1.153213,0.464422,0.868256,0.131744,2.947703,1.339310,4.518397,6.503741,6.701760,4.320379,2.183363,0.198018,0.0
21310,2,sacp-python-common/test/python/third_party/unu...,1.242090,0.446012,0.947109,0.052891,2.750000,1.322665,4.518397,6.334729,6.488707,4.364420,1.970309,0.153977,0.0


In [18]:
# Ground-truth file location, read from the 'path_mappings' entry of the configuration.
path_to_ground_truth =  parameters['path_mappings']

In [19]:
#[optional]GroundTruth Direct Processing
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
# NOTE(review): the saved output shows a single pair whose first element is '210,' (with a
# trailing comma) -- verify how the mappings file is parsed.
ground_links # A list of 2-tuples (see the saved output), not a single tuple

2021-02-23 02:19:30,149 : INFO : generating ground truth


[('210,', 'test/python/third_party/Corona_Report/license_Report.json')]

In [21]:
#[step 5]Saving GroundTruth Links
# The log line printed by this call shows the output file name; its timestamp is the value
# to pass to LoadLinks when reloading these links.
word2vec.SaveLinks(grtruth = True)

2021-02-23 02:19:47,814 : INFO : Saving in...../dvc-ds4se/metrics/traceability/experiments1.1.x/[sacp-python-common-VectorizationType.word2vec-LinkType.issue2src-True-1614046787.460911].csv


In [23]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_glinks = ds.mining.ir.LoadLinks(timestamp=1614046787.460911, params=parameters,grtruth = True, logging=logging)
df_glinks.head()

2021-02-23 02:20:11,825 : INFO : Loading computed links from... ../dvc-ds4se/metrics/traceability/experiments1.1.x/[sacp-python-common-VectorizationType.word2vec-LinkType.issue2src-True-1614046787.460911].csv
2021-02-23 02:20:11,888 : INFO : df_x.dtypesSource                        object
Target                        object
DistanceMetric.WMD           float64
SimilarityMetric.WMD_sim     float64
DistanceMetric.SCM           float64
SimilarityMetric.SCM_sim     float64
EntropyMetric.MSI_I          float64
EntropyMetric.MSI_X          float64
EntropyMetric.Entropy_src    float64
EntropyMetric.Entropy_tgt    float64
EntropyMetric.JI             float64
EntropyMetric.MI             float64
EntropyMetric.Loss           float64
EntropyMetric.Noise          float64
Linked?                      float64
dtype: object


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise,Linked?
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.251702,0.444109,0.940516,0.059484,2.251629,1.266756,4.518397,6.905617,7.074863,4.349151,2.556466,0.169246,0.0
1,295,sacp-python-common/sacp_python_common/bandit/b...,1.260697,0.442341,0.948267,0.051733,2.251629,1.266756,4.518397,7.121928,7.36282,4.277506,2.844423,0.240891,0.0
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.253958,0.443664,0.944901,0.055099,2.5,1.296814,4.518397,6.41099,6.507248,4.422139,1.988851,0.096258,1.0
3,295,sacp-python-common/sacp_python_common/cave/cav...,1.224535,0.449532,0.93303,0.06697,1.921928,1.214807,4.518397,6.077867,6.289934,4.306329,1.771537,0.212068,0.0
4,295,sacp-python-common/sacp_python_common/cave/cav...,1.225771,0.449282,0.922901,0.077099,2.0,1.245112,4.518397,5.977547,6.207226,4.288719,1.688829,0.229678,0.0


### Generating Documentation

In [None]:
! nbdev_build_docs #<-------- [Activate when stable]

In [None]:
! nbdev_build_lib

In [None]:
from nbdev.export import notebook2script
notebook2script()

In [None]:
#! pip install -e .