In [None]:
# default_exp mining.ir

# Information Retrieval and Traceability Interfaces
> Implementing Common Information Retrieval Interfaces
> Author: @danaderp December 2020

We test different similarity and distance measures, following [Comparing Distance Measurements with Python and SciPy](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [Comparison of Text Distance Metrics](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html).

In [None]:
# ! pip install -e . <----- Install in the console

In [None]:
#export
import ast
import functools
import os
from itertools import product
from random import sample

import gensim
import numpy as np
import pandas as pd

In [None]:
#export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora
from datetime import datetime
from enum import Enum, unique, auto
from ds4se.mgmnt.prep.conv import *

In [None]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
#export
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#Export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

## Enums

In [None]:
#export
@unique
class VectorizationType(Enum):
    '''Kinds of vectorization an experiment can select.'''
    word2vec = 1
    doc2vec = 2
    vsm2vec = 3

In [None]:
VectorizationType.word2vec

<VectorizationType.word2vec: 1>

In [None]:
#export
@unique
class DistanceMetric(Enum):
    '''Identifiers of the distance metrics that can be requested for a link.'''
    WMD = 1
    COS = 2
    SCM = 3
    EUC = 4
    MAN = 5

In [None]:
#export
@unique
class SimilarityMetric(Enum):
    '''Identifiers of the similarity metrics reported next to the distances.'''
    WMD_sim = 1
    COS_sim = 2
    SCM_sim = 3
    EUC_sim = 4
    MAN_sim = 5
    Pearson = 6

In [None]:
#export
@unique
class EntropyMetric(Enum):
    '''Identifiers of the information-theory metrics.'''
    MSI_I = 1        # Minimum shared information, entropy
    MSI_X = 2        # Minimum shared information, extropy
    MI = 3           # Mutual information
    JI = 4           # Joint information
    Loss = 5         # Conditioned entropy given the output, I(x|y)
    Noise = 6        # Conditioned entropy given the input, I(y|x)
    Entropy_src = 7  # Self information of the source artifacts
    Entropy_tgt = 8  # Self information of the target artifacts

In [None]:
#export
@unique
class SoftwareArtifacts(Enum):
    '''Artifact kinds; each value is the string compared against the corpus 'type' column.'''
    REQ = 'req'
    TC  = 'tc'
    SRC = 'src'
    PY  = 'py'
    PR  = 'pr'
    UC  = 'uc'

In [None]:
#export
@unique
class Preprocessing(Enum):
    '''Preprocessing flavours of the corpus text: conventional (conv) or BPE tokens (bpe).'''
    conv = 1
    bpe = 2

In [None]:
#export
@unique
class LinkType(Enum):
    '''Kinds of traceability links, named <source>2<target>.'''
    req2tc = 1
    req2src = 2
    issue2src = 3
    pr2src = 4
    uc2src = 5
    uc2tc = 6

In [None]:
#tst
LinkType.req2tc

<LinkType.req2tc: 1>

In [None]:
#tst
Preprocessing.bpe

<Preprocessing.bpe: 2>

## 1. Setting-Up Testing Environment

In [None]:
#hide
# Root of the dataset. Keep the trailing '/': the cells below build their paths by
# plain string concatenation (path_data + 'models/...').
path_data = '../dvc-ds4se/' #dataset path

In [None]:
#hide
#experiment 0.0.0
#check it out in https://docs.google.com/spreadsheets/d/1UggaKFK8Qr5YltG_X9dN9BUlgH-GNiAbfPkcqSxjyoo/edit?usp=sharing
# word2vec model trained on the 8k BPE vocabulary.
# NOTE: later experiment cells reassign this module-level name, so its value depends on
# which cell ran last.
path_to_trained_model = path_data+'models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model'
# Prefix of the BPE model files; passed to the experiments as "model_prefix".
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'

In [None]:
#hide
#experiment 0.0.0
#Experiment 1 with Libest Conv preprocessing
def libest_params(path_to_trained_model=path_to_trained_model,
                  path_model_prefix=path_model_prefix):
    """Return the experiment parameters for the LibEST requirement-to-test-case run.

    Both arguments default to the value the module-level variable of the same
    name holds when this cell is executed. Binding them at definition time
    keeps the returned configuration stable even though later cells reassign
    those variables for other experiments.

    @path_to_trained_model path stored under "path_to_trained_model"
    @path_model_prefix path stored under "model_prefix" (for BPE analysis)
    """
    # Corpus file description: location, column separator, the columns to keep
    # and the preprocessing flavour of the text column.
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.req2tc,
        "system": 'libest',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.REQ.value,
        "target_type": SoftwareArtifacts.TC.value,
        "system_path_config": corpus_config,
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
        "model_prefix": path_model_prefix, #For BPE Analysis
        "path_mappings": path_data + 'se-benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt',
    }

In [None]:
#Experiment 0.0.6
path_to_trained_model = path_data+'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
def etour_params(path_to_trained_model=path_to_trained_model):
    """Return the experiment parameters for the eTour use-case-to-source run.

    The argument defaults to the model path assigned just above. It is bound
    when this cell runs, so later cells that reassign the module-level
    variable do not change what this function returns.

    @path_to_trained_model path stored under "path_to_trained_model"
    """
    # Corpus file description: location, column separator, the columns to keep
    # and the preprocessing flavour of the text column.
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/testbeds/processed/[etour-all-corpus-1609209368.279199].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.uc2src,
        "system": 'etour',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.UC.value,
        "target_type": SoftwareArtifacts.SRC.value,
        "system_path_config": corpus_config,
        "path_mappings": path_data + "se-benchmarking/traceability/testbeds/groundtruth/italian/[etour-ground-uc-to-src].txt",
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
    }

In [None]:
#Experiment 0.0.7
path_to_trained_model = path_data+'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
def itrust_params(path_to_trained_model=path_to_trained_model):
    """Return the experiment parameters for the iTrust use-case-to-source run.

    The argument defaults to the model path assigned just above. It is bound
    when this cell runs, so later cells that reassign the module-level
    variable do not change what this function returns.

    @path_to_trained_model path stored under "path_to_trained_model"
    """
    # Corpus file description: location, column separator, the columns to keep
    # and the preprocessing flavour of the text column.
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/testbeds/processed/[itrust-all-corpus-1609210989.304283].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.uc2src,
        "system": 'itrust',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.UC.value,
        "target_type": SoftwareArtifacts.SRC.value,
        "system_path_config": corpus_config,
        "path_mappings": path_data + "se-benchmarking/traceability/testbeds/groundtruth/english/[itrust-ground-uc-to-src].txt",
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
    }

In [None]:
#Experiments 1.0.2 <<-- word2vec
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'
# path_data already ends with '/', so no extra separator is added here
path_to_trained_model = path_data+'models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model'
def sacp_params(path_to_trained_model=path_to_trained_model,
                path_model_prefix=path_model_prefix):
    """Return the experiment parameters for sacp-python-common with BPE (8k) preprocessing.

    NOTE: a later cell defines another function with this same name for the
    conventional preprocessing; whichever definition is executed last is the
    one bound to `sacp_params`.

    Both arguments default to the paths assigned just above. They are bound
    when this cell runs, so later reassignments of the module-level variables
    do not change what this function returns.

    @path_to_trained_model path stored under "path_to_trained_model"
    @path_model_prefix path stored under "model_prefix"
    """
    # Corpus file description: location, column separator, the columns to keep
    # and the preprocessing flavour of the text column.
    corpus_config = {
        "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
        "sep": '~',
        "names": ['ids','bpe8k'],
        "prep": Preprocessing.bpe
    }
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.PR.value,
        "target_type": SoftwareArtifacts.PY.value,
        "system_path_config": corpus_config,
        "path_mappings": "/tf/data/cisco/sacp_data/sacp-pr-mappings.csv",
        "saving_path": path_data + 'metrics/traceability/experiments1.0.x/',
        "names": ['Source','Target','Linked?'],
        "model_prefix": path_model_prefix
    }

In [None]:
#Experiments 0.0.2 <<-- word2vec
# path_data already ends with '/', so no extra separator is added here
path_to_trained_model = path_data+'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
def sacp_params(path_to_trained_model=path_to_trained_model):
    """Return the experiment parameters for sacp-python-common with conventional preprocessing.

    NOTE: this redefines `sacp_params`; an earlier cell defines a function of
    the same name for the BPE experiment, and running this cell replaces it.

    The argument defaults to the model path assigned just above. It is bound
    when this cell runs, so later reassignments of the module-level variable
    do not change what this function returns.

    @path_to_trained_model path stored under "path_to_trained_model"
    """
    # Corpus file description: location, column separator, the columns to keep
    # and the preprocessing flavour of the text column.
    corpus_config = {
        "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.PR.value,
        "target_type": SoftwareArtifacts.PY.value,
        "system_path_config": corpus_config,
        "path_mappings": "/tf/data/cisco/sacp_data/sacp-pr-mappings.csv",
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?']
    }

In [None]:
#hide
# NOTE(review): sacp_params is defined in two cells above (BPE and conventional); the call
# below uses whichever cell was executed last. The output recorded under this cell
# ('bpe8k', experiments1.0.x) comes from the BPE definition, which is not what a
# top-to-bottom run of the notebook produces.
parameters = sacp_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.issue2src: 3>,
 'system': 'sacp-python-common',
 'path_to_trained_model': '../dvc-ds4se//models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model',
 'source_type': 'pr',
 'target_type': 'py',
 'system_path_config': {'system_path': '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
  'sep': '~',
  'names': ['ids', 'bpe8k'],
  'prep': <Preprocessing.bpe: 2>},
 'path_mappings': '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
 'saving_path': '../dvc-ds4se/metrics/traceability/experiments1.0.x/',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_8k'}

In [None]:
parameters['source_type']

'pr'

In [None]:
#tst
parameters['system_path_config']['system_path']

'/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv'

In [None]:
#tst
parameters['system_path_config']['names'][1]

'bpe8k'

In [None]:
parameters['system_path_config']['sep'] #tst

'~'

In [None]:
#hide
# Load the whole corpus of the selected system into one frame.
df_all_system = pd.read_csv(
            parameters['system_path_config']['system_path'], 
            # column names are not passed: the file is expected to carry its own header row
            header = 0, 
            index_col = 0, 
            sep = parameters['system_path_config']['sep'] 
        )

In [None]:
df_all_system.head(1)

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '...","['▁production', '▁merge', '▁*', '▁feed', '▁rel...","['▁production', '▁merge', '▁*', '▁feed', '▁rel..."


In [None]:
#hide
# Name of the text column selected by the experiment (second entry of 'names').
tag = parameters['system_path_config']['names'][1]
# Whitespace tokenisation of every non-null document.
# NOTE(review): for a BPE column each cell is the repr of a token list, so split() returns
# fragments such as "['▁production'," (see the output below) instead of the tokens.
[doc.split() for doc in df_all_system[df_all_system[tag].notnull()][tag].values]

[["['▁production',",
  "'▁mer',",
  "'ge',",
  "'▁*',",
  "'▁feed',",
  "'▁release',",
  "'▁name',",
  "'▁through',",
  "'▁to',",
  "'▁up',",
  "'load',",
  "'b',",
  "'om',",
  "'▁(',",
  "'#',",
  "'29',",
  "'3',",
  "'▁)',",
  "'▁*',",
  "'▁allow',",
  "'▁app',",
  "'end',",
  "'▁images',",
  "'▁(',",
  "'#',",
  "'28',",
  "'7',",
  "'▁)',",
  "'▁*',",
  "'▁d',",
  "'are',",
  "'▁test',",
  "'▁fields',",
  "'▁(',",
  "'#',",
  "'29',",
  "'4)']"],
 ["['▁add',",
  "'▁test',",
  "'▁fields',",
  "'▁for',",
  "'▁d',",
  "'are',",
  "'▁p',",
  "'ush',",
  "'▁*',",
  "'▁added',",
  "'▁test',",
  "'▁data',",
  "'▁to',",
  "'▁the',",
  "'▁j',",
  "'son',",
  "'▁being',",
  "'▁sent',",
  "'▁to',",
  "'▁d',",
  "'are',",
  "'▁when',",
  "'▁running',",
  "'▁st',",
  "'atic',",
  "'▁code',",
  "'▁analysis',",
  "'.',",
  "'▁example',",
  "'▁p',",
  "'s',",
  "'b',",
  "'▁data',",
  "'▁below',",
  "'.',",
  "'▁clos',",
  "'es',",
  "'▁sac',",
  "'p',",
  "'/',",
  "'cs',",
  "'b',",
  "'-',",


In [None]:
len(df_all_system[tag].values) #tst

362

In [None]:
#tst
len(df_all_system[df_all_system[tag].notnull()]) #some files are _init_ files and are therefore empty

362

In [None]:
#tst
df_all_system[df_all_system[tag].notnull()][tag].values

array(["['▁production', '▁mer', 'ge', '▁*', '▁feed', '▁release', '▁name', '▁through', '▁to', '▁up', 'load', 'b', 'om', '▁(', '#', '29', '3', '▁)', '▁*', '▁allow', '▁app', 'end', '▁images', '▁(', '#', '28', '7', '▁)', '▁*', '▁d', 'are', '▁test', '▁fields', '▁(', '#', '29', '4)']",
       '[\'▁add\', \'▁test\', \'▁fields\', \'▁for\', \'▁d\', \'are\', \'▁p\', \'ush\', \'▁*\', \'▁added\', \'▁test\', \'▁data\', \'▁to\', \'▁the\', \'▁j\', \'son\', \'▁being\', \'▁sent\', \'▁to\', \'▁d\', \'are\', \'▁when\', \'▁running\', \'▁st\', \'atic\', \'▁code\', \'▁analysis\', \'.\', \'▁example\', \'▁p\', \'s\', \'b\', \'▁data\', \'▁below\', \'.\', \'▁clos\', \'es\', \'▁sac\', \'p\', \'/\', \'cs\', \'b\', \'-\', \'c\', \'ic\', \'d\', \'p\', \'ip\', \'eline\', \'ed\', \'ition\', \'#\', \'38\', \'1\', \'▁*\', \'▁added\', \'▁2\', \'▁additional\', \'▁sc\', \'f\', \'▁m\', \'app\', \'ings\', \'▁*\', \'▁[\', \'t\', \'est\', \'▁build\', \'s\', \']\', \'(\', \'h\', \'tt\', \'ps\', \'://\', \'eng\', \'ci\', \'-\',

In [None]:
#tst
df_all_system.loc[df_all_system['type'] == parameters['source_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,bpe8k
0,295,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '..."
1,294,"['▁add', '▁test', '▁fields', '▁for', '▁d', 'ar..."
2,293,"['▁allow', '▁passing', '▁a', '▁release', '▁to'..."
3,287,"['▁allow', '▁app', 'end', '▁images', '▁#', '3'..."
4,274,"['▁move', '▁d', 'ock', 'er', '/', 'black', 'd'..."
...,...,...
283,7,"['▁upd', 'ate', '▁b', 'd', 'sc', 'an']"
284,4,"['▁syn', 'ch']"
285,5,"['▁syn', 'c']"
286,1,"['▁tem', 'por', 'arily', '▁dis', 'able', '▁gre..."


In [None]:
df_all_system.loc[df_all_system['type'] == parameters['target_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,bpe8k
1,sacp-python-common/sacp_python_common/auth_uti...,"['▁""', '""', '""', '\r\n', 'c', 're', 'ated', '▁..."
3,sacp-python-common/sacp_python_common/bandit/b...,"['▁#', '!', '/', 'us', 'r', '/', 'b', 'in', '/..."
4,sacp-python-common/sacp_python_common/bandit/b...,"['▁import', '▁j', 'son', '\r\n\r\n', 'from', '..."
6,sacp-python-common/sacp_python_common/cave/cav...,"['▁#', '!', '/', 'us', 'r', '/', 'b', 'in', '/..."
7,sacp-python-common/sacp_python_common/cave/cav...,"['▁#', '!', '/', 'us', 'r', '/', 'b', 'in', '/..."
...,...,...
92,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁os', '\r\n', 'im', 'port', '▁un'..."
93,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁os', '\r\n', 'im', 'port', '▁un'..."
94,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁un', 'itt', 'est', '\r\n', 'from..."
95,sacp-python-common/test/python/third_party/unu...,"['▁#', '▁import', '▁j', 'son', '\r\n', '#', '▁..."


## 1. Defining BasicSequenceVectorization

In [None]:
#tst
print(list(VectorizationType), list(DistanceMetric), list(SimilarityMetric), list(LinkType))

[<VectorizationType.word2vec: 1>, <VectorizationType.doc2vec: 2>, <VectorizationType.vsm2vec: 3>] [<DistanceMetric.WMD: 1>, <DistanceMetric.COS: 2>, <DistanceMetric.SCM: 3>, <DistanceMetric.EUC: 4>, <DistanceMetric.MAN: 5>] [<SimilarityMetric.WMD_sim: 1>, <SimilarityMetric.COS_sim: 2>, <SimilarityMetric.SCM_sim: 3>, <SimilarityMetric.EUC_sim: 4>, <SimilarityMetric.MAN_sim: 5>, <SimilarityMetric.Pearson: 6>] [<LinkType.req2tc: 1>, <LinkType.req2src: 2>, <LinkType.issue2src: 3>, <LinkType.pr2src: 4>, <LinkType.uc2src: 5>, <LinkType.uc2tc: 6>]


In [None]:
#export
class BasicSequenceVectorization():
    '''Base class of the sequence-vanilla-vectorization; other classes can inherit from it.

    It loads the corpus of one system, separates source and target artifacts,
    builds the vocabulary of the traceability corpus and offers helpers to
    generate candidate links, compute metrics for them, match them against a
    ground truth and save the result.

    `computeDistanceMetric` calls `self.distance(metric_list, link)`, which is
    not defined here; subclasses are expected to provide it.
    '''
    def __init__(self, params, logging):
        '''Load the corpus described by `params` and prepare documents, dictionary and vocab.

        @params dictionary of experiment settings; read here are
                'system_path_config' ('system_path', 'sep', 'names', 'prep'),
                'source_type' and 'target_type'
        @logging logger-like object offering info() and warning()
        '''
        self.params = params
        self.logging = logging
        self.df_nonground_link = None   # filled by ComputeDistanceArtifacts
        self.df_ground_link = None      # filled by MatchWithGroundTruth

        corpus_config = self.params['system_path_config']
        bpe = Preprocessing.bpe == corpus_config['prep']
        self.prep = ConventionalPreprocessing(self.params, bpe = bpe)

        self.df_all_system = pd.read_csv(
            corpus_config['system_path'],
            # column names are not passed: the file must carry its own header row
            header = 0,
            index_col = 0,
            sep = corpus_config['sep']
        )

        # Rows are selected by artifact type; only the configured columns are kept.
        # .copy() makes the frames independent so the fillna assignments below do not
        # write into a slice of df_all_system.
        columns = corpus_config['names']
        artifact_type = self.df_all_system['type']
        self.df_source = self.df_all_system.loc[artifact_type == self.params['source_type'], columns].copy()
        self.df_target = self.df_all_system.loc[artifact_type == self.params['target_type'], columns].copy()

        #NA verification
        tag = columns[1]
        self.df_source[tag] = self.df_source[tag].fillna("")
        self.df_target[tag] = self.df_target[tag].fillna("")

        ## self.documents and self.dictionary are the vocabulary of the traceability corpus
        ## Do not confuse it with the dictionary of the general vectorization model
        if corpus_config['prep'] == Preprocessing.conv: #if conventional preprocessing
            text = self.df_all_system[tag]
            self.documents = [doc.split() for doc in text[text.notnull()].values] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.vocab = dict.fromkeys( self.dictionary.token2id.keys(),0 )
            self.logging.info("conventional preprocessing documents, dictionary, and vocab for the test corpus")

        elif corpus_config['prep'] == Preprocessing.bpe:
            self.documents = self._bpe_documents(tag) #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.computing_bpe_vocab(tag=tag)
            self.logging.info("bpe preprocessing documents, dictionary, and vocab for the test corpus")

        # Column labels produced by each requested metric, in the order the values are
        # expected. This can be extended for future metrics <---------------------
        self.dict_labels = {
            DistanceMetric.COS:[DistanceMetric.COS, SimilarityMetric.COS_sim],
            SimilarityMetric.Pearson:[SimilarityMetric.Pearson],
            DistanceMetric.EUC:[DistanceMetric.EUC, SimilarityMetric.EUC_sim],
            DistanceMetric.WMD:[DistanceMetric.WMD, SimilarityMetric.WMD_sim],
            DistanceMetric.SCM:[DistanceMetric.SCM, SimilarityMetric.SCM_sim],
            DistanceMetric.MAN:[DistanceMetric.MAN, SimilarityMetric.MAN_sim],
            EntropyMetric.MSI_I:[EntropyMetric.MSI_I, EntropyMetric.MSI_X],
            EntropyMetric.MI:[EntropyMetric.Entropy_src, EntropyMetric.Entropy_tgt,
                              EntropyMetric.JI, EntropyMetric.MI,
                              EntropyMetric.Loss, EntropyMetric.Noise
                             ]
        }

    def _bpe_documents(self, tag):
        '''Parse the non-null cells of column `tag` (each the repr of a token list) into lists.'''
        column = self.df_all_system[tag]
        # literal_eval accepts Python literals only, so a corrupted corpus cell cannot run code
        return [ast.literal_eval(doc) for doc in column[column.notnull()].values]

    def computing_bpe_vocab(self,tag):
        '''Set self.vocab to every piece of the BPE model plus the corpus tokens it lacks, all mapped to 0.

        @tag name of the corpus column holding the BPE token lists
        '''
        ####INFO science params
        corpus_tokens = set().union(*(set(doc) for doc in self._bpe_documents(tag))) #union of sets
        # presumably self.prep.sp_bpe is the BPE model loaded by the preprocessing object;
        # verify against ConventionalPreprocessing
        piece_count = self.prep.sp_bpe.get_piece_size()
        self.vocab = {self.prep.sp_bpe.id_to_piece(piece_id): 0 for piece_id in range(piece_count)}
        ignored_tokens = corpus_tokens - set(self.vocab.keys()) #Ignored vocab by BPE
        self.logging.info('Ignored vocab by BPE' + str(ignored_tokens) )
        self.vocab.update(dict.fromkeys(ignored_tokens, 0)) #Updating

    def ground_truth_processing(self, path_to_ground_truth = '', from_mappings = False):
        '''Optional step when the corpus has a ground truth. Creates the (source, target) link tuples.

        @path_to_ground_truth text file with one source per line followed by its targets,
                              separated by whitespace; ignored when from_mappings is True
        @from_mappings if True, read the comma-separated file at params['path_mappings']
                       and pair its 'id_pr' (as string) and 'doc_id' columns
        @return list of (source, target) tuples; empty when the file has no links
        '''
        if from_mappings:
            df_mapping = pd.read_csv(self.params['path_mappings'], header = 0, sep = ',')
            ground_links = list(zip(df_mapping['id_pr'].astype(str), df_mapping['doc_id']))
            self.logging.info('ground truth from mappings')
        else:
            self.logging.info('generating ground truth')
            ground_links = []
            #Organizing The Ground Truth under the given format
            with open(path_to_ground_truth,'r') as ground_truth: # closed even if parsing fails
                for line in ground_truth:
                    fields = line.split()
                    # the first field is the source, every other field is one of its targets
                    ground_links.extend((fields[0], elem) for elem in fields[1:])
            #To Verify Redundancies in the file (dict keeps the first occurrence, in file order)
            unique_links = list(dict.fromkeys(ground_links))
            if len(unique_links) != len(ground_links):
                ground_links = unique_links
                self.logging.warning("-----WARNING!-------- Redundacy in the ground truth file")
        return ground_links

    def samplingLinks(self, sampling = False, samples = 10, basename = False):
        '''Build the candidate links as the cartesian product of source and target ids.

        @sampling if True, return only `samples` links drawn at random without replacement
        @samples number of links to draw; random.sample raises ValueError when it exceeds
                 the number of available links
        @basename if True, reduce every id to its file name with os.path.basename
        @return list of (source_id, target_id) tuples
        '''
        source = self.df_source['ids'].values
        target = self.df_target['ids'].values
        if basename:
            source = [os.path.basename(elem) for elem in source]
            target = [os.path.basename(elem) for elem in target]

        links = list( product( source , target ) )
        if sampling:
            links = sample( links, samples )
        return links

    def cos_scipy(self, vector_v, vector_w):
        '''Return [cosine distance, 1 - cosine distance] of the two vectors.'''
        cos =  distance.cosine( vector_v, vector_w )
        return [cos, 1.-cos]

    def euclidean_scipy(self, vector_v, vector_w):
        '''Return [euclidean distance, 1/(1 + distance)] of the two vectors.'''
        dst = distance.euclidean(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def manhattan_scipy(self, vector_v, vector_w):
        '''Return [manhattan (cityblock) distance, 1/(1 + distance)] of the two vectors.'''
        dst = distance.cityblock(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def pearson_abs_scipy(self, vector_v, vector_w):
        '''Return [|r|], the absolute Pearson correlation of the two vectors.

        We are not sure that pearson correlation works well on doc2vec inference vectors.
        '''
        corr, _ = pearsonr(vector_v, vector_w)
        return [abs(corr)] #Absolute value of the correlation

    def computeDistanceMetric(self, links, metric_list):
        '''Metric List Iteration

        @links iterable of (source, target) pairs
        @metric_list metrics to compute; every entry must be a key of self.dict_labels
        @return (rows, labels): one [source, target, value, ...] row per link, and the flat
                list of column labels for the values
        '''
        # Flatten the labels of every requested metric (also fine for an empty list)
        metric_labels = [label for metric in metric_list for label in self.dict_labels[metric]]
        # self.distance is supplied by the subclass and must return a list of values
        distSim = [[link[0], link[1]] + self.distance( metric_list, link ) for link in links]
        return distSim, metric_labels

    def ComputeDistanceArtifacts(self, metric_list, sampling = False , samples = 10, basename = False):
        '''Activates Distance and Similarity Computations; the result is stored in self.df_nonground_link
        @metric_list metrics to compute, forwarded to self.distance
                     NOTE(review): the original note says [] computes all metrics; that
                     depends on the subclass' distance() - confirm
        @sampling is False by the default
        @samples is the number of samples (or links) to be generated
        @basename if True, link ids are reduced to their file name'''
        links_ = self.samplingLinks( sampling, samples, basename )

        docs, metric_labels = self.computeDistanceMetric( metric_list=metric_list, links=links_) #checkpoints
        columns = [self.params['names'][0], self.params['names'][1]] + metric_labels
        self.df_nonground_link = pd.DataFrame(docs, columns = columns) #Transforming into a Pandas
        self.logging.info("Non-groundtruth links computed")

    def SaveLinks(self, grtruth=False, sep=' ', mode='a'):
        '''Write the link frame to a timestamped csv file under params['saving_path'].

        @grtruth if True save self.df_ground_link, otherwise self.df_nonground_link
        @sep column separator of the written file
        @mode file mode handed to DataFrame.to_csv
        '''
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self.params['saving_path'] + '['+ self.params['system'] + '-' + str(self.params['vectorizationType']) + '-' + str(self.params['linkType']) + '-' + str(grtruth) + '-{}].csv'.format(timestamp)

        links = self.df_ground_link if grtruth else self.df_nonground_link
        links.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)

        self.logging.info('Saving in...' + path_to_link)

    def findDistInDF(self, g_tuple, from_mappings=False, semeru_format=False):
        '''Return the index values of the rows of self.df_ground_link matched by one ground-truth link.

        .eq is used for the source of the mappings since it must match the exact code to
        avoid number substrings; for the target, a substring match works fine.
        For the semeru format, '/' is prepended to both elements of the tuple to avoid
        matching more than one substring.

        @g_tuple (source, target) tuple from the ground truth
        @from_mappings SACP format
        @semeru_format LibEST format
        '''
        source_col, target_col = self.params['names'][0], self.params['names'][1]
        links = self.df_ground_link

        if from_mappings: #SACP Format
            self.logging.info('processing from mappings SACP')
            dist = links.loc[links[source_col].eq(g_tuple[0])
                             & links[target_col].str.contains(g_tuple[1], regex=False)]
        elif semeru_format: #LibEST Format
            self.logging.info('processing from semeru_format LibEST')
            dist = links.loc[links[source_col].str.contains('/' + g_tuple[0], regex=False)
                             & links[target_col].str.contains('/' + g_tuple[1], regex=False)]
        else: #By Default use Semeru Format
            self.logging.info('processing by Default')
            # Each name is cut at its first '.'.
            # NOTE(review): when a name has no '.', find() returns -1 and the slice drops the
            # last character; the stems are also used as regular expressions - confirm intent.
            source_stem = g_tuple[0][:g_tuple[0].find('.')] + '-'
            target_stem = g_tuple[1][:g_tuple[1].find('.')]
            dist = links[links[source_col].str.contains( source_stem )
                         & links[target_col].str.contains( target_stem )]
        return dist.index.values

    def MatchWithGroundTruth(self, path_to_ground_truth='', from_mappings=False, semeru_format=False ):
        '''Copy self.df_nonground_link into self.df_ground_link and flag the ground-truth links.

        A column named params['names'][2] is added: 1 for rows matched by a ground-truth
        link, 0 otherwise. Requires ComputeDistanceArtifacts to have been run first.

        @path_to_ground_truth, @from_mappings see ground_truth_processing
        @from_mappings, @semeru_format see findDistInDF
        '''
        linked_col = self.params['names'][2]
        self.df_ground_link = self.df_nonground_link.copy()
        self.df_ground_link[linked_col] = 0

        ground_links = self.ground_truth_processing(path_to_ground_truth, from_mappings)
        matchGT = [ self.findDistInDF( g , from_mappings=from_mappings, semeru_format=semeru_format ) for g in ground_links ]
        if matchGT:
            # One concatenation for all links; np.unique drops indexes matched by several
            # links, which DataFrame.update cannot align.
            matched_index = np.unique(np.concatenate(matchGT))
            if matched_index.size:
                new_column = pd.Series(np.full([len(matched_index)], 1 ), name=linked_col, index = matched_index)
                self.df_ground_link.update(new_column)

        self.logging.info("Groundtruth links computed")

### Testing BasicSequenceVectorization

In [None]:
general2vec =  BasicSequenceVectorization(params = parameters, logging =logging)

2021-01-26 01:04:29,920 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 01:04:30,004 : INFO : built Dictionary(2193 unique tokens: ['#', '28', '29', '3', '4)']...) from 362 documents (total 205581 corpus positions)
2021-01-26 01:04:30,247 : INFO : Ignored vocab by BPE{'`', '\\', '\r\n\r\n@', '\r\n', 'γ', '```', '\t', '^', '@', '\r\n\r\n', '\r\n\r\n\r\n'}
2021-01-26 01:04:30,251 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus


In [None]:
general2vec.params['system_path_config']['names'][1]

'bpe8k'

In [None]:
general2vec.df_all_system.head()

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '...","['▁production', '▁merge', '▁*', '▁feed', '▁rel...","['▁production', '▁merge', '▁*', '▁feed', '▁rel..."
1,294,Add test fields for DARE push * Added test dat...,pr,add test field dare push test data json sent d...,"['▁add', '▁test', '▁fields', '▁for', '▁d', 'ar...","['▁add', '▁test', '▁fields', '▁for', '▁dare', ...","['▁add', '▁test', '▁fields', '▁for', '▁dare', ..."
2,293,"Allow passing a release to uploadBom by name, ...",pr,allow pass releas upload bom name rather chang...,"['▁allow', '▁passing', '▁a', '▁release', '▁to'...","['▁allow', '▁passing', '▁a', '▁release', '▁to'...","['▁allow', '▁passing', '▁a', '▁release', '▁to'..."
3,287,Allow append images #363 - Changed how image n...,pr,allow append imag chang imag name creat send c...,"['▁allow', '▁app', 'end', '▁images', '▁#', '3'...","['▁allow', '▁append', '▁images', '▁#3', '63', ...","['▁allow', '▁append', '▁images', '▁#3', '63', ..."
4,274,Move docker/blackduck test to slave 4,pr,move docker blackduck test slave,"['▁move', '▁d', 'ock', 'er', '/', 'black', 'd'...","['▁move', '▁dock', 'er', '/', 'black', 'd', 'u...","['▁move', '▁docker', '/', 'black', 'duck', '▁t..."


In [None]:
abstracted_vocab = [ set( eval(doc) ) for doc in general2vec.df_all_system[ 'bpe8k' ].values] #<<-- Only BPE

In [None]:
abstracted_vocab[1]

{'"',
 '",',
 '#',
 '(',
 ')',
 ',',
 '-',
 '-0',
 '-2',
 '.',
 '."',
 '/',
 '1',
 '13',
 '2',
 '20',
 '24',
 '38',
 '44',
 '5,',
 '59',
 '7',
 ':',
 '://',
 ':00',
 '[',
 ']',
 '_',
 '```',
 'ac',
 'act',
 'ae',
 'af',
 'age',
 'al',
 'als',
 'an',
 'ann',
 'app',
 'ar',
 'are',
 'art',
 'ash',
 'ass',
 'at',
 'ated',
 'ath',
 'atic',
 'atus',
 'av',
 'b',
 'bs',
 'c',
 'cher',
 'ci',
 'cl',
 'co',
 'com',
 'cs',
 'd',
 'de',
 'des',
 'dis',
 'e',
 'ec',
 'ed',
 'eline',
 'em',
 'en',
 'eng',
 'equ',
 'ers',
 'erv',
 'es',
 'ess',
 'est',
 'ets',
 'f',
 'h',
 'he',
 'ial',
 'ic',
 'id',
 'ie',
 'if',
 'ig',
 'ile',
 'in',
 'ing',
 'ings',
 'ins',
 'ion',
 'ip',
 'irect',
 'is',
 'ition',
 'ity',
 'j',
 'jo',
 'k',
 'kins',
 'ky',
 'l',
 'ld',
 'let',
 'll',
 'm',
 'name',
 'ob',
 'ok',
 'or',
 'ors',
 'os',
 'ot',
 'out',
 'ow',
 'p',
 'part',
 'pat',
 'ps',
 'put',
 'q',
 'r',
 're',
 'red',
 'res',
 'rew',
 'ription',
 'rit',
 's',
 'sc',
 'sec',
 'son',
 'sp',
 'ss',
 'st',
 't',
 

In [None]:
m = dict.fromkeys( general2vec.dictionary.token2id.keys(),0 ) #From traceability dataset!

In [None]:
n = general2vec.vocab

In [None]:
len(set(m.keys()))

2193

In [None]:
len(set(n.keys()))

8011

In [None]:
set(m.keys()) - set(n.keys())

set()

In [None]:
set(n.keys()) - set(m.keys())

{'lets',
 'uguese',
 '▁continuing',
 '▁hit',
 '▁recognized',
 'ones',
 '▁slow',
 '▁korean',
 '▁pig',
 '▁daughter',
 '▁closely',
 '▁toy',
 '▁protest',
 '▁squ',
 'ellig',
 '▁contemporary',
 '▁illustr',
 'owers',
 'english',
 '▁qualified',
 'zo',
 '▁producing',
 '▁sout',
 '▁contribut',
 'ster',
 '▁crit',
 'yers',
 'eds',
 'osa',
 '▁spread',
 '▁seventh',
 '▁tamil',
 '▁detroit',
 'polit',
 '▁leading',
 '▁hans',
 '▁derby',
 'unk',
 '▁credited',
 '▁came',
 '▁great',
 '▁trade',
 'la',
 '▁quarter',
 '<unk>',
 'ena',
 '▁empire',
 'caa',
 '▁ott',
 '▁cell',
 'ishop',
 '▁invol',
 '▁characteristics',
 'ause',
 '▁athlet',
 ',"',
 'bour',
 '▁kennedy',
 'ada',
 '▁500',
 '▁directors',
 'ried',
 'atriate',
 '▁upper',
 '▁sum',
 '▁thous',
 'inted',
 '▁artist',
 '▁elected',
 'fm',
 '▁switzerland',
 '▁spacewatch',
 '▁died',
 '▁young',
 '▁claim',
 'othing',
 'mar',
 'iden',
 '▁fighter',
 '▁california',
 '▁provides',
 '▁presence',
 'bon',
 '▁(,',
 '▁mach',
 '▁188',
 'jan',
 '▁says',
 'year',
 'ilies',
 '▁deput

In [None]:
len(set(m.keys()) - set(n.keys())) #TODO

0

In [None]:
assert len(set( m.keys()) - set(n.keys())) == 0 

In [None]:
general2vec.documents

[['▁production',
  '▁mer',
  'ge',
  '▁*',
  '▁feed',
  '▁release',
  '▁name',
  '▁through',
  '▁to',
  '▁up',
  'load',
  'b',
  'om',
  '▁(',
  '#',
  '29',
  '3',
  '▁)',
  '▁*',
  '▁allow',
  '▁app',
  'end',
  '▁images',
  '▁(',
  '#',
  '28',
  '7',
  '▁)',
  '▁*',
  '▁d',
  'are',
  '▁test',
  '▁fields',
  '▁(',
  '#',
  '29',
  '4)'],
 ['▁add',
  '▁test',
  '▁fields',
  '▁for',
  '▁d',
  'are',
  '▁p',
  'ush',
  '▁*',
  '▁added',
  '▁test',
  '▁data',
  '▁to',
  '▁the',
  '▁j',
  'son',
  '▁being',
  '▁sent',
  '▁to',
  '▁d',
  'are',
  '▁when',
  '▁running',
  '▁st',
  'atic',
  '▁code',
  '▁analysis',
  '.',
  '▁example',
  '▁p',
  's',
  'b',
  '▁data',
  '▁below',
  '.',
  '▁clos',
  'es',
  '▁sac',
  'p',
  '/',
  'cs',
  'b',
  '-',
  'c',
  'ic',
  'd',
  'p',
  'ip',
  'eline',
  'ed',
  'ition',
  '#',
  '38',
  '1',
  '▁*',
  '▁added',
  '▁2',
  '▁additional',
  '▁sc',
  'f',
  '▁m',
  'app',
  'ings',
  '▁*',
  '▁[',
  't',
  'est',
  '▁build',
  's',
  ']',
  '(',


In [None]:
len(general2vec.dictionary)

2193

In [None]:
general2vec.dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f452a02fbe0>

In [None]:
general2vec.df_all_system.head(1)

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '...","['▁production', '▁merge', '▁*', '▁feed', '▁rel...","['▁production', '▁merge', '▁*', '▁feed', '▁rel..."


In [None]:
general2vec.df_all_system.shape #data final tensor

(362, 7)

In [None]:
#tst for libest
path_to_ground_truth = parameters['path_mappings']
general2vec.ground_truth_processing(path_to_ground_truth)

2021-01-26 01:04:36,670 : INFO : generating ground truth


[('210,', 'test/python/third_party/Corona_Report/license_Report.json')]

In [None]:
#tst for sacp <----- Warning!
general2vec.ground_truth_processing(parameters['path_mappings'], from_mappings = True)

2021-01-26 01:04:44,290 : INFO : ground truth from mappings


[('295', 'sacp_python_common/bandit/banditReport.py'),
 ('295', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('295', 'sacp_python_common/gosec/gosec_report.py'),
 ('295', 'sacp_python_common/psb_mapping.py'),
 ('295',
  'sacp_python_common/security_results_push/security_results_push_func.py'),
 ('295', 'sacp_python_common/spotbugs/spotbugs.py'),
 ('295', 'sacp_python_common/third_party/UploadBom.py'),
 ('295', 'sacp_python_common/third_party/binaryScan.py'),
 ('295', 'sacp_python_common/third_party/binary_scan_func.py'),
 ('295',
  'test/python/security_results_push/test_security_results_push_func.py'),
 ('295', 'test/python/third_party/temp/image_list.txt'),
 ('295', 'test/python/third_party/test_BinaryScan.py'),
 ('295', 'test/python/third_party/test_UploadBom.py'),
 ('295', 'test/python/third_party/test_binary_scan_func.py'),
 ('294', 'sacp_python_common/bandit/banditReport.py'),
 ('294', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('294', 'sacp_python_common/go

## 2. Artifacts Similarity with Word2Vec

In [None]:
#export
from collections import Counter
import dit
import math

In [None]:
#export
class Word2VecSeqVect(BasicSequenceVectorization):
    '''Sequence vectorization backed by a pre-trained gensim Word2Vec model.

    For a (source, target) link it computes, on demand, Word Mover's Distance,
    the Soft Cosine Measure and count-based information metrics (minimum shared
    information and a mutual-information family).

    Attributes such as ``params``, ``logging``, ``dictionary``, ``vocab``,
    ``df_source``, ``df_target`` and the ``cos_scipy`` / ``pearson_abs_scipy``
    callables are not defined here; they are presumably provided by
    ``BasicSequenceVectorization`` -- TODO confirm.
    '''

    def __init__(self, params, logging):
        '''Load the trained model and build the term-similarity structures.

        params  -- configuration mapping; must contain 'path_to_trained_model'.
        logging -- logger-like object forwarded to the base class.
        '''
        super().__init__(params, logging)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest 
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        # Maps a metric enum member to the callable that computes it.
        # Every callable takes (sentence_a, sentence_b) and returns a list of values.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi,
            EntropyMetric.MI: self.mutual_info
        }

    def wmd_gensim(self, sentence_a, sentence_b ):
        '''Return [Word Mover's Distance, associated similarity] for two token lists.'''
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        '''Map a distance d to the similarity 1 / (1 + d).'''
        return 1./( 1.+float( dist ) ) #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim

        Returns [1 - similarity, similarity].
        '''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information

        Builds, for every vocabulary token, the minimum of its count in the two
        sentences and returns [entropy, extropy] of the resulting distribution.
        Both values are nan when the sentences share no token.
        '''
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')
        #Minimum Shared Tokens
        token_counts = { token: min(token_counts_1[token],token_counts_2[token]) for token in vocab }

        # Keep the alphabet in dict order: __get_freqs walks the same dict, so
        # outcome i is paired with its own frequency (a set would scramble the pairing).
        alphabet = list(token_counts)
        frequencies = self.__get_freqs(token_counts)
        self.logging.info('frequencies processed')

        if not frequencies:
            # Empty list: nan means that src and target do not share information at all
            entropies = float('nan')
            extropies = float('nan')
            self.logging.info('FREQUENCIES NOT COMPUTED!!!<--------------')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
            self.logging.info('scalar_distribution processed')

            entropies = dit.shannon.entropy( scalar_distribution )
            self.logging.info('entropies processed')

            extropies = dit.other.extropy( scalar_distribution )
            self.logging.info('extropies processed')
        return [entropies,extropies]

    def mutual_info(self, sentence_a, sentence_b):
        """ Computing the manifold of metric of information
        Mutual information 
        Joint Information
        Conditioned Information Loss
        Conditioned Information Noise
        Self-Information

        Returns [entropy_source, entropy_target, joint_entropy,
        mutual_information, loss, noise]. The "joint" distribution is built from
        the per-token sum of both count tables. An empty sentence produces nan
        for its entropy and for every quantity derived from it.
        """
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')

        self.logging.info('vocab #'+ str(len(self.vocab.keys())))

        alphabet_source = list(token_counts_1)
        self.logging.info('alphabet_source #'+ str(len(alphabet_source)) )

        alphabet_target = list(token_counts_2)
        self.logging.info('alphabet_target #'+ str(len(alphabet_target)) )

        self.logging.info('diff src2tgt #'+ str(set(token_counts_1.keys()) - set(token_counts_2.keys())))
        self.logging.info('diff tgt2src #'+ str(set(token_counts_2.keys()) - set(token_counts_1.keys())))

        # Tokens absent from the vocabulary enlarge only one of the two tables.
        assert( len(alphabet_source) ==  len(alphabet_target) )

        #Computing Self-Information (or Entropy)
        entropy_source = self.__entropy(token_counts_1)
        entropy_target = self.__entropy(token_counts_2)

        #Computing Joint-information
        token_counts = { token: (token_counts_1[token] + token_counts_2[token]) for token in vocab }
        self.logging.info('alphabet #'+ str(len(token_counts)))
        joint_entropy = self.__entropy(token_counts)

        #Computing Mutual-Information
        mutual_information = entropy_source + entropy_target - joint_entropy

        #Computing Noise
        noise = joint_entropy - entropy_target

        #Computing Loss
        loss = joint_entropy - entropy_source

        return [entropy_source, entropy_target, joint_entropy, 
                mutual_information, loss, noise]

    def distance(self, metric_list,link):
        '''Iterate on the metrics

        metric_list -- keys of ``dict_distance_dispatcher`` to evaluate, in order.
        link        -- (source_id, target_id) pair.

        Returns one flat list with the values of every metric concatenated in
        ``metric_list`` order (an empty list when ``metric_list`` is empty).
        Raises ValueError for an unsupported preprocessing mode and IndexError
        when an id of the link cannot be found.
        '''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for 
        #the remaining metrics
        ids = self.params['system_path_config']['names'][0]
        txt = self.params['system_path_config']['names'][1]
        prep = self.params['system_path_config']['prep']

        if prep == Preprocessing.conv: #if conventional preprocessing
            parse = lambda text: text.split()
        elif prep == Preprocessing.bpe:
            # NOTE(review): eval executes whatever expression is stored in the
            # data frame. If the column only ever holds list literals,
            # ast.literal_eval would be the safe parser -- confirm before switching.
            parse = eval
        else:
            raise ValueError('Unsupported preprocessing mode: ' + str(prep))

        sentence_a = parse(self.__lookup_text(self.df_source, ids, txt, link[0]))
        sentence_b = parse(self.__lookup_text(self.df_target, ids, txt, link[1]))

        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        self.logging.info("Computed distances or similarities "+ str(link) + str(dist))
        if not dist:
            return [] # reduce() without an initial value fails on an empty sequence
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    def __lookup_text(self, df, ids, txt, key):
        '''Return the text of the row of ``df`` whose ``ids`` column identifies ``key``.

        An exact id match wins, so that '19' is not confused with '219' or '195'.
        Only when no row matches exactly is a literal (non-regex) substring
        match used, keeping partial path look-ups working.
        '''
        rows = df[df[ids].eq(key)]
        if rows.empty:
            rows = df[df[ids].str.contains(key, regex=False)]
        if rows.empty:
            raise IndexError('No artifact found for id ' + repr(key))
        return rows[txt].values[0]

    def __entropy(self, token_counts):
        '''Shannon entropy of a token-count table; nan when the table sums to zero.'''
        frequencies = self.__get_freqs(token_counts)
        if not frequencies:
            return float('nan')
        # Outcomes and frequencies are both taken in dict order, so they stay paired.
        return dit.shannon.entropy( dit.ScalarDistribution(list(token_counts), frequencies) )

    #################################3TODO substitute this block in the future by importing information science module
    def __get_cnts(self, toks, vocab):
        '''@danaderp
        Counts tokens within ONE document

        The counter is seeded with ``vocab`` (presumably token -> 0; TODO confirm)
        so every vocabulary token has an entry; tokens outside the vocabulary
        are added as new entries.
        '''
        cnt = Counter(vocab)
        for tok in toks:
            cnt[tok] += 1
        return cnt

    def __get_freqs(self, dict_token_counts):
        '''Return relative frequencies in dict order, or [] when all counts are zero.'''
        num_tokens = sum( dict_token_counts.values() ) #number of subwords inside the document
        if num_tokens == 0.0:
            frequencies = []
            self.logging.info('---------------> NO SHARED INFORMATION <-------------------------')
        else:
            frequencies = [ (dict_token_counts[token])/num_tokens for token in dict_token_counts ]
        return frequencies
    #################################3


In [None]:
#export
def LoadLinks(timestamp, params, logging, grtruth=False, sep=' ' ):
    '''Read back a links table that was saved as CSV for a given timestamp.

    timestamp -- version tag embedded in the file name.
    params    -- mapping providing 'saving_path', 'system', 'vectorizationType'
                 and 'linkType', which together form the file name.
    logging   -- logger-like object used for progress messages.
    grtruth   -- flag embedded in the file name (ground-truth links or not).
    sep       -- field separator of the CSV file.

    Returns the loaded DataFrame with its "Source" column cast to str.
    '''
    name_pieces = [
        params['saving_path'],
        '[',
        params['system'],
        '-',
        str(params['vectorizationType']),
        '-',
        str(params['linkType']),
        '-',
        str(grtruth),
        '-{}].csv'.format(timestamp),
    ]
    path = ''.join(name_pieces)

    logging.info("Loading computed links from... "+ path)

    links_df = pd.read_csv(path, sep=sep, index_col=0, header=0)
    # Ids are compared as strings elsewhere, so normalise them right after loading.
    links_df["Source"] = links_df["Source"].astype(str)
    logging.info("df_x.dtypes" + str(links_df.dtypes))
    return links_df

### Testing Word2Vec SequenceVectorization

In [None]:
#hide
#tst
# Scratch check of how per-metric results are flattened with functools.reduce.
metric_list = ['a','b']
A = [[1,3,4],[4,5],[1,8,9,7]]
B = ((1,3,4),(4,5),(1,8,9,7))
functools.reduce(lambda a,b : a+b, B)
dist_sim_T = [([12,13],['metric1','metric2']),([12,13],['metric1','metric2'])]
dist_sim_T
# Accumulate into an explicit list: indexing the accumulator itself (a[1] + b[1])
# is only correct for exactly two tuples, because from the third element on the
# accumulator is already a flat list rather than a (values, names) tuple.
separated_merged_list_a = functools.reduce(lambda merged, pair: merged + pair[1], dist_sim_T, [])
separated_merged_list_a

['metric1', 'metric2', 'metric1', 'metric2']

#### [step 1] Creating the Vectorization Class

In [None]:
#[step 1]Creating the Vectorization Class
word2vec = Word2VecSeqVect( params = parameters, logging = logging )

2021-01-26 01:05:04,007 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 01:05:04,091 : INFO : built Dictionary(2193 unique tokens: ['#', '28', '29', '3', '4)']...) from 362 documents (total 205581 corpus positions)
2021-01-26 01:05:04,335 : INFO : Ignored vocab by BPE{'`', '\\', '\r\n\r\n@', '\r\n', 'γ', '```', '\t', '^', '@', '\r\n\r\n', '\r\n\r\n\r\n'}
2021-01-26 01:05:04,339 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus
2021-01-26 01:05:04,342 : INFO : loading Word2Vec object from ../dvc-ds4se//models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model
2021-01-26 01:05:04,568 : INFO : loading wv recursively from ../dvc-ds4se//models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model.wv.* with mmap=None
2021-01-26 01:05:04,572 : INFO : setting ignored attribute vectors_norm to None
2021-01-26 01:05:04,576 : INFO : loading vocabulary recursively from ../dvc-ds4se//models/wv/bpe8k/[word2vec-Java-Py

In [None]:
len(word2vec.new_model.wv.vocab)

7888

In [None]:
word2vec.df_source['ids'][35]

'255'

In [None]:
word2vec.df_source['ids'][35] #In LIBEST REQ starts at 35

'255'

In [None]:
ids = parameters['system_path_config']['names'][0]
txt = parameters['system_path_config']['names'][1]
print(ids,txt)

ids bpe8k


In [None]:
idss = word2vec.df_source[ids][35] #Selecting an ID
idss = word2vec.df_source[ids] == idss #Search for an specific ID
list(word2vec.df_source[idss][txt])[0].split() #Retrieving text and splitting

["['▁upd',",
 "'ate',",
 "'▁cor',",
 "'ona',",
 "'▁bom',",
 "'▁end',",
 "'point',",
 "'▁#',",
 "'25',",
 "'4',",
 "'▁cor',",
 "'ona',",
 "'▁upd',",
 "'ated',",
 "'▁change',",
 "'▁the',",
 "'▁ap',",
 "'i',",
 "'▁to',",
 "'▁get',",
 "'▁the',",
 "'▁bom',",
 "'▁for',",
 "'▁a',",
 "'▁release',",
 "'.',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁is',",
 "'▁',",
 "'`',",
 "'/',",
 "'rele',",
 "'ase',",
 "'/',",
 "':',",
 "'id',",
 "'/',",
 "'aim',",
 "'_',",
 "'b',",
 "'om',",
 "'_',",
 "'re',",
 "'port',",
 "'.',",
 "'j',",
 "'son',",
 "'`',",
 "'.',",
 "'▁changed',",
 "'▁the',",
 "'▁getting',",
 "'▁the',",
 "'▁bom',",
 "'▁to',",
 "'▁use',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁and',",
 "'▁other',",
 "'▁code',",
 "'▁to',",
 "'▁par',",
 "'se',",
 "'▁the',",
 "'▁new',",
 "'▁format',",
 "'.',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁returns',",
 "'▁the',",
 "'▁bom',",
 "'▁format',",
 "'▁differ',",
 "'ently',",
 "'.',",
 "'▁so',",
 "'▁if',",
 "'▁the',",


In [None]:
word2vec.df_source.head(2)

Unnamed: 0,ids,bpe8k
0,295,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '..."
1,294,"['▁add', '▁test', '▁fields', '▁for', '▁d', 'ar..."


In [None]:
word2vec.df_target.head(2)

Unnamed: 0,ids,bpe8k
1,sacp-python-common/sacp_python_common/auth_uti...,"['▁""', '""', '""', '\r\n', 'c', 're', 'ated', '▁..."
3,sacp-python-common/sacp_python_common/bandit/b...,"['▁#', '!', '/', 'us', 'r', '/', 'b', 'in', '/..."


In [None]:
links = word2vec.samplingLinks(sampling=True, samples = 2)
links

[('19', 'sacp-python-common/sacp_python_common/cave/caveSsl.py'),
 ('176', 'sacp-python-common/sacp_python_common/third_party/tpsd_triage.py')]

In [None]:
print( len(links), word2vec.df_source.shape, word2vec.df_target.shape )

2 (288, 2) (74, 2)


In [None]:
links[0][0]

'19'

In [None]:
#tst conventioanal
word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0].split() #conventioanal
#eval(word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0]) #BPE

In [None]:
#tst bpe
eval(word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0]) #BPE

['▁build', '▁contain', 'ers', '▁on', '▁prod', '▁mer', 'ge']

In [None]:
#tst
word2vec.df_target[word2vec.df_target[ids].str.contains(links[0][1])][txt].values[0].split()

["['▁#',",
 "'!',",
 "'/',",
 "'us',",
 "'r',",
 "'/',",
 "'b',",
 "'in',",
 "'/',",
 "'en',",
 "'v',",
 "'▁py',",
 "'th',",
 "'on',",
 "'3',",
 "'\\r\\n',",
 "'im',",
 "'port',",
 "'▁j',",
 "'son',",
 "'\\r\\n',",
 "'im',",
 "'port',",
 "'▁os',",
 "'.',",
 "'p',",
 "'ath',",
 "'\\r\\n',",
 "'from',",
 "'▁date',",
 "'time',",
 "'▁import',",
 "'▁date',",
 "'time',",
 "'\\r\\n\\r\\n\\r\\n',",
 "'def',",
 "'▁process',",
 "'_',",
 "'c',",
 "'ave',",
 "'_',",
 "'ss',",
 "'l',",
 "'(',",
 "'ss',",
 "'l',",
 "'_',",
 "'d',",
 "'ir',",
 "'):',",
 "'\\r\\n',",
 "'▁s',",
 "'sl',",
 "'_',",
 "'res',",
 "'ult',",
 "'s',",
 "'_',",
 "'p',",
 "'ath',",
 "'▁=',",
 "'▁os',",
 "'.',",
 "'p',",
 "'ath',",
 "'.',",
 "'jo',",
 "'in',",
 "'(',",
 "'ss',",
 "'l',",
 "'_',",
 "'d',",
 "'ir',",
 "',',",
 '\'▁"\',',
 "'c',",
 "'ave',",
 "'_',",
 "'ss',",
 "'l',",
 "'_',",
 "'res',",
 "'ult',",
 "'s',",
 "'.',",
 "'j',",
 "'son',",
 '\'")\',',
 "'\\r\\n\\r\\n',",
 "'▁#',",
 "'▁*',",
 "'*',",
 "'*',",
 "'to',",


In [None]:
metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I,EntropyMetric.MI]
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]

In [None]:
#[optional] computeDistanceMetric Testing [WARNING!] Time Consuming!!
computeDistanceMetric = word2vec.computeDistanceMetric(links, metric_list = metric_list )
computeDistanceMetric

2021-01-26 01:07:19,704 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 01:07:19,712 : INFO : built Dictionary(132 unique tokens: ['ers', 'ge', '▁build', '▁contain', '▁mer']...) from 2 documents (total 519 corpus positions)
2021-01-26 01:07:19,744 : INFO : token count processed
2021-01-26 01:07:19,752 : INFO : ---------------> NO SHARED INFORMATION <-------------------------
2021-01-26 01:07:19,755 : INFO : frequencies processed
2021-01-26 01:07:19,758 : INFO : FREQUENCIES NOT COMPUTED!!!<--------------
2021-01-26 01:07:19,762 : INFO : token count processed
2021-01-26 01:07:19,765 : INFO : vocab #8011
2021-01-26 01:07:19,769 : INFO : alphabet_source #8011
2021-01-26 01:07:19,772 : INFO : alphabet_target #8011
2021-01-26 01:07:19,776 : INFO : diff src2tgt #set()
2021-01-26 01:07:19,780 : INFO : diff tgt2src #set()
2021-01-26 01:07:21,400 : INFO : alphabet #8011
2021-01-26 01:07:22,207 : INFO : Computed distances or similarities ('19', 'sacp-python-common/sacp_p

([['19',
   'sacp-python-common/sacp_python_common/cave/caveSsl.py',
   1.285062188933183,
   0.4376248510185474,
   0.9664333909749985,
   0.03356661,
   nan,
   nan,
   2.807354922057604,
   5.941919007331087,
   6.002755529424386,
   2.7465183999643052,
   3.1954006073667816,
   0.060836522093298484],
  ['176',
   'sacp-python-common/sacp_python_common/third_party/tpsd_triage.py',
   1.250651290112464,
   0.44431583177420186,
   0.9243483766913414,
   0.07565162,
   0.0,
   0.0,
   2.321928094887362,
   6.517838464869326,
   6.547472907987606,
   2.292293651769082,
   4.225544813100244,
   0.029634443118279563]],
 [<DistanceMetric.WMD: 1>,
  <SimilarityMetric.WMD_sim: 1>,
  <DistanceMetric.SCM: 3>,
  <SimilarityMetric.SCM_sim: 3>,
  <EntropyMetric.MSI_I: 1>,
  <EntropyMetric.MSI_X: 2>,
  <EntropyMetric.Entropy_src: 7>,
  <EntropyMetric.Entropy_tgt: 8>,
  <EntropyMetric.JI: 4>,
  <EntropyMetric.MI: 3>,
  <EntropyMetric.Loss: 5>,
  <EntropyMetric.Noise: 6>])

#### [step 2] Non-Ground-Truth Computation

In [None]:
word2vec.ComputeDistanceArtifacts( sampling=True, samples = 20, metric_list = metric_list )
word2vec.df_nonground_link.head()

2021-01-26 01:07:59,596 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 01:07:59,600 : INFO : built Dictionary(225 unique tokens: ['"', '/', '_', 'are', 'ate']...) from 2 documents (total 1159 corpus positions)
2021-01-26 01:07:59,779 : INFO : token count processed
2021-01-26 01:07:59,789 : INFO : frequencies processed
2021-01-26 01:08:00,628 : INFO : scalar_distribution processed
2021-01-26 01:08:00,629 : INFO : entropies processed
2021-01-26 01:08:00,630 : INFO : extropies processed
2021-01-26 01:08:00,632 : INFO : token count processed
2021-01-26 01:08:00,633 : INFO : vocab #8011
2021-01-26 01:08:00,635 : INFO : alphabet_source #8011
2021-01-26 01:08:00,636 : INFO : alphabet_target #8011
2021-01-26 01:08:00,638 : INFO : diff src2tgt #set()
2021-01-26 01:08:00,640 : INFO : diff tgt2src #set()
2021-01-26 01:08:02,269 : INFO : alphabet #8011
2021-01-26 01:08:03,096 : INFO : Computed distances or similarities ('187', 'sacp-python-common/test/python/harden_check

2021-01-26 01:08:21,307 : INFO : diff src2tgt #set()
2021-01-26 01:08:21,309 : INFO : diff tgt2src #set()
2021-01-26 01:08:23,010 : INFO : alphabet #8011
2021-01-26 01:08:23,795 : INFO : Computed distances or similarities ('217', 'sacp-python-common/test/python/spotbugs/test_spotbugsdisplay.py')[[1.0291463690538367, 0.4928180713086195], [0.622818112373352, 0.3771819], [3.2841837197791888, 1.3555518397614796], [5.2280188408047845, 6.443188759664073, 6.681405136698236, 4.989802463770622, 1.4533862958934511, 0.23821637703416254]]
2021-01-26 01:08:23,803 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 01:08:23,806 : INFO : built Dictionary(343 unique tokens: ['ass', 'igned', 'isc', 'r', '▁fix']...) from 2 documents (total 3788 corpus positions)
2021-01-26 01:08:23,908 : INFO : token count processed
2021-01-26 01:08:23,924 : INFO : frequencies processed
2021-01-26 01:08:24,743 : INFO : scalar_distribution processed
2021-01-26 01:08:24,744 : INFO : entropies process

2021-01-26 01:08:45,969 : INFO : built Dictionary(514 unique tokens: ['en', 'er', 'f', 'i', 'ile']...) from 2 documents (total 8349 corpus positions)
2021-01-26 01:08:46,221 : INFO : token count processed
2021-01-26 01:08:46,232 : INFO : frequencies processed
2021-01-26 01:08:47,061 : INFO : scalar_distribution processed
2021-01-26 01:08:47,062 : INFO : entropies processed
2021-01-26 01:08:47,063 : INFO : extropies processed
2021-01-26 01:08:47,068 : INFO : token count processed
2021-01-26 01:08:47,069 : INFO : vocab #8011
2021-01-26 01:08:47,070 : INFO : alphabet_source #8011
2021-01-26 01:08:47,072 : INFO : alphabet_target #8011
2021-01-26 01:08:47,074 : INFO : diff src2tgt #set()
2021-01-26 01:08:47,076 : INFO : diff tgt2src #set()
2021-01-26 01:08:48,660 : INFO : alphabet #8011
2021-01-26 01:08:49,453 : INFO : Computed distances or similarities ('258', 'sacp-python-common/sacp_python_common/third_party/HubRestApi.py')[[1.1958587230873916, 0.455402703956288], [0.8090700656175613, 0.

2021-01-26 01:09:08,346 : INFO : alphabet #8011
2021-01-26 01:09:09,140 : INFO : Computed distances or similarities ('201', 'sacp-python-common/test/python/third_party/test_blackduck.py')[[1.0363861178803122, 0.49106600718772664], [0.530375599861145, 0.4696244], [3.642026690518651, 1.371660349371635], [5.148595345844925, 6.509540793861948, 6.648598926877863, 5.009537212829009, 1.5000035810329377, 0.13905813301591508]]
2021-01-26 01:09:09,142 : INFO : Non-groundtruth links computed


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,187,sacp-python-common/test/python/harden_check/te...,1.044296,0.489166,0.560954,0.439046,3.787144,1.382694,4.814587,6.734422,6.824643,4.724367,2.010056,0.09022
1,34,sacp-python-common/test/python/third_party/tes...,1.170493,0.460725,0.76935,0.23065,2.921928,1.335902,4.6875,6.409538,6.482462,4.614577,1.794962,0.072923
2,255,sacp-python-common/test/python/csbcicdReport/t...,1.048748,0.488103,0.694367,0.305633,4.218172,1.373094,5.677007,6.51554,6.649447,5.5431,0.97244,0.133907
3,153,sacp-python-common/test/python/csbcicdReport/t...,1.235804,0.447266,0.909693,0.090307,1.584963,1.169925,3.0,6.51554,6.52751,2.98803,3.52751,0.01197
4,130,sacp-python-common/sacp_python_common/harden_c...,1.267184,0.441076,0.952635,0.047365,1.0,1.0,2.0,6.941677,6.949426,1.992251,4.949426,0.007749


In [None]:
word2vec.df_nonground_link.head()

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,187,sacp-python-common/test/python/harden_check/te...,1.044296,0.489166,0.560954,0.439046,3.787144,1.382694,4.814587,6.734422,6.824643,4.724367,2.010056,0.09022
1,34,sacp-python-common/test/python/third_party/tes...,1.170493,0.460725,0.76935,0.23065,2.921928,1.335902,4.6875,6.409538,6.482462,4.614577,1.794962,0.072923
2,255,sacp-python-common/test/python/csbcicdReport/t...,1.048748,0.488103,0.694367,0.305633,4.218172,1.373094,5.677007,6.51554,6.649447,5.5431,0.97244,0.133907
3,153,sacp-python-common/test/python/csbcicdReport/t...,1.235804,0.447266,0.909693,0.090307,1.584963,1.169925,3.0,6.51554,6.52751,2.98803,3.52751,0.01197
4,130,sacp-python-common/sacp_python_common/harden_c...,1.267184,0.441076,0.952635,0.047365,1.0,1.0,2.0,6.941677,6.949426,1.992251,4.949426,0.007749


In [None]:
word2vec.df_nonground_link['Source'].values

'187'

In [None]:
#tst 
#df_mapping = pd.read_csv(parameters['path_mappings'], header = 0, sep = ',')
# Pass a real boolean: the string 'True' only works because any non-empty string
# is truthy (so 'False' would switch the mappings mode on as well).
ground_links = word2vec.ground_truth_processing(from_mappings=True) #<---- SACP
#ground_links = word2vec.ground_truth_processing(path_to_ground_truth) #<---- LIBEST
ground_links

2021-01-26 01:10:15,592 : INFO : ground truth from mappings


[('295', 'sacp_python_common/bandit/banditReport.py'),
 ('295', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('295', 'sacp_python_common/gosec/gosec_report.py'),
 ('295', 'sacp_python_common/psb_mapping.py'),
 ('295',
  'sacp_python_common/security_results_push/security_results_push_func.py'),
 ('295', 'sacp_python_common/spotbugs/spotbugs.py'),
 ('295', 'sacp_python_common/third_party/UploadBom.py'),
 ('295', 'sacp_python_common/third_party/binaryScan.py'),
 ('295', 'sacp_python_common/third_party/binary_scan_func.py'),
 ('295',
  'test/python/security_results_push/test_security_results_push_func.py'),
 ('295', 'test/python/third_party/temp/image_list.txt'),
 ('295', 'test/python/third_party/test_BinaryScan.py'),
 ('295', 'test/python/third_party/test_UploadBom.py'),
 ('295', 'test/python/third_party/test_binary_scan_func.py'),
 ('294', 'sacp_python_common/bandit/banditReport.py'),
 ('294', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('294', 'sacp_python_common/go

In [None]:
len(ground_links)

2672

In [None]:
#tst from non-links file
df_x = LoadLinks(timestamp=1610985085.620692, params=parameters,logging=logging)
df_x.head(1)

2021-01-26 01:12:03,317 : INFO : Loading computed links from... ../dvc-ds4se/metrics/traceability/experiments1.0.x/[sacp-python-common-VectorizationType.word2vec-LinkType.issue2src-False-1610985085.620692].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.181488,0.458403,0.75325,0.24675,3.022055,1.335963,4.715814,6.895876,6.97673,4.63496,2.260916,0.080854


In [None]:
df_x["Source"] = df_x.Source.astype(str)

In [None]:
df_x.dtypes

Source                        object
Target                        object
DistanceMetric.WMD           float64
SimilarityMetric.WMD_sim     float64
DistanceMetric.SCM           float64
SimilarityMetric.SCM_sim     float64
EntropyMetric.MSI_I          float64
EntropyMetric.MSI_X          float64
EntropyMetric.Entropy_src    float64
EntropyMetric.Entropy_tgt    float64
EntropyMetric.JI             float64
EntropyMetric.MI             float64
EntropyMetric.Loss           float64
EntropyMetric.Noise          float64
dtype: object

In [None]:
word2vec.df_ground_link = df_x.copy()
word2vec.df_nonground_link = df_x.copy()

In [None]:
word2vec.df_nonground_link.size

298368

In [None]:
#tst
df_x = word2vec.df_nonground_link
df_x.head()

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.181488,0.458403,0.75325,0.24675,3.022055,1.335963,4.715814,6.895876,6.97673,4.63496,2.260916,0.080854
1,295,sacp-python-common/sacp_python_common/bandit/b...,1.211196,0.452244,0.770373,0.229627,3.084963,1.348006,4.715814,7.04681,7.10597,4.656654,2.390156,0.059159
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.212716,0.451933,0.790702,0.209298,2.855389,1.325019,4.715814,6.426571,6.493124,4.649261,1.77731,0.066553
3,295,sacp-python-common/sacp_python_common/cave/cav...,1.186251,0.457404,0.783486,0.216514,2.725481,1.31922,4.715814,6.09807,6.263189,4.550695,1.547375,0.165119
4,295,sacp-python-common/sacp_python_common/cave/cav...,1.182583,0.458173,0.776,0.224,2.584963,1.315172,4.715814,5.941919,6.18424,4.473493,1.468426,0.242321


In [None]:
df_x['Source'].values[1]

'295'

In [None]:
df_x['Target'].values[1]

'sacp-python-common/sacp_python_common/bandit/bandit.py'

In [None]:
#tst
test_source = 'UC23E2.txt' #'UC10.TXT'
test_target = '/WebRoot/util/getUser.jsp'#'RicercaStandard.java'

In [None]:
df_tempt = df_x[ df_x["Target"].str.contains('hardenPostCheck.py', regex=False) == True]

In [None]:
df_tempt.values

array([['295',
        'sacp-python-common/sacp_python_common/harden_check/hardenPostCheck.py',
        1.1470317253089668, ..., 4.647072161983568, 2.294605292554234,
        0.06874167644294182],
       ['294',
        'sacp-python-common/sacp_python_common/harden_check/hardenPostCheck.py',
        0.7819205464154058, ..., 6.6256279744239555, 0.31604948011384426,
        0.3439879973191555],
       ['293',
        'sacp-python-common/sacp_python_common/harden_check/hardenPostCheck.py',
        1.0635950848408204, ..., 6.246115041643853, 0.6955624128939464,
        0.2985984635755212],
       ...,
       ['5',
        'sacp-python-common/sacp_python_common/harden_check/hardenPostCheck.py',
        1.1470317253089668, ..., 4.647072161983568, 2.294605292554234,
        0.06874167644294182],
       ['1',
        'sacp-python-common/sacp_python_common/harden_check/hardenPostCheck.py',
        1.0331418113692776, ..., 5.952211171876483, 0.9894662826613184,
        0.2376879861135617],
     

In [None]:
df_x[ df_x["Source"].str.contains('UC23E2.txt', regex=False) == True]

In [None]:
df_x["Source"][1]

In [None]:
#TST for SACP
ground_links[0][0]

'295'

In [None]:
#TST for SACP
ground_links[0][1]

'sacp_python_common/bandit/banditReport.py'

In [None]:
#TST for SACP
test_source = '295' #'UC10.TXT'
test_target = 'sacp_python_common/bandit/banditReport.py'

In [None]:
#TST for SACP
df_x[( df_x["Source"].eq(test_source))& 
                 (df_x["Target"].str.contains( test_target, regex=False))]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.212716,0.451933,0.790702,0.209298,2.855389,1.325019,4.715814,6.426571,6.493124,4.649261,1.77731,0.066553


In [None]:
df_x["Target"].values

array(['sacp-python-common/sacp_python_common/auth_utility.py',
       'sacp-python-common/sacp_python_common/bandit/bandit.py',
       'sacp-python-common/sacp_python_common/bandit/banditReport.py',
       ...,
       'sacp-python-common/test/python/third_party/test_UploadBom.py',
       'sacp-python-common/test/python/third_party/unused/test_bd_image.py',
       'sacp-python-common/test/python/third_party/unused/test_ipc.py'],
      dtype=object)

In [None]:
df_x["Target"].str.contains('hardenPostCheck.py',regex=False)

0        False
1        False
2        False
3        False
4        False
         ...  
21307    False
21308    False
21309    False
21310    False
21311    False
Name: Target, Length: 21312, dtype: bool

In [None]:
#tst
df_x[( df_x["Source"].str.contains(ground_links[0][0]) ) & (df_x["Target"].str.contains(ground_links[0][1], regex=False))]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.212716,0.451933,0.790702,0.209298,2.855389,1.325019,4.715814,6.426571,6.493124,4.649261,1.77731,0.066553


In [None]:
def find_index_gt( g_tuple, from_mappings=False, semeru_format=False):
    '''Return the index values of the rows of `df_x` that match a ground-truth link.

    Parameters
    ----------
    g_tuple : sequence
        Ground-truth pair; `g_tuple[0]` is the source key and `g_tuple[1]` the target key.
    from_mappings : bool
        SACP format. `.eq` is used for Source because it must match the exact id
        (a substring test would also match longer numbers containing it); for the
        Target a literal substring match is used.
    semeru_format : bool
        LibEST format. Both keys are matched as literal substrings, with '/'
        prepended to each key to avoid matching more than one path.

    When neither flag is set, the columns named in `parameters['names']` are matched
    against each key truncated at its first '.'.

    Returns
    -------
    numpy.ndarray
        Index labels of the matching rows (empty when nothing matches). Links without
        any match are also printed so they can be inspected.
    '''

    if from_mappings: #SACP Format
        dist = df_x.loc[(df_x["Source"].eq(g_tuple[0]) ) &
                        (df_x["Target"].str.contains(g_tuple[1], regex=False))]
    elif semeru_format: #LibEST Format
        dist = df_x.loc[(df_x["Source"].str.contains('/' + g_tuple[0], regex=False) ) &
                        (df_x["Target"].str.contains('/' + g_tuple[1], regex=False))]
    else: #By Default use Semeru Format
        # NOTE(review): these patterns are interpreted as regular expressions (pandas default),
        # and a key without '.' makes find() return -1, which drops its last character.
        # Confirm that the keys always carry a file extension and no regex metacharacters.
        dist = df_x[df_x[parameters['names'][0]].str.contains( g_tuple[0][:g_tuple[0].find('.')] + '-' )
                    & df_x[parameters['names'][1]].str.contains(g_tuple[1][:g_tuple[1].find('.')]) ]

    matched = dist.index.values
    # Test emptiness explicitly: the truth value of a NumPy array is ambiguous
    # (raises for several matches, is False for a single match at index 0).
    if matched.size == 0:
        print(matched, g_tuple)
    return matched

In [None]:
# Find matching for point-based groundtruth itrust/smos
def find_index_gt_sacp( tuple_g ):
    """Return the index values of the `df_x` rows matching a (source, target) pair.

    The Source column must equal `tuple_g[0]` exactly; the Target column only needs
    to contain `tuple_g[1]` as a literal (non-regex) substring.
    """
    source_is_equal = df_x["Source"].eq(tuple_g[0])
    target_contains = df_x["Target"].str.contains(tuple_g[1], regex=False)
    matching_rows = df_x.loc[source_is_equal & target_contains]
    return matching_rows.index.values

In [None]:
# Ground-truth matching. Alternative invocations kept for reference:
#matchGT = [ find_index_gt( g) for g in word2vec.ground_truth_processing(path_to_ground_truth)]
#matchGT = [ find_index_gt_point( g) for g in word2vec.ground_truth_processing(path_to_ground_truth)]
# Active variant: ground truth read from mappings, matched with the SACP rules of find_index_gt
matchGT = [
    find_index_gt(gt_link, from_mappings=True, semeru_format=False)
    for gt_link in word2vec.ground_truth_processing(path_to_ground_truth, True)
]
matchGT

2021-01-26 01:49:44,391 : INFO : ground truth from mappings


[] ('295', 'test/python/third_party/temp/image_list.txt')
[] ('293', 'test/python/third_party/temp/image_list.txt')
[] ('274', 'Jenkinsfile')
[] ('289', 'sacp_python_common/spotbugs/binary/spotbugs-3.1.3.tgz')
[] ('289', 'sacp_python_common/template/aggregator.jinja.html')
[] ('291', 'sacp_python_common/spotbugs/binary/spotbugs-3.1.3.tgz')
[] ('292', 'sacp_python_common/spotbugs/binary/spotbugs-3.1.3.tgz')
[] ('290', 'sacp_python_common/spotbugs/binary/spotbugs-3.1.3.tgz')
[] ('290', 'sacp_python_common/template/aggregator.jinja.html')
[] ('278', 'sacp_python_common/spotbugs/binary/spotbugs-3.1.3.tgz')
[] ('278', 'sacp_python_common/spotbugs/binary/spotbugs-4.0.3.tgz')
[] ('276', 'Jenkinsfile')
[] ('272', 'Jenkinsfile')
[] ('271', 'README.md')
[] ('268', 'Jenkinsfile')
[] ('268', 'test/python/gosec/test_data/Gosec_Scan/scan_url.json')
[] ('267', 'test/python/gosec/test_data/Gosec_Scan/scan_url.json')
[] ('264', 'Jenkinsfile')
[] ('263', 'Jenkinsfile')
[] ('172', 'sacp_python_common/bin

[] ('210', 'test/python/third_party/BDImage_Report/BlackDuck_CVE_Components.json')
[] ('210', 'test/python/third_party/BDImage_Report/BlackDuck_Risk_Profile.json')
[] ('210', 'test/python/third_party/BDImage_Report/bd_sample_response.json')
[] ('210', 'test/python/third_party/BDImage_Report/blackduck_expected_output.json')
[] ('210', 'test/python/third_party/BDImage_Report/scan_url.json')
[] ('210', 'test/python/third_party/BlackDuck_Report/BlackDuck_BoM.json')
[] ('210', 'test/python/third_party/BlackDuck_Report/BlackDuck_CVE_Components.json')
[] ('210', 'test/python/third_party/BlackDuck_Report/BlackDuck_Risk_Profile.json')
[] ('210', 'test/python/third_party/BlackDuck_Report/blackduck_expected_output.json')
[] ('210', 'test/python/third_party/Corona_Report/BoM_Report.json')
[] ('210', 'test/python/third_party/Corona_Report/CVE_Report.json')
[] ('210', 'test/python/third_party/Corona_Report/bd_bom.json')
[] ('210', 'test/python/third_party/Corona_Report/bd_cve.json')
[] ('210', 'test

[] ('202', 'sacp_python_common/bandit/__init__.py')
[] ('202', 'sacp_python_common/banditdisplay.py')
[] ('202', 'sacp_python_common/binary_scan_func.py')
[] ('202', 'sacp_python_common/bom.jinja.html')
[] ('202', 'sacp_python_common/bom_bd.jinja.html')
[] ('202', 'sacp_python_common/cave/__init__.py')
[] ('202', 'sacp_python_common/bom_bd_image.jinja.html')
[] ('202', 'sacp_python_common/csbcicd_func.py')
[] ('202', 'sacp_python_common/cve.jinja.html')
[] ('202', 'sacp_python_common/cve_bd.jinja.html')
[] ('202', 'sacp_python_common/cve_bd_image.jinja.html')
[] ('202', 'sacp_python_common/gosec/__init__.py')
[] ('202', 'sacp_python_common/ipc.jinja.html')
[] ('202', 'sacp_python_common/security_results_push/__init__.py')
[] ('202', 'sacp_python_common/spotbugs.jinja.html')
[] ('202', 'sacp_python_common/template/bandit.jinja.html')
[] ('202', 'sacp_python_common/template/gosec.jinja.html')
[] ('202', 'sacp_python_common/triage.jinja.html')
[] ('202', 'test/python/bandit/__init__.py')


[] ('197', 'test/python/csbcicdReport/static/Spotbugs/sample_buginstance.json')
[] ('197', 'test/python/csbcicdReport/static/Spotbugs/sample_report.json')
[] ('197', 'test/python/csbcicdReport/static/Spotbugs/spotbugs_expected_output.json')
[] ('197', 'test/python/csbcicdReport/static/Spotbugs/spotbugs_sample.xml')
[] ('197', 'test/python/csbcicdReport/static/bandit/BanditReport.json')
[] ('197', 'test/python/csbcicdReport/static/bandit/banditResult.json')
[] ('197', 'test/python/csbcicdReport/static/bandit/banditResult2.json')
[] ('197', 'test/python/csbcicdReport/static/bandit/bandit_expected_output.json')
[] ('197', 'test/python/csbcicdReport/static/bandit/results.json')
[] ('197', 'test/python/test_bandit.py')
[] ('197', 'test/python/test_security_results_push_func.py')
[] ('194', '.gitignore')
[] ('194', '.pre-commit-config.yaml')
[] ('194', 'Jenkinsfile')
[] ('194', 'MANIFEST.in')
[] ('194', 'Pipfile')
[] ('194', 'Pipfile.lock')
[] ('194', 'README.md')
[] ('194', 'pom.xml')
[] ('

[] ('191', 'spotbugs.html')
[] ('191', 'test/python/csbcicdReport/__init__.py')
[] ('191', 'test/python/csbcicdReport/hardening/report_B.json')
[] ('191', 'test/python/harden_check/__init__.py')
[] ('191', 'test/python/harden_check/c9/json/empty_ip.json')
[] ('191', 'test/python/harden_check/c9/json/ip_address.json')
[] ('191', 'test/python/harden_check/hardening/report_B.json')
[] ('191', 'test/python/harden_check/report_B.json')
[] ('191', 'test/python/test_static_scan_builder.py')
[] ('191', 'test/python/test_static_scan_func.py')
[] ('191', 'test/python/testfiles/final_report.json')
[] ('191', 'test/python/testfiles/hardening/hardening/sysconfig.yaml')
[] ('191', 'test/python/testfiles/hardening/hardening/tmp/reports/json/ip_address.json')
[] ('191', 'test/python/testfiles/hardening/report_B.json')
[] ('184', 'test/python/test_gosec_display.py')
[] ('184', 'test/python/testfiles/Gosec_Scan/clean_results.json')
[] ('184', 'test/python/testfiles/Gosec_Scan/extra_large_results.json')


[] ('176', 'sacp_python_common/docker-bench-security/Dockerfile')
[] ('176', 'sacp_python_common/docker-bench-security/LICENSE.md')
[] ('176', 'sacp_python_common/docker-bench-security/MAINTAINERS')
[] ('176', 'sacp_python_common/docker-bench-security/README.md')
[] ('176', 'sacp_python_common/docker-bench-security/benchmark_log.png')
[] ('176', 'sacp_python_common/docker-bench-security/distros/Dockerfile.alpine')
[] ('176', 'sacp_python_common/docker-bench-security/distros/Dockerfile.centos')
[] ('176', 'sacp_python_common/docker-bench-security/distros/Dockerfile.debian')
[] ('176', 'sacp_python_common/docker-bench-security/distros/Dockerfile.openSUSE')
[] ('176', 'sacp_python_common/docker-bench-security/distros/Dockerfile.rhel')
[] ('176', 'sacp_python_common/docker-bench-security/distros/README.md')
[] ('176', 'sacp_python_common/docker-bench-security/docker-bench-security.sh')
[] ('176', 'sacp_python_common/docker-bench-security/docker-compose.yml')
[] ('176', 'sacp_python_common/

[] ('162', 'sacp_python_common/binaryScan.py')
[] ('162', 'sacp_python_common/binary_scan_func.py')
[] ('162', 'sacp_python_common/custom_scan.py')
[] ('162', 'sacp_python_common/gosec.py')
[] ('162', 'sacp_python_common/spotbugs.py')
[] ('162', 'sacp_python_common/static_scan_builder.py')
[] ('162', 'sacp_python_common/static_scan_func.py')
[] ('162', 'test/python/test_bd_report.py')
[] ('159', 'sacp_python_common/UploadBom.py')
[] ('159', 'sacp_python_common/binary_scan_func.py')
[] ('159', 'sacp_python_common/spotbugs.py')
[] ('159', 'test/python/test_HardenPostCheck.py')
[] ('159', 'test/python/test_UploadBom.py')
[] ('161', 'sacp_python_common/HubRestApi.py')
[] ('161', 'sacp_python_common/aggregator.jinja.html')
[] ('161', 'sacp_python_common/aggregator.py')
[] ('161', 'sacp_python_common/analytics.py')
[] ('161', 'sacp_python_common/analytics_func.py')
[] ('161', 'sacp_python_common/bandit.py')
[] ('161', 'sacp_python_common/bd_image.py')
[] ('161', 'sacp_python_common/bd_report

[] ('158', 'sacp_python_common/hardenCheck.py')
[] ('158', 'sacp_python_common/hardenPostCheck.py')
[] ('158', 'sacp_python_common/hardenReport.jinja.html')
[] ('158', 'sacp_python_common/hardenReport.py')
[] ('158', 'sacp_python_common/ipCentralScan.py')
[] ('158', 'sacp_python_common/jira.py')
[] ('158', 'sacp_python_common/jira_func.py')
[] ('158', 'sacp_python_common/kinesis_stream.py')
[] ('158', 'sacp_python_common/kinesis_stream_func.py')
[] ('158', 'sacp_python_common/norad_report.py')
[] ('158', 'sacp_python_common/norad_scan_builder.py')
[] ('158', 'sacp_python_common/report.py')
[] ('158', 'sacp_python_common/run_ipcentral_automation.py')
[] ('158', 'sacp_python_common/spotbugs.py')
[] ('158', 'test/python/test_HardenPostCheck.py')
[] ('158', 'test/python/test_analytics.py')
[] ('158', 'test/python/testfiles/hardening/hardening/report_B.json')
[] ('158', 'test/python/testfiles/hardening/hardening/tmp/report.json')
[] ('158', 'test/python/testfiles/hardening/report_B.json')
[

[] ('93', 'test/python/test_ipc.py')
[] ('93', 'tox.ini')
[] ('147', 'sacp_python_common/bandit.py')
[] ('147', 'sacp_python_common/bd_image.py')
[] ('147', 'sacp_python_common/binaryScan.py')
[] ('147', 'sacp_python_common/blackduck.py')
[] ('147', 'sacp_python_common/csbcicdReport.py')
[] ('147', 'sacp_python_common/customScan.py')
[] ('147', 'sacp_python_common/hardenCheck.py')
[] ('147', 'sacp_python_common/hardenPostCheck.py')
[] ('147', 'sacp_python_common/ipCentralScan.py')
[] ('147', 'sacp_python_common/spotbugs.py')
[] ('143', 'sacp_python_common/HubRestApi.py')
[] ('143', 'sacp_python_common/bd_image.py')
[] ('143', 'sacp_python_common/bd_image_script.sh')
[] ('143', 'sacp_python_common/bd_openSourceScan.sh')
[] ('143', 'sacp_python_common/bd_upload_script.sh')
[] ('143', 'sacp_python_common/binary_scan_func.py')
[] ('143', 'test/python/test_bd_image.py')
[] ('144', 'sacp_python_common/bandit.py')
[] ('144', 'sacp_python_common/bd_image.py')
[] ('144', 'sacp_python_common/bin

[] ('130', 'sacp_python_common/ipc.py')
[] ('130', 'sacp_python_common/ipc2.py')
[] ('130', 'sacp_python_common/process_bandit.py')
[] ('130', 'sacp_python_common/process_cave_ca_validation.py')
[] ('130', 'sacp_python_common/spotbugs.py')
[] ('130', 'sacp_python_common/triage.jinja.html')
[] ('130', 'sacp_python_common/triage.py')
[] ('130', 'test/python/test_HardenCheck.py')
[] ('130', 'test/python/test_bd_image.py')
[] ('130', 'test/python/test_bd_report.py')
[] ('130', 'test/python/test_blackduck.py')
[] ('130', 'test/python/test_bom.py')
[] ('130', 'test/python/test_bom_bd.py')
[] ('130', 'test/python/test_bom_bd_image.py')
[] ('130', 'test/python/test_csbcicdReport.py')
[] ('130', 'test/python/test_cve.py')
[] ('130', 'test/python/test_cve_bd.py')
[] ('130', 'test/python/test_cve_bd_image.py')
[] ('130', 'test/python/test_ipc.py')
[] ('130', 'test/python/test_ipc2.py')
[] ('130', 'test/python/test_spotbugs.py')
[] ('130', 'test/python/testfiles/Cave/CAVE-HOST/CAVE_VALIDATIONS_rar

[] ('104', 'test/python/testfiles/final_report.json')
[] ('125', 'sacp_python_common/csbcicd_func.py')
[] ('125', 'sacp_python_common/process_cave_ca_validation.py')
[] ('125', 'test/python/testfiles/Cave/CAVE-HOST/CAVE_VALIDATIONS_raresults.json')
[] ('124', 'sacp_python_common/csbcicd_func.py')
[] ('124', 'sacp_python_common/process_cave_ca_validation.py')
[] ('124', 'test/python/testfiles/Cave/CAVE-HOST/CAVE_VALIDATIONS_raresults.json')
[] ('63', 'sacp_python_common/aggregator.jinja.html')
[] ('121', 'sacp_python_common/bd_report.py')
[] ('121', 'sacp_python_common/binaryScan.py')
[] ('121', 'sacp_python_common/binary_scan_func.py')
[] ('121', 'sacp_python_common/bom.py')
[] ('121', 'sacp_python_common/bom_bd.py')
[] ('121', 'sacp_python_common/bom_bd_image.py')
[] ('121', 'sacp_python_common/cve.py')
[] ('121', 'sacp_python_common/cve_bd.py')
[] ('121', 'sacp_python_common/cve_bd_image.py')
[] ('121', 'sacp_python_common/ipc.py')
[] ('121', 'sacp_python_common/ipc2.py')
[] ('121', 

[] ('97', 'test/python/testfiles/CAVE-SSL/cave_ssl_results.json')
[] ('97', 'test/python/testfiles/CAVE-SSL/cavessl_expected_output.json')
[] ('97', 'test/python/testfiles/Cave/CAVE-HOST/CAVE_HOST_VALIDATIONS_raresults.json')
[] ('97', 'test/python/testfiles/Cave/CAVE-HOST/asset_information.json')
[] ('97', 'test/python/testfiles/Cave/CAVE-HOST/cave_host_expected.json')
[] ('97', 'test/python/testfiles/Cave/CAVE-SSL/cave_ssl_expected.json')
[] ('97', 'test/python/testfiles/Cave/CAVE-SSL/cave_ssl_results.json')
[] ('95', 'sacp_python_common/aggregator.jinja.html')
[] ('95', 'sacp_python_common/aggregator.py')
[] ('95', 'sacp_python_common/binary_scan_func.py')
[] ('95', 'sacp_python_common/csbcicdReport.py')
[] ('95', 'sacp_python_common/csbcicd_func.py')
[] ('95', 'sacp_python_common/norad_scan_builder.py')
[] ('95', 'sacp_python_common/process_cave_ca_validation.py')
[] ('95', 'sacp_python_common/report.py')
[] ('95', 'sacp_python_common/run_ipcentral_automation.py')
[] ('95', 'test/p

[] ('47', 'test/python/test_csbcicdReport.py')
[] ('51', 'sacp_python_common/HubRestApi.py')
[] ('51', 'sacp_python_common/bd_upload_script.sh')
[] ('51', 'sacp_python_common/binaryScan.py')
[] ('51', 'sacp_python_common/binary_scan_func.py')
[] ('51', 'sacp_python_common/blackduckIPSync.py')
[] ('51', 'sacp_python_common/bom_bd.jinja.html')
[] ('51', 'sacp_python_common/cve_bd.jinja.html')
[] ('51', 'sacp_python_common/report.py')
[] ('48', 'sacp_python_common/report.py')
[] ('46', 'sacp_python_common/ipCentralScan.py')
[] ('46', 'sacp_python_common/ipc.jinja.html')
[] ('34', 'sacp_python_common/docker-bench-security/CONTRIBUTING.md')
[] ('34', 'sacp_python_common/docker-bench-security/Dockerfile')
[] ('34', 'sacp_python_common/docker-bench-security/LICENSE.md')
[] ('34', 'sacp_python_common/docker-bench-security/MAINTAINERS')
[] ('34', 'sacp_python_common/docker-bench-security/README.md')
[] ('34', 'sacp_python_common/docker-bench-security/benchmark_log.png')
[] ('34', 'sacp_python_c

[array([2]),
 array([7]),
 array([13]),
 array([17]),
 array([19]),
 array([20]),
 array([40]),
 array([24]),
 array([23]),
 array([55]),
 array([], dtype=int64),
 array([60]),
 array([71]),
 array([59]),
 array([76]),
 array([81]),
 array([87]),
 array([91]),
 array([93]),
 array([94]),
 array([129]),
 array([188]),
 array([171]),
 array([], dtype=int64),
 array([208]),
 array([219]),
 array([207]),
 array([246]),
 array([245]),
 array([281]),
 array([], dtype=int64),
 array([387]),
 array([389]),
 array([], dtype=int64),
 array([390]),
 array([], dtype=int64),
 array([426]),
 array([], dtype=int64),
 array([464]),
 array([500]),
 array([538]),
 array([574]),
 array([], dtype=int64),
 array([609]),
 array([611]),
 array([], dtype=int64),
 array([612]),
 array([], dtype=int64),
 array([648]),
 array([706]),
 array([689]),
 array([764]),
 array([838]),
 array([912]),
 array([948]),
 array([986]),
 array([1022]),
 array([1060]),
 array([1134]),
 array([1133]),
 array([1170]),
 array([120

In [None]:
# Flatten the per-link index arrays into a single 1-D array of matched row indexes.
# np.concatenate takes the whole list in one call, which avoids the repeated
# intermediate copies produced by reducing the list pair by pair.
matchGT = np.concatenate(matchGT)
matchGT

array([    2,     7,    13,    17,    19,    20,    40,    24,    23,
          55,    60,    71,    59,    76,    81,    87,    91,    93,
          94,   129,   188,   171,   208,   219,   207,   246,   245,
         281,   387,   389,   390,   426,   464,   500,   538,   574,
         609,   611,   612,   648,   706,   689,   764,   838,   912,
         948,   986,  1022,  1060,  1134,  1133,  1170,  1208,  1244,
        1282,  1281,  1352,  1430,  1503,  1578,  1577,  1613,  1651,
        1687,  1851,  1874,  1873,  1894,  1909,  1948,  1947,  1983,
        1999,  2042,  2084,  2085,  2095,  2124,  2158,  2159,  2198,
        2243,  2391,  2614,  2613,  2616,  2652,  2658,  2688,  2687,
        2738,  2741,  2744,  2756,  2757,  2761,  2766,  2767,  2832,
        2906,  3066,  3058,  3057,  3059,  3060,  3064,  3071,  3073,
        3094,  3096,  3102,  3206,  3205,  3207,  3242,  3280,  3279,
        3282,  3286,  3293,  3295,  3316,  3362,  3421,  3436,  3428,
        3427,  3460,

In [None]:
word2vec.ground_truth_processing(path_to_ground_truth, True)

2021-01-26 02:09:14,555 : INFO : ground truth from mappings


[('295', 'sacp_python_common/bandit/banditReport.py'),
 ('295', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('295', 'sacp_python_common/gosec/gosec_report.py'),
 ('295', 'sacp_python_common/psb_mapping.py'),
 ('295',
  'sacp_python_common/security_results_push/security_results_push_func.py'),
 ('295', 'sacp_python_common/spotbugs/spotbugs.py'),
 ('295', 'sacp_python_common/third_party/UploadBom.py'),
 ('295', 'sacp_python_common/third_party/binaryScan.py'),
 ('295', 'sacp_python_common/third_party/binary_scan_func.py'),
 ('295',
  'test/python/security_results_push/test_security_results_push_func.py'),
 ('295', 'test/python/third_party/temp/image_list.txt'),
 ('295', 'test/python/third_party/test_BinaryScan.py'),
 ('295', 'test/python/third_party/test_UploadBom.py'),
 ('295', 'test/python/third_party/test_binary_scan_func.py'),
 ('294', 'sacp_python_common/bandit/banditReport.py'),
 ('294', 'sacp_python_common/csbcicd_report/csbcicd_func.py'),
 ('294', 'sacp_python_common/go

In [None]:
# Source id stored at row position 3962. A (row, column) positional lookup is used because
# integer-key indexing on a string-labelled Series is deprecated in pandas 2.x.
df_x[['Source','Target']].iloc[3962, 0]

'190'

In [None]:
# Target path stored at row position 3962. A (row, column) positional lookup is used because
# integer-key indexing on a string-labelled Series is deprecated in pandas 2.x.
df_x[['Source','Target']].iloc[3962, 1]

'sacp-python-common/sacp_python_common/third_party/UploadBom.py'

In [None]:
# Source id stored at row position 3978. A (row, column) positional lookup is used because
# integer-key indexing on a string-labelled Series is deprecated in pandas 2.x.
df_x[['Source','Target']].iloc[3978, 0]

'190'

In [None]:
# Target path stored at row position 3978. A (row, column) positional lookup is used because
# integer-key indexing on a string-labelled Series is deprecated in pandas 2.x.
df_x[['Source','Target']].iloc[3978, 1]

'sacp-python-common/test/python/spotbugs/test_spotbugs.py'

In [None]:
#Formatted for Semeru mode B: one findDistInDF lookup per ground-truth link
matchGT = [
    word2vec.findDistInDF(gt_link, semeru_format=True, from_mappings=False)
    for gt_link in word2vec.ground_truth_processing(path_to_ground_truth, True)
]
matchGT

2021-01-26 02:09:47,714 : INFO : ground truth from mappings
2021-01-26 02:09:47,732 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,748 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,763 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,778 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,793 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,808 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,823 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,838 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,854 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,868 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,883 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,898 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,913 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,929 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,944 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:47,962 :

2021-01-26 02:09:49,771 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,786 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,802 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,817 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,832 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,846 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,861 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,876 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,891 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,907 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,922 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,937 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,955 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,970 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:49,985 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:50,000 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:50,015 

2021-01-26 02:09:51,839 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,853 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,868 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,884 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,899 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,915 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,930 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,945 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,962 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,978 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:51,994 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,009 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,027 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,042 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,058 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,073 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:52,088 

2021-01-26 02:09:53,903 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,918 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,933 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,948 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,966 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,981 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:53,997 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,012 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,027 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,042 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,058 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,073 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,091 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,107 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,122 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,137 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:54,153 

2021-01-26 02:09:55,985 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,000 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,015 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,029 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,045 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,060 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,076 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,093 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,109 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,124 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,140 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,155 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,170 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,186 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,201 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,216 : INFO : findDistInDF: semeru_format
2021-01-26 02:09:56,230 

KeyboardInterrupt: 

In [None]:
word2vec.df_ground_link.head(1)

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.181488,0.458403,0.75325,0.24675,3.022055,1.335963,4.715814,6.895876,6.97673,4.63496,2.260916,0.080854


In [None]:
#Formatted for SACP
# Collect the findDistInDF result for every ground-truth link read from the mappings
matchGT = list(map(
    functools.partial(word2vec.findDistInDF, from_mappings=True),
    word2vec.ground_truth_processing(path_to_ground_truth, from_mappings=True),
))
matchGT

2021-01-26 02:10:23,548 : INFO : ground truth from mappings
2021-01-26 02:10:23,560 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,572 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,583 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,595 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,605 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,617 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,628 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,640 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,652 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,663 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,674 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,686 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,698 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,710 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,721 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:23,733 :

2021-01-26 02:10:25,086 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,098 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,110 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,121 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,133 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,143 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,155 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,166 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,178 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,189 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,201 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,212 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,224 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,235 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,247 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,258 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:25,270 

2021-01-26 02:10:26,475 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,483 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,492 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,502 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,517 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,534 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,544 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,553 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,562 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,572 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,581 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,590 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,600 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,610 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,620 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,629 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:26,638 

2021-01-26 02:10:27,878 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,887 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,896 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,905 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,915 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,924 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,933 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,954 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,964 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,973 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,982 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:27,992 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:28,001 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:28,011 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:28,020 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:28,029 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:28,038 

2021-01-26 02:10:29,394 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,405 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,416 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,427 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,439 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,450 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,462 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,474 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,485 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,497 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,508 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,519 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,531 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,542 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,554 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,565 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:29,576 

2021-01-26 02:10:32,439 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,450 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,461 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,472 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,484 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,496 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,507 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,518 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,529 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,541 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,552 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,564 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,574 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,585 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,597 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,608 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:32,619 

2021-01-26 02:10:33,960 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:33,972 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:33,983 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:33,994 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,007 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,019 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,030 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,041 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,052 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,063 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,074 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,086 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,097 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,108 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,119 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,131 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:34,142 

2021-01-26 02:10:35,482 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,493 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,504 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,515 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,526 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,537 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,548 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,561 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,572 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,583 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,594 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,608 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,619 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,630 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,641 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,652 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:35,663 

2021-01-26 02:10:37,007 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,018 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,029 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,040 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,051 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,062 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,073 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,085 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,096 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,107 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,118 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,129 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,140 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,151 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,163 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,174 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:37,185 

2021-01-26 02:10:38,517 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,529 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,540 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,551 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,562 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,573 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,584 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,596 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,607 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,618 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,629 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,640 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,651 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,662 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,675 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,687 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:38,698 

2021-01-26 02:10:40,036 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,047 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,058 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,069 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,080 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,091 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,102 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,113 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,125 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,136 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,147 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,158 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,169 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,180 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,191 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,202 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:40,214 

2021-01-26 02:10:41,541 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,553 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,563 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,575 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,586 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,598 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,610 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,620 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,630 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,640 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,650 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,661 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,671 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,681 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,692 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,703 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:41,714 

2021-01-26 02:10:43,049 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,060 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,071 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,082 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,093 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,105 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,116 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,127 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,138 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,149 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,161 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,172 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,185 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,196 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,207 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,219 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:43,230 

2021-01-26 02:10:44,562 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,573 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,585 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,596 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,607 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,618 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,629 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,641 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,652 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,663 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,674 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,685 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,697 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,708 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,721 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,732 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:44,743 

2021-01-26 02:10:46,083 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,094 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,105 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,117 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,128 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,139 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,149 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,161 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,172 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,184 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,196 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,207 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,219 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,231 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,242 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,254 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:46,265 

2021-01-26 02:10:47,608 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,619 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,631 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,642 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,654 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,666 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,677 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,689 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,700 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,712 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,723 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,735 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,746 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,756 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,768 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,778 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:47,790 

2021-01-26 02:10:49,136 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,147 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,159 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,170 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,181 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,193 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,204 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,215 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,227 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,238 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,249 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,260 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,272 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,283 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,295 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,306 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:49,317 

2021-01-26 02:10:52,150 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,161 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,171 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,181 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,191 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,201 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,213 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,224 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,235 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,246 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,257 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,269 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,280 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,291 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,302 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,313 : INFO : findDistInDF: from_mappings
2021-01-26 02:10:52,324 

[array([2]),
 array([7]),
 array([13]),
 array([17]),
 array([19]),
 array([20]),
 array([40]),
 array([24]),
 array([23]),
 array([55]),
 array([], dtype=int64),
 array([60]),
 array([71]),
 array([59]),
 array([76]),
 array([81]),
 array([87]),
 array([91]),
 array([93]),
 array([94]),
 array([129]),
 array([188]),
 array([171]),
 array([], dtype=int64),
 array([208]),
 array([219]),
 array([207]),
 array([246]),
 array([245]),
 array([281]),
 array([], dtype=int64),
 array([387]),
 array([389]),
 array([], dtype=int64),
 array([390]),
 array([], dtype=int64),
 array([426]),
 array([], dtype=int64),
 array([464]),
 array([500]),
 array([538]),
 array([574]),
 array([], dtype=int64),
 array([609]),
 array([611]),
 array([], dtype=int64),
 array([612]),
 array([], dtype=int64),
 array([648]),
 array([706]),
 array([689]),
 array([764]),
 array([838]),
 array([912]),
 array([948]),
 array([986]),
 array([1022]),
 array([1060]),
 array([1134]),
 array([1133]),
 array([1170]),
 array([120

In [None]:
# Concatenate the per-link index arrays into one flat array of indexes.
# A single np.concatenate call replaces the pairwise functools.reduce, which re-copied
# the growing array at every step. np.atleast_1d keeps the cell re-runnable: if matchGT
# has already been flattened, each scalar is promoted to a 1-element array instead of
# raising "zero-dimensional arrays cannot be concatenated".
matchGT = np.concatenate([np.atleast_1d(match_ix) for match_ix in matchGT])
matchGT

array([    2,     7,    13,    17,    19,    20,    40,    24,    23,
          55,    60,    71,    59,    76,    81,    87,    91,    93,
          94,   129,   188,   171,   208,   219,   207,   246,   245,
         281,   387,   389,   390,   426,   464,   500,   538,   574,
         609,   611,   612,   648,   706,   689,   764,   838,   912,
         948,   986,  1022,  1060,  1134,  1133,  1170,  1208,  1244,
        1282,  1281,  1352,  1430,  1503,  1578,  1577,  1613,  1651,
        1687,  1851,  1874,  1873,  1894,  1909,  1948,  1947,  1983,
        1999,  2042,  2084,  2085,  2095,  2124,  2158,  2159,  2198,
        2243,  2391,  2614,  2613,  2616,  2652,  2658,  2688,  2687,
        2738,  2741,  2744,  2756,  2757,  2761,  2766,  2767,  2832,
        2906,  3066,  3058,  3057,  3059,  3060,  3064,  3071,  3073,
        3094,  3096,  3102,  3206,  3205,  3207,  3242,  3280,  3279,
        3282,  3286,  3293,  3295,  3316,  3362,  3421,  3436,  3428,
        3427,  3460,

In [None]:
# Series of ones indexed by the matched indexes in matchGT,
# named after the third entry of word2vec.params['names']
new_column = pd.Series(
    data=np.ones(len(matchGT), dtype=int),
    index=matchGT,
    name=word2vec.params['names'][2],
)

In [None]:
# Display the Series built in the previous cell
new_column

In [None]:
# Number of entries in new_column (one per index in matchGT)
new_column.size

In [None]:
# Some of the mappings are not found in the non-link list because the mappings hold all the
# ground truth of the issues; they might include files not taken into account in the non-links part.
# Keep each ground-truth link next to its findDistInDF result so the misses can be inspected.
matchGT_ = list(map(
    lambda link: (link, word2vec.findDistInDF(link, from_mappings=True)),
    word2vec.ground_truth_processing(from_mappings=True),
))

In [None]:
# Display every (ground-truth link, findDistInDF result) pair
matchGT_

In [None]:
# Number of entries currently held in matchGT
len(matchGT)

#### [step 3]Saving Non-GroundTruth Links

In [None]:
# Save the computed links.
# NOTE(review): presumably the saved run is identified by a timestamp, since the
# LoadLinks cell below asks for one -- confirm in SaveLinks.
word2vec.SaveLinks()

In [None]:
#Loading Non-GroundTruth Links (replace the timestamp with the one assigned in the previous step)
# NOTE(review): the timestamp below is hard-coded from an earlier run; update it after re-running SaveLinks.
df_nonglinks = LoadLinks(timestamp=1608688471.437005, params=parameters, logging = logger)
df_nonglinks.head()

#### [step 4]GroundTruthMatching Testing

In [None]:
# Match the links against the ground truth at path_to_ground_truth, read with semeru_format=True,
# then display df_ground_link (presumably refreshed by the call -- verify in MatchWithGroundTruth).
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
word2vec.df_ground_link

#### [step 4.1]GroundTruthMatching Testing For CISCO Mappings <----- Warning SACP

In [None]:
# Match the links against the ground truth taken from the mappings (from_mappings=True; no path is passed),
# then display df_ground_link (presumably refreshed by the call -- verify in MatchWithGroundTruth).
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

2021-01-26 02:12:36,785 : INFO : ground truth from mappings
2021-01-26 02:12:36,799 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,811 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,823 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,834 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,844 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,856 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,867 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,879 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,891 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,901 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,913 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,925 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,936 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,948 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,960 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:36,972 :

2021-01-26 02:12:38,369 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,380 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,392 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,404 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,416 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,427 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,438 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,450 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,465 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,476 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,488 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,500 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,511 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,523 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,535 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,547 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:38,558 

2021-01-26 02:12:39,947 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:39,958 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:39,970 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:39,982 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:39,993 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,005 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,017 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,029 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,041 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,053 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,065 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,077 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,089 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,101 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,112 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,124 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:40,135 

2021-01-26 02:12:41,515 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,526 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,537 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,548 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,560 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,572 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,583 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,595 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,606 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,618 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,630 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,642 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,653 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,665 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,677 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,689 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:41,701 

2021-01-26 02:12:43,093 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,105 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,117 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,128 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,140 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,151 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,164 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,175 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,187 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,199 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,211 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,223 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,234 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,246 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,258 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,270 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:43,283 

2021-01-26 02:12:44,674 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,686 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,697 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,708 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,720 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,732 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,743 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,754 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,766 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,777 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,789 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,801 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,812 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,823 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,835 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,847 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:44,859 

2021-01-26 02:12:46,235 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,246 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,258 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,270 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,282 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,294 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,305 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,316 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,328 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,340 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,352 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,363 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,374 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,385 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,397 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,408 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:46,419 

2021-01-26 02:12:49,347 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,359 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,371 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,382 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,393 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,404 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,416 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,428 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,439 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,450 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,462 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,473 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,487 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,499 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,510 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,521 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:49,533 

2021-01-26 02:12:50,907 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,919 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,930 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,942 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,953 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,964 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,975 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,987 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:50,998 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,009 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,020 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,032 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,044 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,055 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,066 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,077 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:51,088 

2021-01-26 02:12:52,455 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,466 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,478 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,489 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,503 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,517 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,529 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,541 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,552 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,563 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,574 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,586 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,598 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,609 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,622 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,634 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:52,645 

2021-01-26 02:12:54,004 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,015 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,026 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,038 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,049 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,060 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,072 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,083 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,095 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,106 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,118 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,129 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,141 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,152 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,163 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,175 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:54,186 

2021-01-26 02:12:55,557 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,569 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,579 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,591 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,602 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,614 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,626 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,636 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,646 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,657 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,667 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,677 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,688 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,698 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,708 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,720 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:55,731 

2021-01-26 02:12:58,250 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,259 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,268 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,277 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,286 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,296 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,305 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,314 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,323 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,332 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,342 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,351 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,360 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,369 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,378 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,388 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:58,397 

2021-01-26 02:12:59,507 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,517 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,526 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,535 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,544 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,553 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,563 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,571 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,581 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,590 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,600 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,610 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,620 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,630 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,639 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,649 : INFO : findDistInDF: from_mappings
2021-01-26 02:12:59,659 

2021-01-26 02:13:00,959 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:00,971 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:00,982 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:00,994 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,006 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,018 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,031 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,044 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,056 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,068 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,080 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,091 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,103 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,115 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,126 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,138 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:01,149 

2021-01-26 02:13:02,530 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,542 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,553 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,565 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,576 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,587 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,599 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,610 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,622 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,633 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,645 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,656 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,668 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,679 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,691 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,702 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:02,714 

2021-01-26 02:13:04,089 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,101 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,111 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,123 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,134 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,146 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,157 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,169 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,181 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,192 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,204 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,216 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,228 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,239 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,250 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,262 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:04,273 

2021-01-26 02:13:05,631 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,641 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,651 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,661 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,672 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,683 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,694 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,705 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,717 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,728 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,740 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,751 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,762 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,773 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,785 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,796 : INFO : findDistInDF: from_mappings
2021-01-26 02:13:05,808 

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise,Linked?
0,295,sacp-python-common/sacp_python_common/auth_uti...,1.181488,0.458403,0.753250,0.246750,3.022055,1.335963,4.715814,6.895876,6.976730,4.634960,2.260916,0.080854,0.0
1,295,sacp-python-common/sacp_python_common/bandit/b...,1.211196,0.452244,0.770373,0.229627,3.084963,1.348006,4.715814,7.046810,7.105970,4.656654,2.390156,0.059159,0.0
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.212716,0.451933,0.790702,0.209298,2.855389,1.325019,4.715814,6.426571,6.493124,4.649261,1.777310,0.066553,1.0
3,295,sacp-python-common/sacp_python_common/cave/cav...,1.186251,0.457404,0.783486,0.216514,2.725481,1.319220,4.715814,6.098070,6.263189,4.550695,1.547375,0.165119,0.0
4,295,sacp-python-common/sacp_python_common/cave/cav...,1.182583,0.458173,0.776000,0.224000,2.584963,1.315172,4.715814,5.941919,6.184240,4.473493,1.468426,0.242321,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21307,2,sacp-python-common/test/python/third_party/tes...,1.181684,0.458361,0.747891,0.252109,1.370951,1.043856,4.715814,6.513137,6.676923,4.552028,1.961109,0.163786,0.0
21308,2,sacp-python-common/test/python/third_party/tes...,1.166092,0.461661,0.743289,0.256711,2.845351,1.321020,4.715814,6.770302,6.838744,4.647372,2.122930,0.068442,0.0
21309,2,sacp-python-common/test/python/third_party/tes...,1.122611,0.471118,0.730064,0.269936,3.022055,1.335963,4.715814,6.592422,6.676677,4.631559,1.960864,0.084255,0.0
21310,2,sacp-python-common/test/python/third_party/unu...,1.106469,0.474728,0.663793,0.336207,2.845351,1.321020,4.715814,6.446380,6.522552,4.639642,1.806738,0.076172,0.0


In [None]:
# Show only the rows that contain no NaN / +inf / -inf in any column.
# `axis` is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `.any(1)` raises a TypeError there.
df_z = word2vec.df_ground_link
df_z[~df_z.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [None]:
#debug
# Work on a copy so the update in the next cell does not touch word2vec.df_ground_link.
df_y = word2vec.df_ground_link.copy()
df_y

In [None]:
#debug
# Overwrites values of df_y in place with those from new_column, so this cell is not
# idempotent. NOTE(review): new_column is not defined in this part of the notebook —
# confirm it is created by an earlier cell.
df_y.update(new_column)

In [None]:
# Inspect the object that was passed to df_y.update in the previous cell.
new_column

In [None]:
# Rows of the ground-truth link table whose 'Linked?' flag equals 1.
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.Entropy_src,EntropyMetric.Entropy_tgt,EntropyMetric.JI,EntropyMetric.MI,EntropyMetric.Loss,EntropyMetric.Noise,Linked?
2,295,sacp-python-common/sacp_python_common/bandit/b...,1.212716,0.451933,0.790702,0.209298,2.855389,1.325019,4.715814,6.426571,6.493124,4.649261,1.777310,0.066553,1.0
7,295,sacp-python-common/sacp_python_common/csbcicd_...,1.174529,0.459870,0.755430,0.244570,3.391893,1.361094,4.715814,6.856857,6.880450,4.692221,2.164636,0.023593,1.0
13,295,sacp-python-common/sacp_python_common/gosec/go...,1.211065,0.452271,0.775642,0.224358,2.985228,1.333029,4.715814,6.900195,6.953526,4.662483,2.237712,0.053331,1.0
17,295,sacp-python-common/sacp_python_common/psb_mapp...,1.201492,0.454237,0.792145,0.207855,2.807355,1.334355,4.715814,6.562993,6.616780,4.662026,1.900966,0.053787,1.0
19,295,sacp-python-common/sacp_python_common/security...,1.169767,0.460879,0.756440,0.243560,3.642150,1.374004,4.715814,6.923628,6.948965,4.690477,2.233151,0.025337,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18141,43,sacp-python-common/sacp_python_common/fireExce...,1.190064,0.456608,0.819714,0.180286,0.918296,0.918296,3.923856,5.304981,5.676504,3.552333,1.752648,0.371523,1.0
18215,44,sacp-python-common/sacp_python_common/fireExce...,1.255032,0.443453,0.922734,0.077266,0.000000,0.000000,3.456565,5.304981,5.581912,3.179634,2.125347,0.276931,1.0
18289,42,sacp-python-common/sacp_python_common/fireExce...,1.190239,0.456571,0.824722,0.175278,1.000000,1.000000,4.789015,5.304981,5.872773,4.221224,1.083757,0.567791,1.0
18955,33,sacp-python-common/sacp_python_common/fireExce...,1.207428,0.453016,0.840158,0.159842,1.000000,1.000000,2.807355,5.304981,5.429116,2.683220,2.621762,0.124135,1.0


In [None]:
# (rows, columns) of the subset whose 'Linked?' flag equals 1.
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1].shape #Positive Links

In [None]:
#[optional]GroundTruth Direct Processing
# Process the ground-truth file directly and look at a single entry
# (index 141 is presumably just an arbitrary sample — TODO confirm).
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links[141] # A tuple

In [None]:
#Inspecting Source
# First element of the link, cut at its first '.', with '-' appended.
# NOTE: str.find returns -1 when there is no '.', in which case the slice silently
# drops the last character instead of keeping the whole string.
ground_links[141][0][:ground_links[141][0].find('.')] + '-'

In [None]:
#Inspecting Target
# Second element of the link, cut at its first '.'.
# NOTE: str.find returns -1 when there is no '.', in which case the slice silently
# drops the last character instead of keeping the whole string.
ground_links[141][1][:ground_links[141][1].find('.')]

In [None]:
#[step 5]Saving GroundTruth Links
# grtruth = True presumably selects the ground-truth link table (df_ground_link)
# instead of the non-ground-truth one — TODO confirm against SaveLinks.
word2vec.SaveLinks(grtruth = True)

In [None]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
# NOTE(review): this call passes `logging`, whereas the Doc2Vec cells below pass `logger` —
# confirm which object LoadLinks is expected to receive.
df_glinks = LoadLinks(timestamp=1609858614.151381, params=parameters,logging=logging,grtruth = True)
df_glinks.head()

In [None]:
# Loaded ground-truth rows whose 'Linked?' flag equals 0.
df_glinks[df_glinks["Linked?"] == 0]

## 3. Artifacts Similarity with Doc2Vec

Try to reproduce the empirical evaluation from this paper: [link](https://arxiv.org/pdf/1507.07998.pdf). Pay attention to:
- Accuracy vs. dimensionality (we can replace accuracy with the false positive rate or the true positive rate)
- Visualizing paragraph vectors using t-SNE
- Computing cosine distance and similarity. More about similarity measures: [link](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html)

In [None]:
#experiment 0.0.1

In [None]:
# Trained Doc2Vec model that Doc2VecSeqVect loads through params['path_to_trained_model'].
# NOTE(review): joined with a leading '/', unlike the other `path_data + '...'` paths in
# this notebook — confirm whether path_data already ends with a separator.
path_to_trained_model = path_data+'/models/pv/bpe8k/[doc2vec-Py-Java-PVDBOW-500-20E-8k-1594572857.17191].model'

In [None]:
def doc2vec_params():
    """Assemble the settings dictionary for the Doc2Vec experiment.

    The notebook-level names `path_data`, `path_to_trained_model` and
    `path_model_prefix` are read at call time, so they must be defined
    before this function is called.

    Returns:
        dict: a freshly built dictionary on every call.
    """
    # Where the corpus lives and how it has to be read.
    corpus_config = {}
    corpus_config["system_path"] = path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv'
    corpus_config["sep"] = '~'
    corpus_config["names"] = ['ids','conv']
    corpus_config["prep"] = Preprocessing.conv

    # Keys are inserted in the same order as before so iteration order is unchanged.
    settings = {}
    settings["vectorizationType"] = VectorizationType.doc2vec
    settings["linkType"] = LinkType.req2tc
    settings["system"] = 'libest'
    settings["system_path_config"] = corpus_config
    settings["source_type"] = SoftwareArtifacts.REQ.value
    settings["target_type"] = SoftwareArtifacts.TC.value
    settings["path_to_trained_model"] = path_to_trained_model
    settings["saving_path"] = path_data + 'metrics/traceability/experiments0.0.x/'
    settings["names"] = ['Source','Target','Linked?']
    settings["model_prefix"] = path_model_prefix #For BPE Analysis
    return settings

In [None]:
# Replace the factory function with the dictionary it builds (later cells pass
# `doc2vec_params` as `params`). Because the name is rebound, a second run of this cell
# would find a dict here; the guard keeps the cell idempotent instead of raising
# "TypeError: 'dict' object is not callable".
if callable(doc2vec_params):
    doc2vec_params = doc2vec_params()
doc2vec_params

In [None]:
#Export
class Doc2VecSeqVect(BasicSequenceVectorization):
    '''Sequence vectorization backed by a pre-trained gensim Doc2Vec model.

    Typical use: construct the object, call `InferDoc2Vec` to attach an inferred
    vector to every source and target artifact, then compute metrics between the
    two artifacts of a link with `distance`.
    '''

    def __init__(self, params, logging):
        '''Load the trained model and register the supported metrics.

        params: configuration dictionary; this class reads
            params['path_to_trained_model'] and
            params['system_path_config']['names'][1] (name of the text column).
        logging: logger-like object, forwarded to the base class.
        '''
        super().__init__(params, logging)
        self.new_model = gensim.models.Doc2Vec.load( params['path_to_trained_model'] )
        # gensim 3.x API: pre-computes L2-normalized vectors and, with replace=True,
        # discards the originals. NOTE(review): deprecated in gensim 4 — confirm it is
        # still wanted before upgrading.
        self.new_model.init_sims(replace=True)
        # Populated by InferDoc2Vec; `distance` cannot be used before that call.
        self.df_inferred_src = None
        self.df_inferred_trg = None

        # Metric -> callable(source_vector, target_vector); the callables are not
        # defined in this class.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.EUC: self.euclidean_scipy,
            DistanceMetric.MAN: self.manhattan_scipy
        }
        self.logging.info("d2v loaded")

    @staticmethod
    def _inferred_vector(df_inferred, artifact_id):
        '''Return the inferred vector of the first row whose 'ids' value contains `artifact_id`.

        The id is matched as a literal substring (regex=False) so characters such as
        '.', '[' or '+' in file names are not interpreted as a regular expression.
        Raises IndexError when no row matches.
        '''
        is_match = df_inferred['ids'].str.contains(artifact_id, regex=False)
        vectors = df_inferred[is_match]['inf-doc2vec'].values
        if len(vectors) == 0:
            raise IndexError("No inferred artifact id contains " + repr(artifact_id))
        return vectors[0]

    def distance(self, metric_list, link):
        '''Compute every metric of `metric_list` for one link.

        metric_list: iterable of keys of `self.dict_distance_dispatcher`.
        link: pair whose items 0 and 1 identify the source and the target artifact.
        Returns the per-metric results joined with `+` (one flat list when each
        metric returns a list), or an empty list when `metric_list` is empty.
        Raises KeyError for an unregistered metric and IndexError when an artifact
        of the link has no inferred vector.
        '''
        source_vector = self._inferred_vector(self.df_inferred_src, link[0])
        target_vector = self._inferred_vector(self.df_inferred_trg, link[1])

        dist = [ self.dict_distance_dispatcher[metric](source_vector, target_vector) for metric in metric_list]
        self.logging.info("Computed distances or similarities "+ str(link) + str(dist))
        if not dist:
            # reduce() without an initial value raises on an empty sequence.
            return []
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    # Disabled override, kept for reference only:
    #
    # def computeDistanceMetric(self, links, metric_list):
    #     '''It is computed the cosine similarity'''
    #
    #     metric_labels = [ self.dict_labels[metric] for metric in metric_list] #tracking of the labels
    #     distSim = [[link[0], link[1], self.distance( metric_list, link )] for link in links] #Return the link with metrics
    #     distSim = [[elem[0], elem[1]] + elem[2] for elem in distSim] #Return the link with metrics
    #
    #     return distSim, functools.reduce(lambda a,b : a+b, metric_labels)

    def _with_inferred_vectors(self, df_artifacts, text_column, steps):
        '''Return a copy of `df_artifacts` with an 'inf-doc2vec' column of inferred vectors.'''
        df_inferred = df_artifacts.copy()
        df_inferred['inf-doc2vec'] = [
            self.new_model.infer_vector(artifact.split(), steps=steps)
            for artifact in df_inferred[text_column].values
        ]
        return df_inferred

    def InferDoc2Vec(self, steps=200):
        '''Activate Inference on Target and Source Corpus

        steps: forwarded to `infer_vector` for every artifact.
        Stores the results in `self.df_inferred_src` and `self.df_inferred_trg`;
        `self.df_source` and `self.df_target` are left unmodified.
        '''
        text = self.params['system_path_config']['names'][1]
        # Source first, then target, as before: inference order is kept unchanged.
        self.df_inferred_src = self._with_inferred_vectors(self.df_source, text, steps)
        self.df_inferred_trg = self._with_inferred_vectors(self.df_target, text, steps)

        self.logging.info("Infer Doc2Vec on Source and Target Complete")

### Testing Doc2Vec SequenceVectorization

In [None]:
# Instantiating loads the trained model from params['path_to_trained_model'].
# doc2vec_params must already be the dictionary (not the factory function) here.
doc2vec = Doc2VecSeqVect(params = doc2vec_params, logging = logger)

In [None]:
# First two rows of the source artifacts table.
doc2vec.df_source.head(2)

In [None]:
#[step1]Apply Doc2Vec Inference
# Fills doc2vec.df_inferred_src / df_inferred_trg: copies of df_source / df_target
# with an extra 'inf-doc2vec' column holding the inferred vectors.
doc2vec.InferDoc2Vec( steps = 200 )

In [None]:
# First two source rows, now including the 'inf-doc2vec' column.
doc2vec.df_inferred_src.head(2)

In [None]:
# Length of the inferred vector stored at position 35 of the source table.
len(doc2vec.df_inferred_src['inf-doc2vec'].values[35])

In [None]:
# Length of the inferred vector stored at position 36 of the source table.
len(doc2vec.df_inferred_src['inf-doc2vec'].values[36])

In [None]:
#test_inferDoc2Vec_trg = inferDoc2Vec(df_target)
#test_inferDoc2Vec_trg.head()
# First two target rows, including the 'inf-doc2vec' column.
doc2vec.df_inferred_trg.head(2)

In [None]:
#Test correlation: both arguments are the same target vector, so the coefficient should be 1.
pearsonr(doc2vec.df_inferred_trg['inf-doc2vec'][0], doc2vec.df_inferred_trg['inf-doc2vec'][0])

In [None]:
# Number of rows in the source table's 'inf-doc2vec' column.
len(doc2vec.df_inferred_src['inf-doc2vec'])

In [None]:
# Pearson correlation between one target vector (0) and one source vector (35).
pearsonr(doc2vec.df_inferred_trg['inf-doc2vec'][0], doc2vec.df_inferred_src['inf-doc2vec'][35])

In [None]:
#[step 2]NonGroundTruth Computation
# All four metrics are keys of Doc2VecSeqVect.dict_distance_dispatcher.
# NOTE(review): samples = 50 is presumably ignored because sampling=False — confirm
# against ComputeDistanceArtifacts.
metric_l = [DistanceMetric.EUC,DistanceMetric.COS,DistanceMetric.MAN, SimilarityMetric.Pearson]
doc2vec.ComputeDistanceArtifacts( sampling=False, samples = 50, metric_list = metric_l )
doc2vec.df_nonground_link.head()

In [None]:
#[step 3]Saving Non-GroundTruth Links
# Called without grtruth, unlike the ground-truth save in step 5.
doc2vec.SaveLinks()

In [None]:
#Loading Non-GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_nonglinks_doc2vec = LoadLinks(timestamp=1608688610.900933, params=doc2vec_params, logging = logger)
df_nonglinks_doc2vec.head()

In [None]:
#[step 4]GroundTruthMatching Testing
# The last line displays the whole df_ground_link table; append .head() to keep the
# cell output short.
doc2vec.MatchWithGroundTruth(path_to_ground_truth)
doc2vec.df_ground_link

In [None]:
#[step 5]Saving GroundTruth Links
# grtruth = True presumably selects the ground-truth link table (df_ground_link)
# instead of the non-ground-truth one — TODO confirm against SaveLinks.
doc2vec.SaveLinks(grtruth = True)

In [None]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_glinks_doc2vec = LoadLinks(timestamp=1608688652.964024, params=doc2vec_params, logging = logger, grtruth = True)
df_glinks_doc2vec.head()