In [None]:
# default_exp mining.ir

# Information Retrieval and Traceability Interfaces
> Implementing Common Information Retrieval Interfaces
> Author: @danaderp December 2020

We test different similarity and distance measures, following these two posts: [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

In [None]:
# ! pip install -e . <----- Install in the console

In [None]:
#export
import numpy as np
import gensim
import pandas as pd
from itertools import product 
from random import sample 
import functools 
import os

In [None]:
#export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora
from datetime import datetime
from enum import Enum, unique, auto
from ds4se.mgmnt.prep.conv import *

In [None]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
#export
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#Export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

## Enums

In [None]:
#export
#@unique
class VectorizationType(Enum):
    '''Vectorization types selectable through `params["vectorizationType"]`.'''
    # Values are the same 1-based integers that `auto()` would assign.
    word2vec = 1
    doc2vec = 2
    vsm2vec = 3

In [None]:
VectorizationType.word2vec

<VectorizationType.word2vec: 1>

In [None]:
#export
#@unique
class DistanceMetric(Enum):
    '''Identifiers of the distance measures that can be requested for a link.'''
    # Values are the same 1-based integers that `auto()` would assign.
    WMD = 1
    COS = 2
    SCM = 3
    EUC = 4
    MAN = 5

In [None]:
#export
#@unique
class SimilarityMetric(Enum):
    '''Identifiers of the similarity scores reported next to the distances.'''
    # Values are the same 1-based integers that `auto()` would assign.
    WMD_sim = 1
    COS_sim = 2
    SCM_sim = 3
    EUC_sim = 4
    MAN_sim = 5
    Pearson = 6

In [None]:
#export
class EntropyMetric(Enum):
    '''Identifiers of the information-theoretic measures computed for a link.'''
    # Values are the same 1-based integers that `auto()` would assign.
    MSI_I = 1        # minimum shared information, entropy
    MSI_X = 2        # minimum shared information, extropy
    MI = 3           # mutual information
    JI = 4           # joint information
    Loss = 5         # conditioned entropy given the output, I(x|y)
    Noise = 6        # conditioned entropy given the input, I(y|x)
    Entropy_src = 7  # self-information of the source artifacts
    Entropy_tgt = 8  # self-information of the target artifacts

In [None]:
#export
class SoftwareArtifacts(Enum):
    '''Artifact-type tags.

    The `.value` strings are what the experiment parameter dictionaries store as
    `source_type` / `target_type`; those are compared against the `type` column
    of the corpus file to select source and target artifacts.
    '''
    # NOTE(review): the expansions below are inferred from the abbreviations
    # (requirement, test case, source code, python, pull request, use case) -- confirm.
    REQ = 'req'
    TC = 'tc'
    SRC = 'src'
    PY = 'py'
    PR = 'pr'
    UC = 'uc'

In [None]:
#export
#@unique
class Preprocessing(Enum):
    '''Corpus preprocessing flavour stored under `system_path_config["prep"]`.'''
    # Values are the same 1-based integers that `auto()` would assign.
    conv = 1
    bpe = 2

In [None]:
#export
#@unique
class LinkType(Enum):
    '''Kind of traceability link (source artifact type to target artifact type).'''
    # Values are the same 1-based integers that `auto()` would assign.
    req2tc = 1
    req2src = 2
    issue2src = 3
    pr2src = 4
    uc2src = 5
    uc2tc = 6

In [None]:
#tst
LinkType.req2tc

<LinkType.req2tc: 1>

In [None]:
#tst
Preprocessing.bpe

<Preprocessing.bpe: 2>

## 1. Setting-Up Testing Environment

In [None]:
#hide
path_data = '../dvc-ds4se/' #dataset path

In [None]:
#hide
#experiment 0.0.0
#check it out in https://docs.google.com/spreadsheets/d/1UggaKFK8Qr5YltG_X9dN9BUlgH-GNiAbfPkcqSxjyoo/edit?usp=sharing
path_to_trained_model = path_data+'models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model'
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'

In [None]:
#hide
#experiment 0.0.0
#Experiment 1 with Libest Conv preprocessing
def libest_params():
    '''Parameter dictionary for experiment 0.0.0 (LibEST, requirement -> test case).

    The model and BPE-prefix paths are built inside the function instead of being
    read from the notebook-level `path_to_trained_model` / `path_model_prefix`
    variables. Those names are reassigned by later experiment cells, so looking
    them up at call time returned whichever experiment was configured last rather
    than the LibEST settings.

    Returns:
        dict: configuration consumed by the vectorization classes.
    '''
    # Same values as the experiment 0.0.0 configuration cell.
    trained_model = path_data + 'models/wv/bpe8k/[word2vec-Java-Py-SK-500-20E-8k-1594090297.869643].model'
    model_prefix = path_data + 'models/bpe/sentencepiece/wiki_py_java_bpe_8k'
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.req2tc,
        "system": 'libest',
        "path_to_trained_model": trained_model,
        "source_type": SoftwareArtifacts.REQ.value,
        "target_type": SoftwareArtifacts.TC.value,
        "system_path_config": {
            "system_path": path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
            "sep": '~',
            "names": ['ids','conv'],
            "prep": Preprocessing.conv
        },
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
        "model_prefix": model_prefix, #For BPE Analysis
        "path_mappings": path_data + 'se-benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt',
    }

In [None]:
#Experiment 0.0.6
path_to_trained_model = path_data+'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
def etour_params():
    '''Parameter dictionary for experiment 0.0.6 (eTour, use case -> source code).

    The trained-model path is built inside the function: the notebook-level
    `path_to_trained_model` variable is reassigned by later experiment cells, so
    reading it at call time could return another experiment's model.

    Returns:
        dict: configuration consumed by the vectorization classes.
    '''
    # Same value as the assignment that precedes this function in its cell.
    trained_model = path_data + 'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.uc2src,
        "system": 'etour',
        "path_to_trained_model": trained_model,
        "source_type": SoftwareArtifacts.UC.value,
        "target_type": SoftwareArtifacts.SRC.value,
        "system_path_config": {
            "system_path": path_data + 'se-benchmarking/traceability/testbeds/processed/[etour-all-corpus-1609209368.279199].csv',
            "sep": '~',
            "names": ['ids','conv'],
            "prep": Preprocessing.conv
        },
        "path_mappings": path_data + "se-benchmarking/traceability/testbeds/groundtruth/italian/[etour-ground-uc-to-src].txt",
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
    }

In [None]:
#Experiment 0.0.7
path_to_trained_model = path_data+'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
def itrust_params():
    '''Parameter dictionary for experiment 0.0.7 (iTrust, use case -> source code).

    The trained-model path is built inside the function: the notebook-level
    `path_to_trained_model` variable is reassigned by later experiment cells, so
    reading it at call time could return another experiment's model.

    Returns:
        dict: configuration consumed by the vectorization classes.
    '''
    # Same value as the assignment that precedes this function in its cell.
    trained_model = path_data + 'models/wv/conv/[word2vec-Py-Java-SK-500-20E-1592607739.629433].model'
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.uc2src,
        "system": 'itrust',
        "path_to_trained_model": trained_model,
        "source_type": SoftwareArtifacts.UC.value,
        "target_type": SoftwareArtifacts.SRC.value,
        "system_path_config": {
            "system_path": path_data + 'se-benchmarking/traceability/testbeds/processed/[itrust-all-corpus-1609210989.304283].csv',
            "sep": '~',
            "names": ['ids','conv'],
            "prep": Preprocessing.conv
        },
        "path_mappings": path_data + "se-benchmarking/traceability/testbeds/groundtruth/english/[itrust-ground-uc-to-src].txt",
        "saving_path": path_data + 'metrics/traceability/experiments0.0.x/',
        "names": ['Source','Target','Linked?'],
    }

In [None]:
#Experiments 1.1.2 <<-- word2vec
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
path_to_trained_model = path_data+'/models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model'
def sacp_params():
    '''Parameter dictionary for experiments 1.1.2 (sacp-python-common, word2vec on BPE-32k tokens).

    The model and BPE-prefix paths are built inside the function so the result
    does not depend on which cell last assigned the notebook-level
    `path_to_trained_model` / `path_model_prefix` variables. `path_data` already
    ends with a slash, so the model path is joined without a leading '/' to
    avoid a doubled separator.

    Returns:
        dict: configuration consumed by the vectorization classes.
    '''
    model_prefix = path_data + 'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
    trained_model = path_data + 'models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model'
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": trained_model,
        "source_type": SoftwareArtifacts.PR.value,
        "target_type": SoftwareArtifacts.PY.value,
        "system_path_config": {
            # NOTE(review): absolute, machine-specific path (unlike the other
            # experiments, which are relative to path_data) -- confirm it is intended.
            "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
            "sep": '~',
            "names": ['ids','bpe32k'],
            "prep": Preprocessing.bpe
        },
        "path_mappings": "/tf/data/cisco/sacp_data/sacp-pr-mappings.csv",
        "saving_path": path_data + 'metrics/traceability/experiments1.1.x/',
        "names": ['Source','Target','Linked?'],
        "model_prefix": model_prefix
    }

In [None]:
#hide
parameters = sacp_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.issue2src: 3>,
 'system': 'sacp-python-common',
 'path_to_trained_model': '../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model',
 'source_type': 'pr',
 'target_type': 'py',
 'system_path_config': {'system_path': '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
  'sep': '~',
  'names': ['ids', 'bpe32k'],
  'prep': <Preprocessing.bpe: 2>},
 'path_mappings': '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
 'saving_path': '../dvc-ds4se/metrics/traceability/experiments1.1.x/',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_32k'}

In [None]:
parameters['source_type']

'pr'

In [None]:
#tst
parameters['system_path_config']['system_path']

'/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv'

In [None]:
#tst
parameters['system_path_config']['names'][1]

'bpe32k'

In [None]:
parameters['system_path_config']['sep'] #tst

'~'

In [None]:
#hide
df_all_system = pd.read_csv(
            parameters['system_path_config']['system_path'], 
            #names = params['system_path_config']['names'], #include the names into the files!!!
            header = 0, 
            index_col = 0, 
            sep = parameters['system_path_config']['sep'] 
        )

In [None]:
df_all_system.head(1)

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '...","['▁production', '▁merge', '▁*', '▁feed', '▁rel...","['▁production', '▁merge', '▁*', '▁feed', '▁rel..."


In [None]:
#hide
tag = parameters['system_path_config']['names'][1]
[doc.split() for doc in df_all_system[df_all_system[tag].notnull()][tag].values]

[["['▁production',",
  "'▁merge',",
  "'▁*',",
  "'▁feed',",
  "'▁release',",
  "'▁name',",
  "'▁through',",
  "'▁to',",
  "'▁up',",
  "'load',",
  "'b',",
  "'om',",
  "'▁(#',",
  "'29',",
  "'3',",
  "'▁)',",
  "'▁*',",
  "'▁allow',",
  "'▁append',",
  "'▁images',",
  "'▁(#',",
  "'28',",
  "'7',",
  "'▁)',",
  "'▁*',",
  "'▁dare',",
  "'▁test',",
  "'▁fields',",
  "'▁(#',",
  "'29',",
  "'4)']"],
 ["['▁add',",
  "'▁test',",
  "'▁fields',",
  "'▁for',",
  "'▁dare',",
  "'▁push',",
  "'▁*',",
  "'▁added',",
  "'▁test',",
  "'▁data',",
  "'▁to',",
  "'▁the',",
  "'▁j',",
  "'son',",
  "'▁being',",
  "'▁sent',",
  "'▁to',",
  "'▁dare',",
  "'▁when',",
  "'▁running',",
  "'▁static',",
  "'▁code',",
  "'▁analysis',",
  "'.',",
  "'▁example',",
  "'▁ps',",
  "'b',",
  "'▁data',",
  "'▁below',",
  "'.',",
  "'▁closes',",
  "'▁sac',",
  "'p',",
  "'/',",
  "'cs',",
  "'b',",
  "'-',",
  "'c',",
  "'ic',",
  "'dp',",
  "'ip',",
  "'eline',",
  "'ed',",
  "'ition',",
  "'#',",
  "'38',",
  "'1

In [None]:
len(df_all_system[tag].values) #tst

362

In [None]:
#tst
len(df_all_system[df_all_system[tag].notnull()]) #some files are _init_ thefore are empty

362

In [None]:
#tst
df_all_system[df_all_system[tag].notnull()][tag].values

array(["['▁production', '▁merge', '▁*', '▁feed', '▁release', '▁name', '▁through', '▁to', '▁up', 'load', 'b', 'om', '▁(#', '29', '3', '▁)', '▁*', '▁allow', '▁append', '▁images', '▁(#', '28', '7', '▁)', '▁*', '▁dare', '▁test', '▁fields', '▁(#', '29', '4)']",
       '[\'▁add\', \'▁test\', \'▁fields\', \'▁for\', \'▁dare\', \'▁push\', \'▁*\', \'▁added\', \'▁test\', \'▁data\', \'▁to\', \'▁the\', \'▁j\', \'son\', \'▁being\', \'▁sent\', \'▁to\', \'▁dare\', \'▁when\', \'▁running\', \'▁static\', \'▁code\', \'▁analysis\', \'.\', \'▁example\', \'▁ps\', \'b\', \'▁data\', \'▁below\', \'.\', \'▁closes\', \'▁sac\', \'p\', \'/\', \'cs\', \'b\', \'-\', \'c\', \'ic\', \'dp\', \'ip\', \'eline\', \'ed\', \'ition\', \'#\', \'38\', \'1\', \'▁*\', \'▁added\', \'▁2\', \'▁additional\', \'▁sc\', \'f\', \'▁m\', \'app\', \'ings\', \'▁*\', \'▁[\', \'test\', \'▁builds\', \']\', \'(\', \'https\', \'://\', \'eng\', \'ci\', \'-\', \'jen\', \'kins\', \'-\', \'rt\', \'p\', \'.\', \'c\', \'isco\', \'.\', \'com\', \'/\', \

In [None]:
#tst
df_all_system.loc[df_all_system['type'] == parameters['source_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,bpe32k
0,295,"['▁production', '▁merge', '▁*', '▁feed', '▁rel..."
1,294,"['▁add', '▁test', '▁fields', '▁for', '▁dare', ..."
2,293,"['▁allow', '▁passing', '▁a', '▁release', '▁to'..."
3,287,"['▁allow', '▁append', '▁images', '▁#3', '63', ..."
4,274,"['▁move', '▁dock', 'er', '/', 'black', 'd', 'u..."
...,...,...
283,7,"['▁update', '▁bd', 'sc', 'an']"
284,4,"['▁syn', 'ch']"
285,5,"['▁syn', 'c']"
286,1,"['▁temporarily', '▁dis', 'able', '▁green', '▁t..."


In [None]:
df_all_system.loc[df_all_system['type'] == parameters['target_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,bpe32k
1,sacp-python-common/sacp_python_common/auth_uti...,"['▁""""""', '\r\n', 'created', '▁on', '▁aug', '▁1..."
3,sacp-python-common/sacp_python_common/bandit/b...,"['▁#', '!', '/', 'us', 'r', '/', 'bin', '/', '..."
4,sacp-python-common/sacp_python_common/bandit/b...,"['▁import', '▁j', 'son', '\r\n\r\n', 'from', '..."
6,sacp-python-common/sacp_python_common/cave/cav...,"['▁#', '!', '/', 'us', 'r', '/', 'bin', '/', '..."
7,sacp-python-common/sacp_python_common/cave/cav...,"['▁#', '!', '/', 'us', 'r', '/', 'bin', '/', '..."
...,...,...
92,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁os', '\r\n', 'im', 'port', '▁un'..."
93,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁os', '\r\n', 'im', 'port', '▁un'..."
94,sacp-python-common/test/python/third_party/tes...,"['▁import', '▁un', 'itt', 'est', '\r\n', 'from..."
95,sacp-python-common/test/python/third_party/unu...,"['▁#', '▁import', '▁j', 'son', '\r\n', '#', '▁..."


## 1. Defining BasicSequenceVectorization

In [None]:
#tst
print(list(VectorizationType), list(DistanceMetric), list(SimilarityMetric), list(LinkType))

[<VectorizationType.word2vec: 1>, <VectorizationType.doc2vec: 2>, <VectorizationType.vsm2vec: 3>] [<DistanceMetric.WMD: 1>, <DistanceMetric.COS: 2>, <DistanceMetric.SCM: 3>, <DistanceMetric.EUC: 4>, <DistanceMetric.MAN: 5>] [<SimilarityMetric.WMD_sim: 1>, <SimilarityMetric.COS_sim: 2>, <SimilarityMetric.SCM_sim: 3>, <SimilarityMetric.EUC_sim: 4>, <SimilarityMetric.MAN_sim: 5>, <SimilarityMetric.Pearson: 6>] [<LinkType.req2tc: 1>, <LinkType.req2src: 2>, <LinkType.issue2src: 3>, <LinkType.pr2src: 4>, <LinkType.uc2src: 5>, <LinkType.uc2tc: 6>]


In [None]:
#export
class BasicSequenceVectorization():
    '''Base ("vanilla") sequence vectorization; concrete vectorizers inherit from it.

    The class loads a traceability corpus, splits it into source and target
    artifacts, builds the corpus vocabulary, enumerates candidate links,
    computes distance/similarity values for them and labels the links against
    a ground truth.

    Subclasses are expected to provide `distance(metric_list, link)` returning a
    flat list of values; `computeDistanceMetric` calls it but this class does
    not define it.
    '''
    def __init__(self, params, logging):
        '''Load the corpus described by `params` and prepare documents, dictionary and vocab.

        @params  experiment dictionary (see the `*_params()` helpers); reads
                 `system_path_config` (`system_path`, `sep`, `names`, `prep`),
                 `source_type` and `target_type`
        @logging logger-like object; `info` and `warning` are called on it
        '''
        self.params = params
        self.logging = logging
        self.df_nonground_link = None
        self.df_ground_link = None
        bpe = Preprocessing.bpe == self.params['system_path_config']['prep']
        self.prep = ConventionalPreprocessing(self.params, bpe = bpe)

        # The corpus file carries its own header row, so column names come from the file.
        self.df_all_system = pd.read_csv(
            self.params['system_path_config']['system_path'],
            header = 0,
            index_col = 0,
            sep = self.params['system_path_config']['sep']
        )

        names = self.params['system_path_config']['names']
        tag = names[1]
        is_source = self.df_all_system['type'] == self.params['source_type']
        is_target = self.df_all_system['type'] == self.params['target_type']
        # .copy() makes the frames independent of df_all_system, so the fillna
        # assignments below do not write into a slice view.
        self.df_source = self.df_all_system.loc[is_source, names].copy()
        self.df_target = self.df_all_system.loc[is_target, names].copy()

        #NA verification
        self.df_source[tag] = self.df_source[tag].fillna("")
        self.df_target[tag] = self.df_target[tag].fillna("")

        ## self.document and self.dictionary is the vocabulary of the traceability corpus
        ## Do not confuse it with the dictionary of the general vectorization model
        prep_kind = self.params['system_path_config']['prep']
        if prep_kind == Preprocessing.conv: #if conventional preprocessing
            column = self.df_all_system[tag]
            self.documents = [doc.split() for doc in column[column.notnull()].values] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.vocab = dict.fromkeys( self.dictionary.token2id.keys(),0 )
            self.logging.info("conventional preprocessing documents, dictionary, and vocab for the test corpus")

        elif prep_kind == Preprocessing.bpe:
            self.documents = self._bpe_documents(tag) #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.computing_bpe_vocab(tag=tag)
            self.logging.info("bpe preprocessing documents, dictionary, and vocab for the test corpus")

        # Maps the metric a caller requests to the column labels it produces.
        #This can be extended for future metrics <---------------------
        self.dict_labels = {
            DistanceMetric.COS:[DistanceMetric.COS, SimilarityMetric.COS_sim],
            SimilarityMetric.Pearson:[SimilarityMetric.Pearson],
            DistanceMetric.EUC:[DistanceMetric.EUC, SimilarityMetric.EUC_sim],
            DistanceMetric.WMD:[DistanceMetric.WMD, SimilarityMetric.WMD_sim],
            DistanceMetric.SCM:[DistanceMetric.SCM, SimilarityMetric.SCM_sim],
            DistanceMetric.MAN:[DistanceMetric.MAN, SimilarityMetric.MAN_sim],
            EntropyMetric.MSI_I:[EntropyMetric.MSI_I, EntropyMetric.MSI_X],
            EntropyMetric.MI:[EntropyMetric.Entropy_src, EntropyMetric.Entropy_tgt,
                              EntropyMetric.JI, EntropyMetric.MI,
                              EntropyMetric.Loss, EntropyMetric.Noise
                             ]
        }

    def _bpe_documents(self, tag):
        '''Parse the non-null cells of column `tag` into Python objects (token lists).'''
        column = self.df_all_system[tag]
        # SECURITY NOTE(review): eval() executes whatever expression the cell
        # contains. It is kept because the corpus files are produced by this
        # project; consider ast.literal_eval if the files can come from elsewhere.
        # Null cells are skipped (as the conventional branch does) because
        # eval() cannot take NaN.
        return [eval(doc) for doc in column[column.notnull()].values]

    def computing_bpe_vocab(self,tag):
        '''Build `self.vocab`: every sentencepiece piece plus corpus tokens the BPE model lacks, all mapped to 0.'''
        ####INFO science params
        # set().union(*...) also works when the corpus has no documents.
        abstracted_vocab = set().union(*( set(doc) for doc in self._bpe_documents(tag) ))
        sp_bpe = self.prep.sp_bpe
        self.vocab = {sp_bpe.id_to_piece(id): 0 for id in range(sp_bpe.get_piece_size())}
        ignored_vocab = abstracted_vocab - set(self.vocab.keys()) #Ignored vocab by BPE
        self.logging.info('Ignored vocab by BPE' + str(ignored_vocab) )
        self.vocab.update( dict.fromkeys(ignored_vocab, 0) ) #Updating

    def ground_truth_processing(self, path_to_ground_truth = '', from_mappings = False):
        '''Optional step for corpora that have a ground truth: build the list of (source, target) links.

        @path_to_ground_truth text file where each line is a source followed by
                              its whitespace-separated targets (used when
                              from_mappings is False)
        @from_mappings        when True, read `params['path_mappings']` as a
                              comma-separated file with `id_pr` and `doc_id` columns
        Returns a list of tuples. For the text format, blank lines are skipped
        and repeated links are reported once, in first-seen order.
        '''
        if from_mappings:
            df_mapping = pd.read_csv(self.params['path_mappings'], header = 0, sep = ',')
            return list(zip(df_mapping['id_pr'].astype(str), df_mapping['doc_id']))

        # Context manager so the file handle is always released.
        with open(path_to_ground_truth,'r') as ground_truth:
            rows = [line.split() for line in ground_truth]

        #Organizing The Ground Truth under the given format
        ground_links = [(row[0], elem) for row in rows if row for elem in row[1:]]

        #To Verify Redundancies in the file
        unique_links = list(dict.fromkeys(ground_links)) # order-preserving de-duplication
        if len(unique_links) != len(ground_links):
            self.logging.warning("-----WARNING!-------- Redundacy in the ground truth file")

        return unique_links

    def samplingLinks(self, sampling = False, samples = 10, basename = False):
        '''Enumerate candidate links as the cartesian product of source and target ids.

        @sampling when True, return `samples` links drawn with `random.sample`
                  (unseeded here; raises ValueError if `samples` exceeds the
                  number of candidate links)
        @basename when True, reduce every id to its final path component
        '''
        source = self.df_source['ids'].values
        target = self.df_target['ids'].values
        if basename:
            source = [os.path.basename(elem) for elem in source]
            target = [os.path.basename(elem) for elem in target]

        links = list( product( source , target ) )
        if sampling:
            links = sample( links, samples )

        return links

    def cos_scipy(self, vector_v, vector_w):
        '''Return [cosine distance, 1 - cosine distance].'''
        cos =  distance.cosine( vector_v, vector_w )
        return [cos, 1.-cos]

    def euclidean_scipy(self, vector_v, vector_w):
        '''Return [euclidean distance, 1/(1 + distance)].'''
        dst = distance.euclidean(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def manhattan_scipy(self, vector_v, vector_w):
        '''Return [manhattan (cityblock) distance, 1/(1 + distance)].'''
        dst = distance.cityblock(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def pearson_abs_scipy(self, vector_v, vector_w):
        '''Return [|Pearson correlation|].

        We are not sure that pearson correlation works well on doc2vec inference vectors.
        '''
        corr, _ = pearsonr(vector_v, vector_w)
        return [abs(corr)] #Absolute value of the correlation


    def computeDistanceMetric(self, links, metric_list):
        '''Compute the requested metrics for every link.

        Returns `(rows, labels)`: each row is `[source, target, *values]` with the
        values produced by `self.distance(metric_list, link)` (supplied by a
        subclass), and `labels` is the flattened list of `dict_labels` entries for
        `metric_list` (an empty list when `metric_list` is empty).
        '''
        metric_labels = [ label for metric in metric_list for label in self.dict_labels[metric] ] #tracking of the labels
        distSim = [ [link[0], link[1]] + self.distance( metric_list, link ) for link in links ] #Return the link with metrics

        return distSim, metric_labels

    def ComputeDistanceArtifacts(self, metric_list, sampling = False , samples = 10, basename = False):
        '''Activates Distance and Similarity Computations; the result is stored in `self.df_nonground_link`
        @metric_list metrics to compute (keys of `self.dict_labels`).
                     NOTE(review): an empty list yields no metric columns here; whether
                     it means "all metrics" depends on the subclass `distance` -- confirm
        @sampling is False by the default
        @samples is the number of samples (or links) to be generated
        @basename is forwarded to `samplingLinks`'''
        links_ = self.samplingLinks( sampling, samples, basename )

        docs, metric_labels = self.computeDistanceMetric( metric_list=metric_list, links=links_) #checkpoints
        self.df_nonground_link = pd.DataFrame(docs, columns =[self.params['names'][0], self.params['names'][1]]+ metric_labels) #Transforming into a Pandas
        self.logging.info("Non-groundtruth links computed")


    def SaveLinks(self, grtruth=False, sep=' ', mode='a'):
        '''Write the ground-truth (grtruth=True) or non-ground-truth links to a timestamped CSV under `params['saving_path']`.'''
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self.params['saving_path'] + '['+ self.params['system'] + '-' + str(self.params['vectorizationType']) + '-' + str(self.params['linkType']) + '-' + str(grtruth) + '-{}].csv'.format(timestamp)

        df_links = self.df_ground_link if grtruth else self.df_nonground_link
        df_links.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)

        self.logging.info('Saving in...' + path_to_link)

    def findDistInDF(self, g_tuple, from_mappings=False, semeru_format=False):
        '''Return the index values of the rows of `df_ground_link` matched by the ground-truth pair `g_tuple`.

        .eq is used for Source (mappings format) since it must match the exact code to avoid number substrings;
        for the target, the substring might work fine.
        '/' is prepended to the tuple elements to avoid matching more than one substring.
        All substring matches are literal (not regular expressions).
        '''
        links = self.df_ground_link
        # Column names come from params in every branch, like MatchWithGroundTruth.
        source_col = links[self.params['names'][0]]
        target_col = links[self.params['names'][1]]

        if from_mappings: #SACP Format
            mask = ( source_col.eq(g_tuple[0]) &
                     target_col.str.contains('/' + g_tuple[1], regex=False) )
            self.logging.info('findDistInDF: from_mappings')
        elif semeru_format: #LibEST Format
            mask = ( source_col.str.contains('/' + g_tuple[0], regex=False) &
                     target_col.str.contains('/' + g_tuple[1], regex=False) )
            self.logging.info('findDistInDF: semeru_format')
        else: #By Default use Semeru Format
            # Text before the first '.'; split() keeps the whole name when there
            # is no '.', where slicing with find() == -1 would cut the last character.
            source_stem = g_tuple[0].split('.', 1)[0]
            target_stem = g_tuple[1].split('.', 1)[0]
            mask = ( source_col.str.contains(source_stem + '-', regex=False) &
                     target_col.str.contains(target_stem, regex=False) )
            self.logging.info('findDistInDF: default')
        return links.loc[mask].index.values


    def MatchWithGroundTruth(self, path_to_ground_truth='', from_mappings=False, semeru_format=False ):
        '''Copy `df_nonground_link` into `df_ground_link` and add the label column
        (`params['names'][2]`): 1 for links found in the ground truth, 0 otherwise.'''
        label = self.params['names'][2]
        self.df_ground_link = self.df_nonground_link.copy()
        self.df_ground_link[label] = 0

        ground_links = self.ground_truth_processing(path_to_ground_truth,from_mappings)
        matchGT = [ self.findDistInDF( g , from_mappings=from_mappings, semeru_format=semeru_format ) for g in ground_links ]

        if matchGT: # nothing to concatenate when the ground truth is empty
            # Several ground-truth pairs can hit the same row; the update below
            # needs each index label only once.
            matched_index = np.unique( np.concatenate(matchGT) )
            if len(matched_index):
                new_column = pd.Series(np.full([len(matched_index)], 1 ), name=label, index = matched_index)
                self.df_ground_link.update(new_column)

        self.logging.info("Groundtruth links computed")

### Testing BasicSequenceVectorization

In [None]:
general2vec =  BasicSequenceVectorization(params = parameters, logging =logging)

2021-01-16 14:50:41,599 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-16 14:50:41,673 : INFO : built Dictionary(3010 unique tokens: ['28', '29', '3', '4)', '7']...) from 362 documents (total 171602 corpus positions)
2021-01-16 14:50:41,897 : INFO : Ignored vocab by BPE{'\r\n\r\n', '`', '\r\n\r\n@', '^', '\r\n', '\t', 'γ', '```', '\r\n\r\n\r\n', '\\', '@'}
2021-01-16 14:50:41,898 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus


In [None]:
general2vec.params['system_path_config']['names'][1]

'bpe32k'

In [None]:
abstracted_vocab = [ set( eval(doc) ) for doc in general2vec.df_all_system[ 'bpe32k' ].values]

In [None]:
abstracted_vocab[1]

{'"',
 '",',
 '"/',
 '":',
 '#',
 '(',
 ')',
 ',',
 '-',
 '-0',
 '-07',
 '-2',
 '.',
 '."',
 '/',
 '1',
 '13',
 '2',
 '20',
 '24',
 '38',
 '44',
 '5,',
 '59',
 ':',
 '://',
 ':00',
 '[',
 ']',
 '],',
 '_',
 '```',
 'acts',
 'aek',
 'afe',
 'age',
 'al',
 'als',
 'an',
 'anned',
 'app',
 'art',
 'ash',
 'ass',
 'at',
 'ated',
 'atus',
 'avers',
 'b',
 'bs',
 'bug',
 'c',
 'cher',
 'ci',
 'cl',
 'co',
 'com',
 'created',
 'cs',
 'd',
 'desc',
 'dev',
 'dig',
 'dis',
 'dp',
 'ec',
 'ed',
 'eline',
 'eng',
 'equ',
 'est',
 'ested',
 'ets',
 'f',
 'file',
 'h',
 'hell',
 'https',
 'ial',
 'ic',
 'id',
 'if',
 'in',
 'ing',
 'ings',
 'ins',
 'ip',
 'irect',
 'isco',
 'ition',
 'j',
 'jar',
 'jen',
 'jo',
 'kins',
 'ky',
 'l',
 'ld',
 'let',
 'md',
 'mess',
 'name',
 'ob',
 'okie',
 'or',
 'osure',
 'out',
 'ow',
 'p',
 'part',
 'pat',
 'path',
 'put',
 'q',
 're',
 'red',
 'res',
 'rew',
 'ription',
 'rit',
 'rt',
 's',
 'sac',
 'sc',
 'sec',
 'serv',
 'son',
 'spot',
 'ss',
 'st',
 'static'

In [None]:
m = dict.fromkeys( general2vec.dictionary.token2id.keys(),0 ) #From traceability dataset!

In [None]:
n = general2vec.vocab

In [None]:
len(set(m.keys()))

3010

In [None]:
len(set(n.keys()))

32011

In [None]:
set(m.keys()) - set(n.keys())

set()

In [None]:
set(n.keys()) - set(m.keys())

{'▁locality',
 '▁member',
 'uguese',
 '▁mansfield',
 '▁fun',
 '▁kepler',
 '▁thakur',
 'avian',
 ',"',
 '▁2010:',
 'five',
 '▁fen',
 '▁complaints',
 'seat',
 'ancy',
 '▁damaging',
 '▁wahoo',
 '▁gaia',
 '▁palmi',
 '▁spice',
 '▁brethren',
 'othes',
 '▁qian',
 'oso',
 'ár',
 '▁thanksgiving',
 '▁divorce',
 '▁robert',
 '▁dela',
 'drum',
 '▁historical',
 '▁frum',
 '▁protects',
 '▁1992',
 '▁mughal',
 '▁collingwood',
 '▁pertaining',
 '▁schwe',
 '▁pembrokeshire',
 '▁pixel',
 '▁themed',
 '▁gondwana',
 '▁goalt',
 '▁curt',
 'fasc',
 '▁whig',
 '▁merger',
 '▁abuses',
 'abited',
 '▁guests',
 'opus',
 '▁julie',
 '▁paradise',
 '▁damp',
 'oustic',
 '▁gl',
 '▁arcadia',
 '▁keane',
 '▁colon',
 '▁smash',
 '▁encompassing',
 '▁17-',
 '▁5,000',
 'ayne',
 'mash',
 'graph',
 '▁fei',
 'psons',
 '▁16-',
 '▁mid',
 '▁pages',
 'cdc',
 '▁accommod',
 'poor',
 '▁tipperary',
 '▁ulrich',
 '▁kro',
 'ares',
 '▁concentr',
 '▁watson',
 '▁survivor',
 '▁prevalent',
 '▁hays',
 '▁precision',
 '▁tennis',
 'mond',
 '▁mell',
 '▁couns

In [None]:
len(set(m.keys()) - set(n.keys())) #TODO

0

In [None]:
assert len(set( m.keys()) - set(n.keys())) == 0 

In [None]:
general2vec.documents

[['▁production',
  '▁merge',
  '▁*',
  '▁feed',
  '▁release',
  '▁name',
  '▁through',
  '▁to',
  '▁up',
  'load',
  'b',
  'om',
  '▁(#',
  '29',
  '3',
  '▁)',
  '▁*',
  '▁allow',
  '▁append',
  '▁images',
  '▁(#',
  '28',
  '7',
  '▁)',
  '▁*',
  '▁dare',
  '▁test',
  '▁fields',
  '▁(#',
  '29',
  '4)'],
 ['▁add',
  '▁test',
  '▁fields',
  '▁for',
  '▁dare',
  '▁push',
  '▁*',
  '▁added',
  '▁test',
  '▁data',
  '▁to',
  '▁the',
  '▁j',
  'son',
  '▁being',
  '▁sent',
  '▁to',
  '▁dare',
  '▁when',
  '▁running',
  '▁static',
  '▁code',
  '▁analysis',
  '.',
  '▁example',
  '▁ps',
  'b',
  '▁data',
  '▁below',
  '.',
  '▁closes',
  '▁sac',
  'p',
  '/',
  'cs',
  'b',
  '-',
  'c',
  'ic',
  'dp',
  'ip',
  'eline',
  'ed',
  'ition',
  '#',
  '38',
  '1',
  '▁*',
  '▁added',
  '▁2',
  '▁additional',
  '▁sc',
  'f',
  '▁m',
  'app',
  'ings',
  '▁*',
  '▁[',
  'test',
  '▁builds',
  ']',
  '(',
  'https',
  '://',
  'eng',
  'ci',
  '-',
  'jen',
  'kins',
  '-',
  'rt',
  'p',
  '.'

In [None]:
len(general2vec.dictionary)

3010

In [None]:
general2vec.dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f2fd50763c8>

In [None]:
general2vec.df_all_system.head(1)

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"['▁production', '▁mer', 'ge', '▁*', '▁feed', '...","['▁production', '▁merge', '▁*', '▁feed', '▁rel...","['▁production', '▁merge', '▁*', '▁feed', '▁rel..."


In [None]:
general2vec.df_all_system.shape #data final tensor

(362, 7)

In [None]:
#tst for libest
path_to_ground_truth = parameters['path_mappings']
general2vec.ground_truth_processing(path_to_ground_truth)

[('210,', 'test/python/third_party/Corona_Report/license_Report.json')]

In [None]:
#tst for sacp <----- Warning!
#general2vec.ground_truth_processing(parameters['path_mappings'], from_mappings = True)

## 2. Artifacts Similarity with Word2Vec

In [None]:
#export
from collections import Counter
import dit
import math

In [None]:
#export
class Word2VecSeqVect(BasicSequenceVectorization):
    '''Pairwise metrics between a source and a target artifact backed by a trained Word2Vec model.

    The supported metrics are the keys of `dict_distance_dispatcher`. The metric methods defined in
    this class take two token lists and return a list of floats, which lets `distance` concatenate
    the results of several metrics into one flat row.

    NOTE(review): `cos_scipy`, `pearson_abs_scipy`, `vocab`, `dictionary`, `df_source`, `df_target`,
    `params` and `logging` are not defined in this class; presumably they are provided by
    BasicSequenceVectorization -- confirm against the base class.
    '''

    def __init__(self, params, logging):
        '''Load the model stored at params['path_to_trained_model'] and build the term-similarity
        structures required by the Soft Cosine Measure.'''
        super().__init__(params, logging)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        #Maps a metric identifier to the bound method that computes it
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi,
            EntropyMetric.MI: self.mutual_info
        }

    def wmd_gensim(self, sentence_a, sentence_b ):
        '''Word Mover's Distance between two token lists.
        Returns [distance, similarity], the similarity being computed by `wmd_similarity`.'''
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        '''Turn a distance into its associated similarity: 1 / (1 + dist).'''
        return 1./( 1.+float( dist ) ) #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim.
        Returns [1 - similarity, similarity].'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information.

        Builds a distribution from the per-token minimum count between both documents and
        returns [entropy, extropy] of that distribution. Both values are nan when the two
        documents do not share any token.'''
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')
        #Minimum Shared Tokens
        token_counts = { token: min(token_counts_1[token],token_counts_2[token]) for token in vocab }

        #The alphabet must follow the dict order because __get_freqs emits the frequencies in that
        #order; going through set() would pair outcomes with the wrong probabilities.
        alphabet = list(token_counts)
        frequencies = self.__get_freqs(token_counts)
        self.logging.info('frequencies processed')

        if not frequencies:
            #nan means that src and target do not share information at all
            entropies = float('nan')
            extropies = float('nan')
            self.logging.info('FREQUENCIES NOT COMPUTED!!!<--------------')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
            self.logging.info('scalar_distribution processed')

            entropies = dit.shannon.entropy( scalar_distribution )
            self.logging.info('entropies processed')

            extropies = dit.other.extropy( scalar_distribution )
            self.logging.info('extropies processed')
        return [entropies,extropies]

    def mutual_info(self, sentence_a, sentence_b):
        '''Computing the manifold of metric of information
        Mutual information
        Joint Information
        Conditioned Information Loss
        Conditioned Information Noise
        Self-Information

        Returns [entropy_source, entropy_target, joint_entropy, mutual_information, loss, noise].
        All six values are nan when one of the documents has no tokens, since no distribution
        can be built for it.
        '''
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')

        self.logging.info('vocab #'+ str(len(self.vocab.keys())))

        #Alphabets keep the Counter order so that they stay aligned with __get_freqs
        alphabet_source = list(token_counts_1)
        self.logging.info('alphabet_source #'+ str(len(alphabet_source)) )

        alphabet_target = list(token_counts_2)
        self.logging.info('alphabet_target #'+ str(len(alphabet_target)) )

        self.logging.info('diff src2tgt #'+ str(set(token_counts_1.keys()) - set(token_counts_2.keys())))
        self.logging.info('diff tgt2src #'+ str(set(token_counts_2.keys()) - set(token_counts_1.keys())))

        #Tokens outside the vocab enlarge only one of the alphabets, which trips this check
        assert( len(alphabet_source) ==  len(alphabet_target) )

        frequencies_source = self.__get_freqs( token_counts_1 )
        frequencies_target = self.__get_freqs( token_counts_2 )
        if not frequencies_source or not frequencies_target:
            #An empty document has no distribution; report nan like msi does instead of crashing
            self.logging.info('FREQUENCIES NOT COMPUTED!!!<--------------')
            return [float('nan')] * 6

        #Computing Self-Information (or Entropy)
        scalar_distribution_source = dit.ScalarDistribution(alphabet_source, frequencies_source)
        entropy_source = dit.shannon.entropy( scalar_distribution_source )

        scalar_distribution_target = dit.ScalarDistribution(alphabet_target, frequencies_target)
        entropy_target = dit.shannon.entropy( scalar_distribution_target )

        #Computing Joint-information (token counts of both documents are pooled over the vocab)
        token_counts = { token: (token_counts_1[token] + token_counts_2[token]) for token in vocab }
        alphabet = list(token_counts)
        self.logging.info('alphabet #'+ str(len(alphabet)))
        frequencies = self.__get_freqs(token_counts)
        scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
        joint_entropy = dit.shannon.entropy( scalar_distribution )

        #Computing Mutual-Information
        mutual_information = entropy_source + entropy_target - joint_entropy

        #NOTE(review): EntropyMetric.Loss is commented as the entropy conditioned on the output,
        #which would be joint_entropy - entropy_target; noise and loss may be swapped here.
        #Left as-is because saved results depend on the current order -- confirm.
        #Computing Noise
        noise = joint_entropy - entropy_target

        #Computing Loss
        loss = joint_entropy - entropy_source

        return [entropy_source, entropy_target, joint_entropy,
                mutual_information, loss, noise]

    def distance(self, metric_list,link):
        '''Iterate on the metrics.

        @metric_list keys of `dict_distance_dispatcher` to be computed
        @link tuple (source id, target id)
        Returns the results of all the metrics concatenated in the order of metric_list.
        Raises ValueError when the configured preprocessing is neither conv nor bpe.'''
        import ast  #local import: only required to parse serialized BPE token lists

        config = self.params['system_path_config']
        ids = config['names'][0]
        txt = config['names'][1]

        if config['prep'] == Preprocessing.conv: #conventional preprocessing: whitespace separated tokens
            tokenize = lambda text: text.split()
        elif config['prep'] == Preprocessing.bpe: #bpe: the column holds the repr of a token list
            tokenize = ast.literal_eval #parses literals only, unlike eval
        else:
            raise ValueError('Unsupported preprocessing: ' + str(config['prep']))

        sentence_a = tokenize( self.__artifact_text(self.df_source, ids, txt, link[0]) )
        sentence_b = tokenize( self.__artifact_text(self.df_target, ids, txt, link[1]) )

        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        self.logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    @staticmethod
    def __artifact_text(df, ids, txt, key):
        '''Text of the artifact identified by key.
        An exact id match wins, so that id '54' is not resolved to '154'; the regex substring
        lookup is only the fallback for keys that are fragments of the stored id.'''
        rows = df[df[ids] == key]
        if rows.empty:
            rows = df[df[ids].str.contains(key)]
        return rows[txt].values[0]

    #################################3TODO substitute this block in the future by importing information science module
    def __get_cnts(self, toks, vocab):
        '''@danaderp
        Counts tokens within ONE document.
        The counter is seeded with vocab, so every vocab token is a key of the result.'''
        cnt = Counter(vocab)
        for tok in toks:
            cnt[tok] += 1
        return cnt

    def __get_freqs(self, dict_token_counts):
        '''Relative frequency of every token, in the iteration order of dict_token_counts.
        Returns an empty list when the counts add up to zero.'''
        num_tokens = sum( dict_token_counts.values() ) #number of subwords inside the document
        if num_tokens == 0.0:
            frequencies = []
            self.logging.info('---------------> NO SHARED INFORMATION <-------------------------')
        else:
            frequencies = [ (dict_token_counts[token])/num_tokens for token in dict_token_counts ]
        return frequencies
    #################################3


In [None]:
#export
def LoadLinks(timestamp, params, logging, grtruth=False, sep=' ' ):
    '''Read back a link computation that was saved as csv.

    @timestamp is the version of the model for a given system
    @params provides 'saving_path', 'system', 'vectorizationType' and 'linkType' for the file name
    @grtruth is part of the file name as well (True/False)
    @sep column separator of the saved file
    Returns a pandas DataFrame indexed by the first column of the file.'''
    name_parts = [
        params['saving_path'],
        '[',
        params['system'],
        '-',
        str(params['vectorizationType']),
        '-',
        str(params['linkType']),
        '-',
        str(grtruth),
        '-{}].csv'.format(timestamp),
    ]
    path = ''.join(name_parts)

    logging.info("Loading computed links from... "+ path)

    links_frame = pd.read_csv(path, header=0, index_col=0, sep=sep)
    return links_frame

### Testing Word2Vec SequenceVectorization

In [None]:
#hide
#tst
#Scratch check of how per-link results are flattened with functools.reduce
metric_list = ['a','b']
A = [[1,3,4],[4,5],[1,8,9,7]]
B = ((1,3,4),(4,5),(1,8,9,7))
functools.reduce(lambda a,b : a+b, B)
dist_sim_T = [([12,13],['metric1','metric2']),([12,13],['metric1','metric2'])]
dist_sim_T
#The accumulator is the merged list itself (seeded with []), so this works for any number of
#(values, labels) pairs; indexing the accumulator with [1] only worked for exactly two pairs.
separated_merged_list_a = functools.reduce(lambda merged, pair : merged + pair[1], dist_sim_T, [])
separated_merged_list_a

['metric1', 'metric2', 'metric1', 'metric2']

In [None]:
#[step 1]Creating the Vectorization Class
word2vec = Word2VecSeqVect( params = parameters, logging = logging )

2021-01-16 14:51:48,454 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-16 14:51:48,528 : INFO : built Dictionary(3010 unique tokens: ['28', '29', '3', '4)', '7']...) from 362 documents (total 171602 corpus positions)
2021-01-16 14:51:48,752 : INFO : Ignored vocab by BPE{'\r\n\r\n', '`', '\r\n\r\n@', '^', '\r\n', '\t', 'γ', '```', '\r\n\r\n\r\n', '\\', '@'}
2021-01-16 14:51:48,753 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus
2021-01-16 14:51:48,754 : INFO : loading Word2Vec object from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model
2021-01-16 14:51:49,323 : INFO : loading wv recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model.wv.* with mmap=None
2021-01-16 14:51:49,324 : INFO : setting ignored attribute vectors_norm to None
2021-01-16 14:51:49,325 : INFO : loading vocabulary recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py

In [None]:
len(word2vec.new_model.wv.vocab)

20284

In [None]:
word2vec.df_source['ids'][35]

'255'

In [None]:
word2vec.df_source['ids'][35] #In LIBEST REQ starts at 35

'255'

In [None]:
ids = parameters['system_path_config']['names'][0]
txt = parameters['system_path_config']['names'][1]
print(ids,txt)

ids bpe32k


In [None]:
idss = word2vec.df_source[ids][35] #Selecting an ID
idss = word2vec.df_source[ids] == idss #Search for an specific ID
list(word2vec.df_source[idss][txt])[0].split() #Retrieving text and splitting

["['▁update',",
 "'▁corona',",
 "'▁bom',",
 "'▁end',",
 "'point',",
 "'▁#',",
 "'25',",
 "'4',",
 "'▁corona',",
 "'▁updated',",
 "'▁change',",
 "'▁the',",
 "'▁api',",
 "'▁to',",
 "'▁get',",
 "'▁the',",
 "'▁bom',",
 "'▁for',",
 "'▁a',",
 "'▁release',",
 "'.',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁is',",
 "'▁',",
 "'`',",
 "'/',",
 "'release',",
 "'/',",
 "':',",
 "'id',",
 "'/',",
 "'aim',",
 "'_',",
 "'b',",
 "'om',",
 "'_',",
 "'re',",
 "'port',",
 "'.',",
 "'j',",
 "'son',",
 "'`',",
 "'.',",
 "'▁changed',",
 "'▁the',",
 "'▁getting',",
 "'▁the',",
 "'▁bom',",
 "'▁to',",
 "'▁use',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁and',",
 "'▁other',",
 "'▁code',",
 "'▁to',",
 "'▁par',",
 "'se',",
 "'▁the',",
 "'▁new',",
 "'▁format',",
 "'.',",
 "'▁the',",
 "'▁new',",
 "'▁end',",
 "'point',",
 "'▁returns',",
 "'▁the',",
 "'▁bom',",
 "'▁format',",
 "'▁differently',",
 "'.',",
 "'▁so',",
 "'▁if',",
 "'▁the',",
 "'▁old',",
 "'▁bom',",
 "'▁reports',",
 "'▁saved',",
 "

In [None]:
word2vec.df_source.head(2)

Unnamed: 0,ids,bpe32k
0,295,"['▁production', '▁merge', '▁*', '▁feed', '▁rel..."
1,294,"['▁add', '▁test', '▁fields', '▁for', '▁dare', ..."


In [None]:
word2vec.df_target.head(2)

Unnamed: 0,ids,bpe32k
1,sacp-python-common/sacp_python_common/auth_uti...,"['▁""""""', '\r\n', 'created', '▁on', '▁aug', '▁1..."
3,sacp-python-common/sacp_python_common/bandit/b...,"['▁#', '!', '/', 'us', 'r', '/', 'bin', '/', '..."


In [None]:
links = word2vec.samplingLinks(sampling=True, samples = 2)
links

[('194', 'sacp-python-common/sacp_python_common/third_party/ipCentralScan.py'),
 ('54', 'sacp-python-common/test/python/custom_scan/test_customScan.py')]

In [None]:
print( len(links), word2vec.df_source.shape, word2vec.df_target.shape )

2 (288, 2) (74, 2)


In [None]:
links[0][0]

'166'

In [None]:
#tst
#NOTE: txt was printed as 'bpe32k' in an earlier cell, so the column holds a serialized token list;
#.split() is the conventional-preprocessing path and leaves brackets/quotes on the tokens (see the output).
word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0].split() #conventional
#eval(word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0]) #BPE

["['▁maybe',",
 "'▁need',",
 "'▁merge',",
 "'▁(#',",
 "'11',",
 "'0)',",
 "'▁*',",
 "'▁ur',",
 "'ll',",
 "'ib',",
 "'3',",
 "'▁issue',",
 "'▁*',",
 "'▁trial',",
 "'▁2',",
 "'▁*',",
 "'▁test',",
 "'_',",
 "'t',",
 "'rial',",
 "'1',",
 "'▁*',",
 "'▁trial',",
 "'▁3',",
 "'▁*',",
 "'▁more',",
 "'▁tests']"]

In [None]:
#tst
word2vec.df_target[word2vec.df_target[ids].str.contains(links[0][1])][txt].values[0].split()

["['▁#',",
 "'!',",
 "'/',",
 "'us',",
 "'r',",
 "'/',",
 "'bin',",
 "'/',",
 "'en',",
 "'v',",
 "'▁python',",
 "'2',",
 "'\\r\\n',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'#',",
 "'\\r\\n',",
 '\'"\',',
 '\'"\',',
 '\'"\',',
 "'\\r\\n',",
 "'read',",
 "'▁the',",
 "'▁reports',",
 "'▁from',",
 "'▁a',",
 "'▁file',",
 "'▁locat

In [None]:
metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I,EntropyMetric.MI]
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]

In [None]:
#[optional] computeDistanceMetric Testing [WARNING!] Time Consuming!!
computeDistanceMetric = word2vec.computeDistanceMetric(links, metric_list = metric_list )
computeDistanceMetric

2021-01-16 14:52:17,433 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-16 14:52:17,439 : INFO : built Dictionary(382 unique tokens: ['-', '0)', '1)', '19', '3)']...) from 2 documents (total 3269 corpus positions)
2021-01-16 14:52:17,847 : INFO : token count processed
2021-01-16 14:52:17,874 : INFO : frequencies processed
2021-01-16 14:52:27,484 : INFO : scalar_distribution processed
2021-01-16 14:52:27,485 : INFO : entropies processed
2021-01-16 14:52:27,486 : INFO : extropies processed
2021-01-16 14:52:27,493 : INFO : token count processed
2021-01-16 14:52:27,494 : INFO : vocab #32011
2021-01-16 14:52:27,497 : INFO : alphabet_source #32011
2021-01-16 14:52:27,501 : INFO : alphabet_target #32011
2021-01-16 14:52:27,507 : INFO : diff src2tgt #set()
2021-01-16 14:52:27,513 : INFO : diff tgt2src #set()
2021-01-16 14:52:46,505 : INFO : alphabet #32011
2021-01-16 14:52:55,983 : INFO : Computed distances or similarities ('194', 'sacp-python-common/sacp_python_common/t

([['194',
   'sacp-python-common/sacp_python_common/third_party/ipCentralScan.py',
   1.2426984308419065,
   0.44589142536859094,
   0.9026784226298332,
   0.09732158,
   2.1280852788913944,
   1.2238339714721664,
   4.891851946382239,
   6.8153433747477745,
   6.881665170551628,
   4.825530150578386,
   1.9898132241693887,
   0.06632179580385333],
  ['54',
   'sacp-python-common/test/python/custom_scan/test_customScan.py',
   1.1587324691252452,
   0.4632347983375712,
   0.7911811470985413,
   0.20881885,
   3.9705730958116847,
   1.3904984042298727,
   5.7600413338019525,
   6.6236746347295465,
   6.820964686948631,
   5.562751281582868,
   1.0609233531466788,
   0.1972900522190848]],
 [<DistanceMetric.WMD: 1>,
  <SimilarityMetric.WMD_sim: 1>,
  <DistanceMetric.SCM: 3>,
  <SimilarityMetric.SCM_sim: 3>,
  <EntropyMetric.MSI_I: 1>,
  <EntropyMetric.MSI_X: 2>,
  <EntropyMetric.Entropy_src: 7>,
  <EntropyMetric.Entropy_tgt: 8>,
  <EntropyMetric.JI: 4>,
  <EntropyMetric.MI: 3>,
  <Entropy

In [None]:
#[step 2]NonGroundTruth Computation
word2vec.ComputeDistanceArtifacts( sampling=True, samples = 20, metric_list = metric_list )
word2vec.df_nonground_link.head()

In [None]:
word2vec.df_nonground_link.head()

In [None]:
#tst 
#df_mapping = pd.read_csv(parameters['path_mappings'], header = 0, sep = ',')
#ground_links = word2vec.ground_truth_processing(from_mappings='True') #<---- SACP
ground_links = word2vec.ground_truth_processing(path_to_ground_truth) #<---- LIBEST
ground_links

In [None]:
len(ground_links)

In [None]:
#tst from non-links file
df_x = LoadLinks(timestamp=1610408791.737875, params=parameters,logging=logging)
df_x.head(1)

In [None]:
word2vec.df_ground_link = df_x.copy()
word2vec.df_nonground_link = df_x.copy()

In [None]:
word2vec.df_nonground_link.size

In [None]:
#tst
df_x = word2vec.df_nonground_link
df_x.head()

In [None]:
df_x['Source'].values[11366]

In [None]:
df_x['Target'].values[11366]

In [None]:
test_source = 'UC19E1.txt' #'UC10.TXT'
test_target = '/src/edu/ncsu/csc/itrust/dao/mysql/AuthDAO.java'#'RicercaStandard.java'

In [None]:
#tst
test_source = 'UC23E2.txt' #'UC10.TXT'
test_target = '/WebRoot/util/getUser.jsp'#'RicercaStandard.java'

In [None]:
df_x[ df_x["Target"].str.contains('AuthDAO.java', regex=False) == True]

In [None]:
df_x[ df_x["Source"].str.contains('UC23E2.txt', regex=False) == True]

In [None]:
df_x[( df_x["Source"].str.contains(test_source) ) & (df_x["Target"].str.contains(test_target, regex=False))]

In [None]:
ground_links[0][0]

In [None]:
ground_links[0][1]

In [None]:
#tst
df_x[( df_x["Source"].str.contains(ground_links[0][0]) ) & (df_x["Target"].str.contains(ground_links[0][1], regex=False))]

In [None]:
# Find matching for etour
def find_index_gt( tuple_g, df=None ):
    '''Index values of the rows whose Source contains '/' + tuple_g[0] and whose Target
    contains '/' + tuple_g[1].

    Both fragments are matched literally, so a '.' in a file name is not treated as a
    regex wildcard.
    @tuple_g ground-truth pair (source name, target name)
    @df frame with 'Source' and 'Target' columns; defaults to the notebook-level df_x
    Returns a numpy array of index labels (empty when nothing matches).'''
    frame = df_x if df is None else df
    source_hit = frame["Source"].str.contains('/' + tuple_g[0], regex=False)
    target_hit = frame["Target"].str.contains('/' + tuple_g[1], regex=False)
    return frame.loc[source_hit & target_hit].index.values
#dist

In [None]:
# Find matching for point-based groundtruth itrust/smos
def find_index_gt_point( tuple_g, df=None ):
    '''Index values of the rows whose Source contains tuple_g[0] and whose Target contains tuple_g[1].

    Both fragments are matched literally, so a '.' in a file name is not treated as a
    regex wildcard.
    @tuple_g ground-truth pair (source name, target name)
    @df frame with 'Source' and 'Target' columns; defaults to the notebook-level df_x
    Returns a numpy array of index labels (empty when nothing matches).'''
    frame = df_x if df is None else df
    source_hit = frame["Source"].str.contains(tuple_g[0], regex=False)
    target_hit = frame["Target"].str.contains(tuple_g[1], regex=False)
    return frame.loc[source_hit & target_hit].index.values

In [None]:
#Formatted for Semeru mode A
matchGT = [ find_index_gt( g) for g in word2vec.ground_truth_processing(path_to_ground_truth)]
#matchGT = [ find_index_gt_point( g) for g in word2vec.ground_truth_processing(path_to_ground_truth)]
matchGT

In [None]:
df_x[['Source','Target']].iloc[3962][0]

In [None]:
df_x[['Source','Target']].iloc[3962][1]

In [None]:
df_x[['Source','Target']].iloc[3978][0]

In [None]:
df_x[['Source','Target']].iloc[3978][1]

In [None]:
#Formatted for Semeru mode B
matchGT = [ word2vec.findDistInDF( g , semeru_format=True, from_mappings=False ) for g in word2vec.ground_truth_processing(path_to_ground_truth)]
matchGT

In [None]:
#Formatted for SACP
matchGT = [ word2vec.findDistInDF( g , from_mappings=True ) for g in word2vec.ground_truth_processing(from_mappings=True)]
matchGT

In [None]:
matchGT = functools.reduce(lambda a,b : np.concatenate([a,b]), matchGT) #Concatenate indexes
matchGT

In [None]:
new_column = pd.Series(np.full([len(matchGT)], 1 ), name=word2vec.params['names'][2], index = matchGT)

In [None]:
new_column

In [None]:
new_column.size

In [None]:
#Some of the mappings are not found in the non-link list because the mappings contain the full ground truth of the issues,
#which might include files that were not taken into account in the non-links part
matchGT_ = [ (g,word2vec.findDistInDF( g , from_mappings=True )) for g in word2vec.ground_truth_processing(from_mappings=True)]

In [None]:
matchGT_

In [None]:
len(matchGT)

In [None]:
#[step 3]Saving Non-GroundTruth Links
word2vec.SaveLinks()

In [None]:
#Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
df_nonglinks = LoadLinks(timestamp=1608688471.437005, params=parameters, logging = logger)
df_nonglinks.head()

In [None]:
#[step 4]GroundTruthMatching Testing
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
word2vec.df_ground_link

In [None]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings <----- Warning SACP
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

In [None]:
#Keep only the ground-truth rows whose values are all finite (no nan / inf / -inf)
df_z = word2vec.df_ground_link
#axis is passed by keyword: the positional form .any(1) is rejected by pandas 2
df_z[~df_z.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [None]:
#debug
df_y = word2vec.df_ground_link.copy()
df_y

In [None]:
#debug
df_y.update(new_column)

In [None]:
new_column

In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1]

In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1].shape #Positive Links

In [None]:
#[optional]GroundTruth Direct Processing
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links[141] # A tuple

In [None]:
#Inspecting Source
ground_links[141][0][:ground_links[141][0].find('.')] + '-'

In [None]:
#Inspecting Target
ground_links[141][1][:ground_links[141][1].find('.')]

In [None]:
#[step 5]Saving GroundTruth Links
word2vec.SaveLinks(grtruth = True)

In [None]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_glinks = LoadLinks(timestamp=1609858614.151381, params=parameters,logging=logging,grtruth = True)
df_glinks.head()

In [None]:
df_glinks[df_glinks["Linked?"] == 0]

## 3. Artifacts Similarity with Doc2Vec

Try to reproduce the empirical evaluation from [this paper](https://arxiv.org/pdf/1507.07998.pdf). Pay attention to:
- Accuracy vs. Dimensionality (we can replace accuracy for false positive rate or true positive rate)
- Visualize paragraph vectors using t-sne
- Computing Cosine Distance and Similarity. More about similarity [link](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html)

In [None]:
#experiment 0.0.1

In [None]:
path_to_trained_model = path_data+'/models/pv/bpe8k/[doc2vec-Py-Java-PVDBOW-500-20E-8k-1594572857.17191].model'

In [None]:
def doc2vec_params():
    '''Build the configuration dict for the Doc2Vec experiment on the libest corpus.

    NOTE: the next cell rebinds the name `doc2vec_params` to the dict returned here, so the
    call in that cell fails if it is run a second time without re-running this definition.'''
    settings = {}
    settings["vectorizationType"] = VectorizationType.doc2vec
    settings["linkType"] = LinkType.req2tc
    settings["system"] = 'libest'
    #Where the corpus lives and how its csv has to be read
    settings["system_path_config"] = dict(
        system_path = path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        sep = '~',
        names = ['ids','conv'],
        prep = Preprocessing.conv,
    )
    settings["source_type"] = SoftwareArtifacts.REQ.value
    settings["target_type"] = SoftwareArtifacts.TC.value
    settings["path_to_trained_model"] = path_to_trained_model
    settings["saving_path"] = path_data + 'metrics/traceability/experiments0.0.x/'
    settings["names"] = ['Source','Target','Linked?']
    settings["model_prefix"] = path_model_prefix #For BPE Analysis
    return settings

In [None]:
doc2vec_params = doc2vec_params()
doc2vec_params

In [None]:
#Export
class Doc2VecSeqVect(BasicSequenceVectorization):
    '''Pairwise metrics between a source and a target artifact backed by a trained Doc2Vec model.

    Usage: create the object, call `InferDoc2Vec` once to infer one vector per artifact and
    then compute the metrics listed in `dict_distance_dispatcher` through `distance`.

    NOTE(review): `cos_scipy`, `pearson_abs_scipy`, `euclidean_scipy`, `manhattan_scipy`,
    `df_source`, `df_target`, `params` and `logging` are not defined in this class; presumably
    they are provided by BasicSequenceVectorization -- confirm against the base class.
    '''

    def __init__(self, params, logging):
        '''Load the model stored at params['path_to_trained_model'].'''
        super().__init__(params, logging)
        self.new_model = gensim.models.Doc2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors of the doc2vec model.
        #Filled by InferDoc2Vec: copies of df_source / df_target plus an 'inf-doc2vec' column
        self.df_inferred_src = None
        self.df_inferred_trg = None

        #Maps a metric identifier to the bound method that computes it
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.EUC: self.euclidean_scipy,
            DistanceMetric.MAN: self.manhattan_scipy
        }
        self.logging.info("d2v loaded")

    def distance(self, metric_list, link):
        '''Iterate on the metrics.

        @metric_list keys of `dict_distance_dispatcher` to be computed
        @link tuple (source id, target id)
        Returns the results of all the metrics concatenated in the order of metric_list.
        Raises RuntimeError when InferDoc2Vec has not been called yet.'''
        if self.df_inferred_src is None or self.df_inferred_trg is None:
            raise RuntimeError('InferDoc2Vec() must be called before distance()')

        inferred_source = self.__inferred_vector(self.df_inferred_src, link[0])
        inferred_target = self.__inferred_vector(self.df_inferred_trg, link[1])

        dist = [ self.dict_distance_dispatcher[metric](inferred_source,inferred_target) for metric in metric_list]
        self.logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    @staticmethod
    def __inferred_vector(df, key):
        '''Inferred vector of the artifact identified by key.
        An exact id match wins, so that id '54' is not resolved to '154'; the regex substring
        lookup is only the fallback for keys that are fragments of the stored id.'''
        rows = df[df['ids'] == key]
        if rows.empty:
            rows = df[df['ids'].str.contains(key)]
        return rows['inf-doc2vec'].values[0]

    def InferDoc2Vec(self, steps=200):
        '''Activate Inference on Target and Source Corpus.

        @steps forwarded to the model's infer_vector
        Stores the results in df_inferred_src / df_inferred_trg (column 'inf-doc2vec').'''
        import ast  #local import: only required to parse serialized BPE token lists

        self.df_inferred_src = self.df_source.copy()
        self.df_inferred_trg = self.df_target.copy()

        text = self.params['system_path_config']['names'][1]
        if self.params['system_path_config'].get('prep') == Preprocessing.bpe:
            #bpe columns hold the repr of a token list; splitting on whitespace would leave
            #brackets and quotes attached to every token
            tokenize = ast.literal_eval
        else:
            tokenize = lambda artifact: artifact.split()

        self.df_inferred_src['inf-doc2vec'] =  [self.new_model.infer_vector(tokenize(artifact),steps=steps) for artifact in self.df_inferred_src[text].values]
        self.df_inferred_trg['inf-doc2vec'] =  [self.new_model.infer_vector(tokenize(artifact),steps=steps) for artifact in self.df_inferred_trg[text].values]

        self.logging.info("Infer Doc2Vec on Source and Target Complete")

### Testing Doc2Vec SequenceVectorization

In [None]:
doc2vec = Doc2VecSeqVect(params = doc2vec_params, logging = logger)

In [None]:
doc2vec.df_source.head(2)

In [None]:
#[step1]Apply Doc2Vec Inference
doc2vec.InferDoc2Vec( steps = 200 )

In [None]:
doc2vec.df_inferred_src.head(2)

In [None]:
len(doc2vec.df_inferred_src['inf-doc2vec'].values[35])

In [None]:
len(doc2vec.df_inferred_src['inf-doc2vec'].values[36])

In [None]:
#test_inferDoc2Vec_trg = inferDoc2Vec(df_target)
#test_inferDoc2Vec_trg.head()
doc2vec.df_inferred_trg.head(2)

In [None]:
#tst correlation
pearsonr(doc2vec.df_inferred_trg['inf-doc2vec'][0], doc2vec.df_inferred_trg['inf-doc2vec'][0])

In [None]:
len(doc2vec.df_inferred_src['inf-doc2vec'])

In [None]:
pearsonr(doc2vec.df_inferred_trg['inf-doc2vec'][0], doc2vec.df_inferred_src['inf-doc2vec'][35])

In [None]:
#[step 2]NonGroundTruth Computation
metric_l = [DistanceMetric.EUC,DistanceMetric.COS,DistanceMetric.MAN, SimilarityMetric.Pearson]
doc2vec.ComputeDistanceArtifacts( sampling=False, samples = 50, metric_list = metric_l )
doc2vec.df_nonground_link.head()

In [None]:
#[step 3]Saving Non-GroundTruth Links
doc2vec.SaveLinks()

In [None]:
#Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
df_nonglinks_doc2vec = LoadLinks(timestamp=1608688610.900933, params=doc2vec_params, logging = logger)
df_nonglinks_doc2vec.head()

In [None]:
#[step 4]GroundTruthMatching Testing
doc2vec.MatchWithGroundTruth(path_to_ground_truth)
doc2vec.df_ground_link

In [None]:
#[step 5]Saving GroundTruth Links
doc2vec.SaveLinks(grtruth = True)

In [None]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_glinks_doc2vec = LoadLinks(timestamp=1608688652.964024, params=doc2vec_params, logging = logger, grtruth = True)
df_glinks_doc2vec.head()