In [None]:
# default_exp mining.ir

# Information Retrieval and Traceability Interfaces
> Implementing Common Information Retrieval Interfaces
> Author: @danaderp December 2020

We test different distance and similarity measures, based on this [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and this [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html).

In [None]:
! pip install -e .

[31mERROR: File "setup.py" not found. Directory cannot be installed in editable mode: /tf/main/nbs[0m


In [None]:
#export
import ast
import functools
import os
from itertools import product
from random import sample

import gensim
import numpy as np
import pandas as pd
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora
from datetime import datetime
from enum import Enum, unique, auto
from ds4se.mgmnt.prep.conv import *

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Enums

In [None]:
#export
#@unique
class VectorizationType(Enum):
    '''Vectorization strategies an experiment can be configured with.

    The integer values are the ones `auto()` assigns (1, 2, 3, in declaration order).
    '''
    word2vec = 1
    doc2vec = 2
    vsm2vec = 3

In [None]:
VectorizationType.word2vec

<VectorizationType.word2vec: 1>

In [None]:
#export
#@unique
class DistanceMetric(Enum):
    '''Identifiers of the distance metrics that can be requested for a link.

    The integer values are the ones `auto()` assigns (1..5, in declaration order).
    '''
    WMD = 1  # word mover's distance
    COS = 2  # cosine distance
    SCM = 3  # soft cosine measure
    EUC = 4  # euclidean distance
    MAN = 5  # manhattan (cityblock) distance

In [None]:
#export
#@unique
class SimilarityMetric(Enum):
    '''Identifiers of the similarity scores reported next to each distance.

    The integer values are the ones `auto()` assigns (1..6, in declaration order).
    '''
    WMD_sim = 1
    COS_sim = 2
    SCM_sim = 3
    EUC_sim = 4
    MAN_sim = 5
    Pearson = 6

In [None]:
#export
class EntropyMetric(Enum):
    '''Identifiers of the information-theoretic measures computed for a link.

    The integer values are the ones `auto()` assigns (1..4, in declaration order).
    '''
    MSI_I = 1  # Minimum shared information Entropy
    MSI_X = 2  # Minimum shared information Extropy
    MI = 3     # Mutual information
    JI = 4     # Joint information

In [None]:
#export
class SoftwareArtifacts(Enum):
    '''Artifact kinds; each value is the string compared against the corpus `type` column.'''
    REQ = 'req'  # presumably requirements -- confirm against the corpus
    TC = 'tc'    # presumably test cases -- confirm against the corpus
    SRC = 'src'  # presumably source code -- confirm against the corpus
    PY = 'py'    # presumably Python files -- confirm against the corpus
    PR = 'pr'    # presumably pull requests -- confirm against the corpus

In [None]:
#export
#@unique
class Preprocessing(Enum):
    '''Kind of preprocessing stored in the corpus column an experiment reads.

    The integer values are the ones `auto()` assigns (1, 2, in declaration order).
    '''
    conv = 1  # conventional preprocessing: whitespace-separated tokens
    bpe = 2   # BPE preprocessing: stringified list of sub-word tokens

In [None]:
#export
#@unique
class LinkType(Enum):
    '''Source-to-target artifact pairings a traceability experiment can address.

    The integer values are the ones `auto()` assigns (1..4, in declaration order).
    '''
    req2tc = 1
    req2src = 2
    issue2src = 3
    pr2src = 4

In [None]:
#tst
LinkType.req2tc

<LinkType.req2tc: 1>

In [None]:
#tst
Preprocessing.bpe

<Preprocessing.bpe: 2>

## 1. Setting-Up Testing Environment

In [None]:
#hide
path_data = '../dvc-ds4se/' #dataset path

In [None]:
#hide
#Experiment 1A (LIBEST)
#path_to_trained_model = '/tf/data/test_data_traceability/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model'
path_to_trained_model = path_data+'models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model'
#path_to_trained_model = path_data+'models/wv/bpe128k/[word2vec-Java-Py-Wiki-SK-500-20E-128k[0]-1594923236.007244].model'
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'

In [None]:
#hide
#Experiment 1 with Libest Conv preprocessing
def libest_params():
    '''Build the parameter dictionary for the LibEST experiment (conventional preprocessing).

    Returns a fresh dict on every call; paths are derived from the notebook-level
    `path_data`, `path_to_trained_model` and `path_model_prefix` variables.
    '''
    # Where the corpus lives and which columns/preprocessing to read from it.
    corpus_config = dict(
        system_path = path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        sep = '~',
        names = ['ids','conv'],
        prep = Preprocessing.conv,
    )

    # NOTE(review): linkType is req2src although target_type is TC and the
    # ground truth file is req-to-tc -- confirm which one is intended.
    # NOTE(review): path_mappings is an absolute path, unlike the other
    # entries which are relative to path_data.
    experiment = dict(
        vectorizationType = VectorizationType.word2vec,
        linkType = LinkType.req2src,
        system = 'libest',
        path_to_trained_model = path_to_trained_model,
        source_type = SoftwareArtifacts.REQ.value,
        target_type = SoftwareArtifacts.TC.value,
        system_path_config = corpus_config,
        saving_path = path_data + 'se-benchmarking/traceability',
        names = ['Source','Target','Linked?'],
        model_prefix = path_model_prefix, #For BPE Analysis
        path_mappings = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt',
    )
    return experiment

In [None]:
#hide
parameters = libest_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.req2src: 2>,
 'system': 'libest',
 'path_to_trained_model': '../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model',
 'source_type': 'req',
 'target_type': 'tc',
 'system_path_config': {'system_path': '../dvc-ds4se/se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
  'sep': '~',
  'names': ['ids', 'conv'],
  'prep': <Preprocessing.conv: 1>},
 'saving_path': '../dvc-ds4se/se-benchmarking/traceability',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_8k',
 'path_mappings': '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'}

In [None]:
parameters['source_type']

'req'

In [None]:
#tst
parameters['system_path_config']['system_path']

'../dvc-ds4se/se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv'

In [None]:
#tst
parameters['system_path_config']['names'][1]

'conv'

In [None]:
parameters['system_path_config']['sep'] #tst

'~'

In [None]:
#hide
# Load the whole corpus (all artifact types) into one frame.
# The column names come from the file's first row (header = 0) and the
# first column is used as the index.
df_all_system = pd.read_csv(
            parameters['system_path_config']['system_path'], 
            #names = params['system_path_config']['names'], #include the names into the files!!!
            header = 0, 
            index_col = 0, 
            sep = parameters['system_path_config']['sep'] 
        )

In [None]:
df_all_system.head(1)

Unnamed: 0,ids,filenames,text,type,conv,bpe128k,bpe32k,bpe8k
0,test_data/LibEST_semeru_format/test/us903.c,us903.c,/*--------------------------------------------...,tc,unit test user stori server simpl enrol august...,"['▁/*', '----------------', '----------------'...","['▁/', '*', '--------', '--------', '--------'...","['▁/', '*', '-', '-', '-', '-', '-', '-', '-',..."


In [None]:
#hide
tag = parameters['system_path_config']['names'][1]
[doc.split() for doc in df_all_system[df_all_system[tag].notnull()][tag].values]

[['unit',
  'test',
  'user',
  'stori',
  'server',
  'simpl',
  'enrol',
  'august',
  'copyright',
  'cisco',
  'system',
  'inc',
  'right',
  'reserv',
  'includ',
  'stdio',
  'ifndef',
  'win',
  'includ',
  'unistd',
  'endif',
  'includ',
  'est',
  'includ',
  'curl',
  'curl',
  'includ',
  'curl',
  'util',
  'includ',
  'test',
  'util',
  'includ',
  'server',
  'includ',
  'openssl',
  'ssl',
  'ifdef',
  'cunit',
  'includ',
  'cunit',
  'basic',
  'includ',
  'cunit',
  'autom',
  'endif',
  'ifndef',
  'win',
  'static',
  'char',
  'test',
  'outfil',
  'filenam',
  'max',
  'test',
  'hdr',
  'defin',
  'cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'explicit_cert',
  'us903',
  'cert',
  'pem',
  'defin',
  'us903_explicit_key',
  'us903',
  'key',
  'pem',
  'defin',
  'us903_cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'us903_trusted_cert',
  'trustedcert',
  'crt',
  'defin',
  'est',
  'privat',
  'estservercertandkey',
  'pem',
  'els',
  'static'

In [None]:
len(df_all_system[tag].values) #tst

87

In [None]:
#tst
len(df_all_system[df_all_system[tag].notnull()]) #some files are _init_ thefore are empty

87

In [None]:
#tst
df_all_system[df_all_system[tag].notnull()][tag].values

array(['unit test user stori server simpl enrol august copyright cisco system inc right reserv includ stdio ifndef win includ unistd endif includ est includ curl curl includ curl util includ test util includ server includ openssl ssl ifdef cunit includ cunit basic includ cunit autom endif ifndef win static char test outfil filenam max test hdr defin cacert est cacert crt defin explicit_cert us903 cert pem defin us903_explicit_key us903 key pem defin us903_cacert est cacert crt defin us903_trusted_cert trustedcert crt defin est privat estservercertandkey pem els static char test5_outfil filename_max us903 test5 hdr defin us903_cacert est cacert crt defin us903_explicit_cert us903 cert pem defin us903_explicit_key us903 key pem defin us903_cacert est cacert crt defin us903_trusted_cert trustedcert crt defin est privat estservercertandkey pem endif static unsign char cacert null static int cacerts_len defin us903_retry_interv 3600 defin us903_tcp_port 29001 follow csr generat use follow o

In [None]:
#tst
df_all_system.loc[df_all_system['type'] == parameters['source_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,conv
35,test_data/LibEST_semeru_format/requirements/RQ...,requir http uri control est server must suppor...
36,test_data/LibEST_semeru_format/requirements/RQ...,requir server side key generat respons request...
37,test_data/LibEST_semeru_format/requirements/RQ...,requir http base client authent est server may...
38,test_data/LibEST_semeru_format/requirements/RQ...,requir csr attribut request est client request...
39,test_data/LibEST_semeru_format/requirements/RQ...,requir server side key generat est client may ...
40,test_data/LibEST_semeru_format/requirements/RQ...,requir client author decis issu certif client ...
41,test_data/LibEST_semeru_format/requirements/RQ...,requir csr attribut polici may allow inclus cl...
42,test_data/LibEST_semeru_format/requirements/RQ...,requir simpl enrol client https post simpleenr...
43,test_data/LibEST_semeru_format/requirements/RQ...,requir csr attribut follow exampl valid csratt...
44,test_data/LibEST_semeru_format/requirements/RQ...,requir http layer http use transfer est messag...


In [None]:
df_all_system.loc[df_all_system['type'] == parameters['target_type']][parameters['system_path_config']['names']]

Unnamed: 0,ids,conv
0,test_data/LibEST_semeru_format/test/us903.c,unit test user stori server simpl enrol august...
1,test_data/LibEST_semeru_format/test/us3496.c,unit test uri path segment extens support marc...
2,test_data/LibEST_semeru_format/test/us899.c,unit test user stori client simpl enrol septem...
3,test_data/LibEST_semeru_format/test/us4020.c,unit test user stori unit test client proxi mo...
4,test_data/LibEST_semeru_format/test/us897.c,unit test user stori client cacert june copyri...
5,test_data/LibEST_semeru_format/test/us1060.c,unit test user stori tls srp support server pr...
6,test_data/LibEST_semeru_format/test/us900.c,unit test user stori server csr attribut novem...
7,test_data/LibEST_semeru_format/test/us896.c,unit test user stori client csr attribut novem...
8,test_data/LibEST_semeru_format/test/us894.c,unit test user stori proxi cacert novemb copyr...
9,test_data/LibEST_semeru_format/test/us1005.c,unit test user stori client easi provis novemb...


## Defining BasicSequenceVectorization

In [None]:
#tst
print(list(VectorizationType), list(DistanceMetric), list(SimilarityMetric), list(LinkType))

[<VectorizationType.word2vec: 1>, <VectorizationType.doc2vec: 2>, <VectorizationType.vsm2vec: 3>] [<DistanceMetric.WMD: 1>, <DistanceMetric.COS: 2>, <DistanceMetric.SCM: 3>, <DistanceMetric.EUC: 4>, <DistanceMetric.MAN: 5>] [<SimilarityMetric.WMD_sim: 1>, <SimilarityMetric.COS_sim: 2>, <SimilarityMetric.SCM_sim: 3>, <SimilarityMetric.EUC_sim: 4>, <SimilarityMetric.MAN_sim: 5>, <SimilarityMetric.Pearson: 6>] [<LinkType.req2tc: 1>, <LinkType.req2src: 2>, <LinkType.issue2src: 3>, <LinkType.pr2src: 4>]


In [None]:
#export
class BasicSequenceVectorization():
    '''Base class for sequence-vanilla-vectorization; other vectorizers inherit from it.

    It loads the traceability corpus described by `params`, splits it into
    source and target artifacts, builds the corpus vocabulary and offers the
    link sampling, metric bookkeeping, ground-truth matching and persistence
    helpers. Subclasses must provide `distance(metric_list, link)`, which
    `computeDistanceMetric` calls for every link.
    '''
    def __init__(self, params):
        '''Load the corpus and prepare documents, dictionary and vocabulary.

        @params dict read here through the keys 'source_type', 'target_type'
        and 'system_path_config' (with 'system_path', 'sep', 'names', 'prep').
        'names' is [id column, text column].
        '''
        self.params = params
        self.df_nonground_link = None
        self.df_ground_link = None
        self.prep = ConventionalPreprocessing(self.params, bpe = True)

        system_config = self.params['system_path_config']
        # The column names are taken from the file header, not from 'names'.
        self.df_all_system = pd.read_csv(
            system_config['system_path'],
            header = 0,
            index_col = 0,
            sep = system_config['sep']
        )

        columns = system_config['names']
        is_source = self.df_all_system['type'] == self.params['source_type']
        is_target = self.df_all_system['type'] == self.params['target_type']
        # Explicit copies: the frames are modified below and must not be views
        # of df_all_system (avoids pandas' SettingWithCopyWarning).
        self.df_source = self.df_all_system.loc[is_source, columns].copy()
        self.df_target = self.df_all_system.loc[is_target, columns].copy()

        #NA verification
        tag = columns[1]
        self.df_source[tag] = self.df_source[tag].fillna("")
        self.df_target[tag] = self.df_target[tag].fillna("")

        ## self.document and self.dictionary is the vocabulary of the traceability corpus
        ## Do not confuse it with the dictionary of the general vectorization model
        corpus_texts = self.df_all_system[self.df_all_system[tag].notnull()][tag].values
        prep = system_config['prep']
        if prep == Preprocessing.conv: #if conventional preprocessing
            self.documents = [doc.split() for doc in corpus_texts] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.vocab = dict.fromkeys( self.dictionary.token2id.keys(),0 )
            logging.info("conventional preprocessing documents, dictionary, and vocab for the test corpus")

        elif prep == Preprocessing.bpe:
            # Null cells are skipped (as in the conventional branch) and cells are
            # parsed as literals instead of being passed to eval().
            self.documents = [self._parse_token_list(doc) for doc in corpus_texts] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            self.computing_bpe_vocab()
            logging.info("bpe preprocessing documents, dictionary, and vocab for the test corpus")

        else:
            logging.warning("unknown preprocessing %s: documents, dictionary and vocab were not built", prep)

        #This can be extended for future metrics <---------------------
        #TODO include mutual and join information
        # Maps the metric requested by the caller to the labels of the values
        # the matching computation returns (in that order).
        self.dict_labels = {
            DistanceMetric.COS:[DistanceMetric.COS, SimilarityMetric.COS_sim],
            SimilarityMetric.Pearson:[SimilarityMetric.Pearson],
            DistanceMetric.EUC:[DistanceMetric.EUC, SimilarityMetric.EUC_sim],
            DistanceMetric.WMD:[DistanceMetric.WMD, SimilarityMetric.WMD_sim],
            DistanceMetric.SCM:[DistanceMetric.SCM, SimilarityMetric.SCM_sim],
            DistanceMetric.MAN:[DistanceMetric.MAN, SimilarityMetric.MAN_sim],
            EntropyMetric.MSI_I:[EntropyMetric.MSI_I, EntropyMetric.MSI_X],
            EntropyMetric.MI:[EntropyMetric.JI, EntropyMetric.MI]
        }

    @staticmethod
    def _parse_token_list(doc):
        '''Return the token list held by one BPE cell.

        Cells read from CSV are the string form of a Python list; they are
        parsed with ast.literal_eval, which only accepts literals and therefore
        cannot execute code embedded in the data file.
        '''
        if isinstance(doc, str):
            return ast.literal_eval(doc)
        return list(doc)

    def computing_bpe_vocab(self, bpe_column = 'bpe8k'):
        '''Build self.vocab: every piece of the BPE model plus the corpus tokens it lacks.

        @bpe_column corpus column holding the BPE token lists (default 'bpe8k')
        All counts are initialised to 0.
        '''
        ####INFO science params
        column = self.df_all_system[ bpe_column ]
        corpus_tokens = set()
        for doc in column[column.notnull()].values:
            # Each cell is parsed into its tokens first; building a set straight
            # from the raw string would collect single characters instead.
            corpus_tokens.update(self._parse_token_list(doc))
        self.vocab = {self.prep.sp_bpe.id_to_piece(id): 0 for id in range(self.prep.sp_bpe.get_piece_size())}
        dict_abs_vocab = { elem : 0 for elem in corpus_tokens - set(self.vocab.keys()) } #Ignored vocab by BPE
        self.vocab.update(dict_abs_vocab) #Updating

    def ground_truth_processing(self, path_to_ground_truth = '', from_mappings = False):
        '''Optional step when the corpus has ground truth: return the links as tuples.

        @path_to_ground_truth text file with one line per source:
            "<source> <target> <target> ..." (used when from_mappings is False)
        @from_mappings read params['path_mappings'] (CSV with the columns
            'id_pr' and 'doc_id') instead
        Returns a list of (source, target) tuples; an empty file gives [].
        Raises AssertionError if the text file repeats a link.
        '''
        if from_mappings:
            df_mapping = pd.read_csv(self.params['path_mappings'], header = 0, sep = ',')
            return list(zip(df_mapping['id_pr'].astype(str), df_mapping['doc_id']))

        # The context manager closes the file even if parsing fails.
        with open(path_to_ground_truth,'r') as ground_truth:
            ground_links = [
                (tokens[0], target)
                for tokens in (line.split() for line in ground_truth)
                for target in tokens[1:]
            ]
        assert len(ground_links) ==  len(set(ground_links)), "duplicated links in the ground truth file" #To Verify Redundancies in the file
        return ground_links

    def samplingLinks(self, sampling = False, samples = 10, basename = False):
        '''Return candidate links: the cartesian product source ids x target ids.

        @sampling when True return `samples` links drawn at random without
            replacement (random.sample raises ValueError if there are fewer links)
        @basename when True the ids are reduced to their file name
        '''
        if basename:
            source = [os.path.basename(elem) for elem in self.df_source['ids'].values ]
            target = [os.path.basename(elem) for elem in self.df_target['ids'].values ]
        else:
            source = self.df_source['ids'].values
            target = self.df_target['ids'].values

        links = list( product( source , target ) )
        if sampling:
            links = sample( links, samples )
        return links

    def cos_scipy(self, vector_v, vector_w):
        '''Return [cosine distance, 1 - cosine distance].'''
        cos =  distance.cosine( vector_v, vector_w )
        return [cos, 1.-cos]

    def euclidean_scipy(self, vector_v, vector_w):
        '''Return [euclidean distance, 1/(1+distance)].'''
        dst = distance.euclidean(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def manhattan_scipy(self, vector_v, vector_w):
        '''Return [manhattan (cityblock) distance, 1/(1+distance)].'''
        dst = distance.cityblock(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def pearson_abs_scipy(self, vector_v, vector_w):
        '''Return [|Pearson correlation|] of the two vectors.

        We are not sure that pearson correlation works well on doc2vec inference vectors'''
        logging.info("pearson_abs_scipy" + str(vector_v) + "__" + str(vector_w))
        corr, _ = pearsonr(vector_v, vector_w)
        return [abs(corr)] #Absolute value of the correlation

    def computeDistanceMetric(self, links, metric_list):
        '''Compute every metric of `metric_list` for every link.

        Returns (rows, labels): each row is [source, target, value, ...] and
        `labels` names the value columns in the same order. An empty
        `metric_list` yields rows without values and no labels.
        '''
        # Labels first, so an unsupported metric fails before any computation.
        metric_labels = [ label for metric in metric_list for label in self.dict_labels[metric] ]
        distSim = [ [link[0], link[1]] + self.distance( metric_list, link ) for link in links ]
        return distSim, metric_labels

    def ComputeDistanceArtifacts(self, metric_list, sampling = False , samples = 10, basename = False):
        '''Activates Distance and Similarity Computations; the result is stored in self.df_nonground_link
        @metric_list metrics to compute (keys of self.dict_labels)
        @sampling is False by the default
        @samples is the number of samples (or links) to be generated
        @basename when True the links use file names instead of full ids'''
        links_ = self.samplingLinks( sampling, samples, basename )

        docs, metric_labels = self.computeDistanceMetric( metric_list=metric_list, links=links_) #checkpoints
        self.df_nonground_link = pd.DataFrame(docs, columns =[self.params['names'][0], self.params['names'][1]]+ metric_labels) #Transforming into a Pandas
        logging.info("Non-groundtruth links computed")

    def SaveLinks(self, grtruth=False, sep=' ', mode='a'):
        '''Write the computed links to a timestamped CSV file.

        @grtruth save self.df_ground_link when True, self.df_nonground_link otherwise
        The file name is concatenated directly to params['saving_path'] (no
        path separator is inserted), which is the layout LoadLinks expects.
        '''
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self.params['saving_path'] + '['+ self.params['system'] + '-' + str(self.params['vectorizationType']) + '-' + str(self.params['linkType']) + '-' + str(grtruth) + '-{}].csv'.format(timestamp)

        links = self.df_ground_link if grtruth else self.df_nonground_link
        links.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)

        logging.info('Saving in...' + path_to_link)

    def findDistInDF(self, g_tuple, from_mappings=False, semeru_format=False):
        '''Return the index values of the rows of self.df_ground_link matching one ground-truth link.

        @g_tuple (source, target) as returned by ground_truth_processing
        .eq is used for Source since it must match the exact code to avoid number substrings
        for the target, the substring might works fine.
        All substring tests are literal (no regular expressions), so characters
        such as '.' or '[' in file names only match themselves.
        '''
        source_col, target_col = self.params['names'][0], self.params['names'][1]
        sources = self.df_ground_link[source_col]
        targets = self.df_ground_link[target_col]

        if from_mappings: #SACP Format
            mask = sources.eq(g_tuple[0]) & targets.str.contains(g_tuple[1], regex=False)
            logging.info('findDistInDF: from_mappings')
        elif semeru_format: #LibEST Format
            mask = sources.str.contains(g_tuple[0], regex=False) & targets.str.contains(g_tuple[1], regex=False)
            logging.info('findDistInDF: semeru_format')
        else: #By Default use Semeru Format
            # Text before the first '.'; the whole name is kept when there is no dot.
            source_stem = g_tuple[0].partition('.')[0]
            target_stem = g_tuple[1].partition('.')[0]
            mask = sources.str.contains(source_stem + '-', regex=False) & targets.str.contains(target_stem, regex=False)
            logging.info('findDistInDF: default')
        return self.df_ground_link.loc[mask].index.values

    def MatchWithGroundTruth(self, path_to_ground_truth='', from_mappings=False, semeru_format=False ):
        '''Build self.df_ground_link: the computed links plus a 0/1 ground-truth column.

        The column is named params['names'][2]; rows matched by at least one
        ground-truth link are set to 1. Requires self.df_nonground_link.
        '''
        label = self.params['names'][2]
        self.df_ground_link = self.df_nonground_link.copy()
        self.df_ground_link[label] = 0

        ground_links = self.ground_truth_processing(path_to_ground_truth,from_mappings)
        matchGT = [ self.findDistInDF( g , from_mappings=from_mappings, semeru_format=semeru_format ) for g in ground_links ]
        if matchGT:
            # A row matched by several ground-truth links is flagged once.
            matched_index = np.unique(np.concatenate(matchGT))
            if len(matched_index):
                new_column = pd.Series(np.full([len(matched_index)], 1 ), name=label, index = matched_index)
                self.df_ground_link.update(new_column)
        logging.info("Groundtruth links computed")

### Testing BasicSequenceVectorization

In [None]:
general2vec =  BasicSequenceVectorization(params = parameters)

2020-12-16 01:19:40,421 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 01:19:40,477 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)
2020-12-16 01:19:40,478 : INFO : conventional preprocessing documents, dictionary, and vocab for the test corpus


In [None]:
m = dict.fromkeys( general2vec.dictionary.token2id.keys(),0 ) #From traceability dataset!

In [None]:
n = general2vec.vocab

In [None]:
len(set(m.keys()) - set(n.keys())) #TODO

0

In [None]:
assert len(set( m.keys()) - set(n.keys())) == 0 

In [None]:
general2vec.documents

[['unit',
  'test',
  'user',
  'stori',
  'server',
  'simpl',
  'enrol',
  'august',
  'copyright',
  'cisco',
  'system',
  'inc',
  'right',
  'reserv',
  'includ',
  'stdio',
  'ifndef',
  'win',
  'includ',
  'unistd',
  'endif',
  'includ',
  'est',
  'includ',
  'curl',
  'curl',
  'includ',
  'curl',
  'util',
  'includ',
  'test',
  'util',
  'includ',
  'server',
  'includ',
  'openssl',
  'ssl',
  'ifdef',
  'cunit',
  'includ',
  'cunit',
  'basic',
  'includ',
  'cunit',
  'autom',
  'endif',
  'ifndef',
  'win',
  'static',
  'char',
  'test',
  'outfil',
  'filenam',
  'max',
  'test',
  'hdr',
  'defin',
  'cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'explicit_cert',
  'us903',
  'cert',
  'pem',
  'defin',
  'us903_explicit_key',
  'us903',
  'key',
  'pem',
  'defin',
  'us903_cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'us903_trusted_cert',
  'trustedcert',
  'crt',
  'defin',
  'est',
  'privat',
  'estservercertandkey',
  'pem',
  'els',
  'static'

In [None]:
len(general2vec.dictionary)

6957

In [None]:
general2vec.dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f88e1482d68>

In [None]:
general2vec.df_all_system.head(1)

Unnamed: 0,ids,filenames,text,type,conv,bpe128k,bpe32k,bpe8k
0,test_data/LibEST_semeru_format/test/us903.c,us903.c,/*--------------------------------------------...,tc,unit test user stori server simpl enrol august...,"['▁/*', '----------------', '----------------'...","['▁/', '*', '--------', '--------', '--------'...","['▁/', '*', '-', '-', '-', '-', '-', '-', '-',..."


In [None]:
general2vec.df_all_system.shape #data final tensor

(87, 8)

In [None]:
#tst for libest
path_to_ground_truth = parameters['path_mappings']
general2vec.ground_truth_processing(path_to_ground_truth)

[('RQ4.txt', 'us1864.c'),
 ('RQ4.txt', 'us901.c'),
 ('RQ4.txt', 'us1005.c'),
 ('RQ4.txt', 'us3512.c'),
 ('RQ4.txt', 'us895.c'),
 ('RQ4.txt', 'us897.c'),
 ('RQ4.txt', 'us900.c'),
 ('RQ6.txt', 'us1005.c'),
 ('RQ6.txt', 'us1159.c'),
 ('RQ6.txt', 'us3496.c'),
 ('RQ6.txt', 'us3512.c'),
 ('RQ6.txt', 'us3612.c'),
 ('RQ6.txt', 'us4020.c'),
 ('RQ6.txt', 'us748.c'),
 ('RQ6.txt', 'us893.c'),
 ('RQ6.txt', 'us895.c'),
 ('RQ6.txt', 'us896.c'),
 ('RQ6.txt', 'us897.c'),
 ('RQ6.txt', 'us898.c'),
 ('RQ6.txt', 'us899.c'),
 ('RQ6.txt', 'us900.c'),
 ('RQ8.txt', 'us1005.c'),
 ('RQ8.txt', 'us1159.c'),
 ('RQ8.txt', 'us1883.c'),
 ('RQ8.txt', 'us2174.c'),
 ('RQ8.txt', 'us3496.c'),
 ('RQ8.txt', 'us3512.c'),
 ('RQ8.txt', 'us3612.c'),
 ('RQ8.txt', 'us4020.c'),
 ('RQ8.txt', 'us748.c'),
 ('RQ8.txt', 'us893.c'),
 ('RQ8.txt', 'us895.c'),
 ('RQ8.txt', 'us896.c'),
 ('RQ8.txt', 'us897.c'),
 ('RQ8.txt', 'us898.c'),
 ('RQ8.txt', 'us899.c'),
 ('RQ8.txt', 'us900.c'),
 ('RQ11.txt', 'us1159.c'),
 ('RQ11.txt', 'us1883.c'),
 ('R

In [None]:
#tst for sacp <----- Warning!
#general2vec.ground_truth_processing(parameters['path_mappings'], from_mappings = True)

## Artifacts Similarity with Word2Vec

In [None]:
#export
from collections import Counter
import dit
import math

In [None]:
#export
class Word2VecSeqVect(BasicSequenceVectorization):
    '''Link metrics backed by a trained gensim Word2Vec model and by token-count entropies.'''

    def __init__(self, params):
        '''Load the model at params['path_to_trained_model'] and prepare the metric dispatcher.'''
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        # Every entry is called as f(sentence_a, sentence_b) by `distance`.
        # NOTE(review): cos_scipy and pearson_abs_scipy are written for numeric
        # vectors but receive token lists through this dispatcher -- confirm.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi,
            EntropyMetric.MI: self.mutual_info
        }

    def wmd_gensim(self, sentence_a, sentence_b ):
        '''Return [word mover's distance, associated similarity] of two token lists.'''
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        '''Map a distance to a similarity: 1/(1+dist).'''
        return 1./( 1.+float( dist ) ) #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim; returns [1 - similarity, similarity].'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information.

        Returns [entropy, extropy] of the distribution given by the per-token
        minimum of both documents' counts over the corpus vocabulary, or
        [nan, nan] when the documents share no token.
        '''
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        logging.info('token count processed')
        #Minimum Shared Tokens
        #TODO create an if down to include Joint Entropy by summing token_counts_1 and token_counts_2
        token_counts = { token: min(token_counts_1[token],token_counts_2[token]) for token in vocab }

        # Outcomes are listed in the dict's own order, the order __get_freqs
        # uses, so every outcome is paired with its own frequency.
        alphabet = list(token_counts.keys())
        frequencies = self.__get_freqs(token_counts)
        logging.info('frequencies processed')

        if not frequencies:
            #"List is empty"
            entropies = float('nan')
            extropies = float('nan')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
            logging.info('scalar_distribution processed')

            entropies = dit.shannon.entropy( scalar_distribution )
            logging.info('entropies processed')

            extropies = dit.other.extropy( scalar_distribution )
            logging.info('extropies processed')
        return [entropies,extropies]

    def mutual_info(self, sentence_a, sentence_b):
        """Mutual information and Joint Information.

        Returns [joint entropy, mutual information], where the joint entropy is
        the entropy of the summed token counts over the corpus vocabulary and
        mutual information = H(source) + H(target) - joint entropy.
        Returns [nan, nan] when a document (or the summed counts) has no
        tokens, because no distribution can be built from it.
        """
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        logging.info('token count processed')

        # Outcomes are listed in each counter's own order, the order __get_freqs
        # uses, so every outcome is paired with its own frequency.
        alphabet_source = list(token_counts_1.keys())
        logging.info('alphabet_source #'+ str(len(alphabet_source)))
        alphabet_target = list(token_counts_2.keys())
        logging.info('alphabet_target #'+ str(len(alphabet_target)))

        logging.info('vocab #'+ str(len(self.vocab.keys())))
        logging.info('diff #'+ str(set(token_counts_1.keys()) - set(token_counts_2.keys())))

        frequencies_source = self.__get_freqs( token_counts_1 )
        frequencies_target = self.__get_freqs( token_counts_2 )

        #Computing Joint-information
        token_counts = { token: (token_counts_1[token] + token_counts_2[token]) for token in vocab }
        alphabet = list(token_counts.keys())
        logging.info('alphabet #'+ str(len(alphabet)))
        frequencies = self.__get_freqs(token_counts)

        # An empty document gives an empty frequency list; report "undefined"
        # the same way msi does.
        if not frequencies_source or not frequencies_target or not frequencies:
            return [float('nan'), float('nan')]

        #Computing Self-Information (or Entropy)
        scalar_distribution_source = dit.ScalarDistribution(alphabet_source, frequencies_source )
        entropy_source = dit.shannon.entropy( scalar_distribution_source )

        scalar_distribution_target = dit.ScalarDistribution(alphabet_target, frequencies_target )
        entropy_target = dit.shannon.entropy( scalar_distribution_target )

        scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
        joint_entropy = dit.shannon.entropy( scalar_distribution )

        mutual_information = entropy_source + entropy_target - joint_entropy
        return [joint_entropy, mutual_information]

    #ToDo Mutual information

    def distance(self, metric_list,link):
        '''Compute every metric of `metric_list` for one link.

        @link (source id, target id); each id is looked up as a literal
            substring of the corpus id column and the first match is used
        Returns the metric values concatenated into one list ([] when
        `metric_list` is empty). Raises ValueError for an unknown preprocessing
        and IndexError when an id matches no artifact.
        '''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for
        #the remaining metrics
        ids = self.params['system_path_config']['names'][0]
        txt = self.params['system_path_config']['names'][1]
        prep = self.params['system_path_config']['prep']

        if prep == Preprocessing.conv: #if conventional preprocessing
            to_tokens = str.split
        elif prep == Preprocessing.bpe:
            # Cells hold the string form of a token list; literal_eval parses it
            # without executing code that might be embedded in the data file.
            to_tokens = ast.literal_eval
        else:
            raise ValueError("Unsupported preprocessing: " + str(prep))

        # regex=False: ids are file names/paths, so '.', '[' and the like must
        # be matched literally.
        source_text = self.df_source[self.df_source[ids].str.contains(link[0], regex=False)][txt].values[0]
        target_text = self.df_target[self.df_target[ids].str.contains(link[1], regex=False)][txt].values[0]
        sentence_a = to_tokens(source_text)
        sentence_b = to_tokens(target_text)

        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist, []) #Always return a list

    #################################3TODO substitute this block in the future by importing information science module
    def __get_cnts(self, toks, vocab):
        '''@danaderp
        Counts tokens within ONE document.

        Returns a Counter seeded with `vocab` (its entries are all 0 where it
        is built in this file); tokens outside the vocabulary are added too.
        '''
        cnt = Counter(vocab)
        for tok in toks:
            cnt[tok] += 1
        return cnt

    def __get_freqs(self, dict_token_counts):
        '''Return the relative frequency of every token, in the dict's order ([] when all counts are 0).'''
        num_tokens = sum( dict_token_counts.values() ) #number of subwords inside the document
        if num_tokens == 0.0:
            frequencies = []
            logging.info('---------------> NO SHARED INFORMATION <-------------------------')
        else:
            frequencies = [ (dict_token_counts[token])/num_tokens for token in dict_token_counts ]
        return frequencies
    #################################3
    #################################3


In [None]:
#export
def LoadLinks(timestamp, params, grtruth=False, sep=' ' ):
    '''Returns a pandas DataFrame read from a link computation saved at a given timestamp
    @timestamp is the version of the model for a given system
    @params provides 'saving_path', 'system', 'vectorizationType' and 'linkType' for the file name
    @grtruth is the True/False flag that is part of the file name
    @sep is the column separator passed to pandas'''

    # File name layout: <saving_path>[<system>-<vectorizationType>-<linkType>-<grtruth>-<timestamp>].csv
    name_pieces = (
        params['saving_path'], '[', params['system'],
        '-', str(params['vectorizationType']),
        '-', str(params['linkType']),
        '-', str(grtruth),
        '-{}].csv'.format(timestamp),
    )
    csv_path = ''.join(name_pieces)

    logging.info("Loading computed links from... "+ csv_path)

    # First row holds the column names, first column holds the row index
    return pd.read_csv(csv_path, header=0, index_col=0, sep=sep)

### Testing Word2Vec SequenceVectorization

In [None]:
#hide
#tst
# Scratch check of functools.reduce-based flattening of lists/tuples
metric_list = ['a','b']
A = [[1,3,4],[4,5],[1,8,9,7]]
B = ((1,3,4),(4,5),(1,8,9,7))
functools.reduce(lambda a,b : a+b, B) # tuples concatenate into one flat tuple
dist_sim_T = [([12,13],['metric1','metric2']),([12,13],['metric1','metric2'])]
# Fold over the name lists only, starting from []: the accumulator is always a flat list,
# so this works for any number of (values, names) pairs. Indexing the accumulator with [1]
# inside the lambda only produced the right answer for exactly two pairs.
separated_merged_list_a = functools.reduce(lambda a,b : a+b, [pair[1] for pair in dist_sim_T], [])
separated_merged_list_a

['metric1', 'metric2', 'metric1', 'metric2']

In [None]:
#[step 1]Creating the Vectorization Class
word2vec = Word2VecSeqVect( params = parameters )

2020-12-16 01:22:28,694 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 01:22:28,750 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)
2020-12-16 01:22:28,752 : INFO : conventional preprocessing documents, dictionary, and vocab for the test corpus
2020-12-16 01:22:28,752 : INFO : loading Word2Vec object from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model
2020-12-16 01:22:28,814 : INFO : loading wv recursively from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model.wv.* with mmap=None
2020-12-16 01:22:28,815 : INFO : loading vectors from ../dvc-ds4se/models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model.wv.vectors.npy with mmap=None
2020-12-16 01:22:28,842 : INFO : setting ignored attribute vectors_norm to None
2020-12-16 01:22:28,843 : INFO : loading vocabulary recursi

In [None]:
len(word2vec.new_model.wv.vocab)

39159

In [None]:
word2vec.df_source['ids'][35]

'test_data/LibEST_semeru_format/requirements/RQ17.txt'

In [None]:
word2vec.df_source['ids'][35] #In LIBEST REQ starts at 35

'test_data/LibEST_semeru_format/requirements/RQ17.txt'

In [None]:
ids = parameters['system_path_config']['names'][0]
txt = parameters['system_path_config']['names'][1]
print(ids,txt)

ids conv


In [None]:
idss = word2vec.df_source[ids][35] #Selecting an ID
idss = word2vec.df_source[ids] == idss #Search for an specific ID
list(word2vec.df_source[idss][txt])[0].split() #Retrieving text and splitting

['requir',
 'http',
 'uri',
 'control',
 'est',
 'server',
 'must',
 'support',
 'use',
 'path',
 'prefix',
 'well',
 'known',
 'defin',
 'rfc',
 'regist',
 'name',
 'est',
 'thus',
 'valid',
 'est',
 'server',
 'uri',
 'path',
 'begin',
 'https',
 'www',
 'exampl',
 'com',
 'well',
 'known',
 'est',
 'est',
 'oper',
 'indic',
 'path',
 'suffix',
 'indic',
 'intend',
 'oper',
 'oper',
 'correspond',
 'uri',
 'oper',
 'oper',
 'path',
 'detail',
 'distribut',
 'cacert',
 'section',
 'certif',
 'must',
 'enrol',
 'simpleenrol',
 'section',
 'client',
 'must',
 'enrol',
 'simplereenrol',
 'section',
 'client',
 'must',
 'full',
 'cmc',
 'option',
 'fullcmc',
 'section',
 'server',
 'side',
 'key',
 'serverkeygen',
 'section',
 'generat',
 'option',
 'csr',
 'attribut',
 'csrattr',
 'section',
 'option',
 'figur',
 'oper',
 'path',
 'figur',
 'append',
 'path',
 'prefix',
 'form',
 'uri',
 'use',
 'http',
 'get',
 'post',
 'perform',
 'desir',
 'est',
 'oper',
 'exampl',
 'valid',
 'uri',


In [None]:
word2vec.df_source.head(2)

Unnamed: 0,ids,conv
35,test_data/LibEST_semeru_format/requirements/RQ...,requir http uri control est server must suppor...
36,test_data/LibEST_semeru_format/requirements/RQ...,requir server side key generat respons request...


In [None]:
word2vec.df_target.head(2)

Unnamed: 0,ids,conv
0,test_data/LibEST_semeru_format/test/us903.c,unit test user stori server simpl enrol august...
1,test_data/LibEST_semeru_format/test/us3496.c,unit test uri path segment extens support marc...


In [None]:
links = word2vec.samplingLinks(sampling=True, samples = 2)
links

[('test_data/LibEST_semeru_format/requirements/RQ33.txt',
  'test_data/LibEST_semeru_format/test/us893.c'),
 ('test_data/LibEST_semeru_format/requirements/RQ15.txt',
  'test_data/LibEST_semeru_format/test/us897.c')]

In [None]:
print( len(links), word2vec.df_source.shape, word2vec.df_target.shape )

2 (52, 2) (21, 2)


In [None]:
links[0][0]

'test_data/LibEST_semeru_format/requirements/RQ33.txt'

In [None]:
#tst
# Tokens of the source artifact of the first sampled link (row looked up by its id, text column split on whitespace)
word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0].split() #conventional
#eval(word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0]) #BPE

['requir',
 'certif',
 'request',
 'est',
 'client',
 'request',
 'est',
 'databas',
 'inform',
 'form',
 'certif',
 'https',
 'get',
 'messag',
 'use',
 'oper',
 'path',
 'cacert',
 'est',
 'client',
 'server',
 'must',
 'support',
 'cacert',
 'function',
 'client',
 'request',
 'date',
 'respons',
 'store',
 'inform',
 'expir',
 'order',
 'ensur',
 'est',
 'databas',
 'date',
 'est',
 'server',
 'requir',
 'client',
 'authent',
 'author',
 'repli',
 'request',
 'client',
 'must',
 'authent',
 'est',
 'server',
 'specifi',
 'section',
 'certif',
 'base',
 'authent',
 'use',
 'section',
 'option',
 'certif',
 'less',
 'authent',
 'use',
 'check',
 'server',
 'author',
 'given',
 'section',
 'follow',
 'procedur',
 'outlin',
 'section']

In [None]:
#tst
word2vec.df_target[word2vec.df_target[ids].str.contains(links[0][1])][txt].values[0].split()

['unit',
 'test',
 'user',
 'stori',
 'proxi',
 'reenrol',
 'octob',
 'copyright',
 'cisco',
 'system',
 'inc',
 'right',
 'reserv',
 'includ',
 'stdio',
 'ifndef',
 'win',
 'includ',
 'unistd',
 'endif',
 'includ',
 'est',
 'includ',
 'curl',
 'curl',
 'includ',
 'curl',
 'util',
 'includ',
 'test',
 'util',
 'includ',
 'server',
 'includ',
 'proxi',
 'includ',
 'openssl',
 'ssl',
 'ifdef',
 'cunit',
 'includ',
 'cunit',
 'basic',
 'includ',
 'cunit',
 'autom',
 'endif',
 'static',
 'unsign',
 'char',
 'cacert',
 'null',
 'static',
 'int',
 'cacert',
 'len',
 'defin',
 'tcp',
 'server',
 'port',
 'defin',
 'tcp',
 'proxi',
 'port',
 '093',
 'follow',
 'csr',
 'generat',
 'use',
 'follow',
 'openssl',
 'command',
 'use',
 'cat',
 'rsa',
 'req',
 'file',
 'openssl',
 'req',
 'newkey',
 'rsa',
 '2048',
 'keyout',
 'rsakey',
 'pem',
 'keyform',
 'pem',
 'rsa',
 'req',
 'outform',
 'pem',
 'defin',
 'us893_pkcs10_rsa2048',
 'miicv',
 'tccaa',
 'ucaqaw',
 'delmak',
 'ga1uebh',
 'mcvvmx',
 '

In [None]:
metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I,EntropyMetric.MI]
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]

In [None]:
#[optional] computeDistanceMetric Testing [WARNING!] Time Consuming!!
computeDistanceMetric = word2vec.computeDistanceMetric(links, metric_list = metric_list )
computeDistanceMetric

2020-12-16 01:25:58,014 : INFO : Removed 21 and 1444 OOV words from document 1 and 2 (respectively).
2020-12-16 01:25:58,016 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 01:25:58,017 : INFO : built Dictionary(166 unique tokens: ['author', 'base', 'check', 'client', 'date']...) from 2 documents (total 1070 corpus positions)
2020-12-16 01:25:58,123 : INFO : token count processed
2020-12-16 01:25:58,129 : INFO : frequencies processed
2020-12-16 01:25:58,776 : INFO : scalar_distribution processed
2020-12-16 01:25:58,777 : INFO : entropies processed
2020-12-16 01:25:58,778 : INFO : extropies processed
2020-12-16 01:25:58,780 : INFO : token count processed
2020-12-16 01:25:58,781 : INFO : alphabet_source #6957
2020-12-16 01:25:58,783 : INFO : alphabet_target #6957
2020-12-16 01:25:58,784 : INFO : vocab #6957
2020-12-16 01:25:58,785 : INFO : diff #set()
2020-12-16 01:26:00,047 : INFO : alphabet #6957
2020-12-16 01:26:00,670 : INFO : Computed distances or similarit

([['test_data/LibEST_semeru_format/requirements/RQ33.txt',
   'test_data/LibEST_semeru_format/test/us893.c',
   1.0538697177539689,
   0.48688579969598106,
   0.7183125019073486,
   0.2816875,
   3.897203151424737,
   1.3817285328240057,
   7.953092053436972,
   4.892734528401327],
  ['test_data/LibEST_semeru_format/requirements/RQ15.txt',
   'test_data/LibEST_semeru_format/test/us897.c',
   1.0805234894665,
   0.480648262354599,
   0.82513128221035,
   0.17486872,
   2.7464393446710154,
   1.297089487030139,
   6.914002597516406,
   5.096688463148617]],
 [<DistanceMetric.WMD: 1>,
  <SimilarityMetric.WMD_sim: 1>,
  <DistanceMetric.SCM: 3>,
  <SimilarityMetric.SCM_sim: 3>,
  <EntropyMetric.MSI_I: 1>,
  <EntropyMetric.MSI_X: 2>,
  <EntropyMetric.JI: 4>,
  <EntropyMetric.MI: 3>])

In [None]:
#[step 2]NonGroundTruth Computation
word2vec.ComputeDistanceArtifacts( sampling=True, samples = 5, metric_list = metric_list )
word2vec.df_nonground_link.head()

2020-12-16 01:28:11,970 : INFO : Removed 60 and 1444 OOV words from document 1 and 2 (respectively).
2020-12-16 01:28:11,971 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-16 01:28:11,974 : INFO : built Dictionary(189 unique tokens: ['add', 'address', 'arc', 'attribut', 'author']...) from 2 documents (total 1121 corpus positions)
2020-12-16 01:28:12,211 : INFO : token count processed
2020-12-16 01:28:12,216 : INFO : frequencies processed
2020-12-16 01:28:12,845 : INFO : scalar_distribution processed
2020-12-16 01:28:12,846 : INFO : entropies processed
2020-12-16 01:28:12,847 : INFO : extropies processed
2020-12-16 01:28:12,849 : INFO : token count processed
2020-12-16 01:28:12,850 : INFO : alphabet_source #6957
2020-12-16 01:28:12,852 : INFO : alphabet_target #6957
2020-12-16 01:28:12,853 : INFO : vocab #6957
2020-12-16 01:28:12,854 : INFO : diff #set()
2020-12-16 01:28:14,118 : INFO : alphabet #6957
2020-12-16 01:28:14,745 : INFO : Computed distances or similar

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674


In [None]:
word2vec.df_nonground_link.head()

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674


In [None]:
#tst 
#df_mapping = pd.read_csv(parameters['path_mappings'], header = 0, sep = ',')
#ground_links = word2vec.ground_truth_processing(from_mappings='True') #<---- SACP
ground_links = word2vec.ground_truth_processing(path_to_ground_truth) #<---- LIBEST
ground_links

[('RQ4.txt', 'us1864.c'),
 ('RQ4.txt', 'us901.c'),
 ('RQ4.txt', 'us1005.c'),
 ('RQ4.txt', 'us3512.c'),
 ('RQ4.txt', 'us895.c'),
 ('RQ4.txt', 'us897.c'),
 ('RQ4.txt', 'us900.c'),
 ('RQ6.txt', 'us1005.c'),
 ('RQ6.txt', 'us1159.c'),
 ('RQ6.txt', 'us3496.c'),
 ('RQ6.txt', 'us3512.c'),
 ('RQ6.txt', 'us3612.c'),
 ('RQ6.txt', 'us4020.c'),
 ('RQ6.txt', 'us748.c'),
 ('RQ6.txt', 'us893.c'),
 ('RQ6.txt', 'us895.c'),
 ('RQ6.txt', 'us896.c'),
 ('RQ6.txt', 'us897.c'),
 ('RQ6.txt', 'us898.c'),
 ('RQ6.txt', 'us899.c'),
 ('RQ6.txt', 'us900.c'),
 ('RQ8.txt', 'us1005.c'),
 ('RQ8.txt', 'us1159.c'),
 ('RQ8.txt', 'us1883.c'),
 ('RQ8.txt', 'us2174.c'),
 ('RQ8.txt', 'us3496.c'),
 ('RQ8.txt', 'us3512.c'),
 ('RQ8.txt', 'us3612.c'),
 ('RQ8.txt', 'us4020.c'),
 ('RQ8.txt', 'us748.c'),
 ('RQ8.txt', 'us893.c'),
 ('RQ8.txt', 'us895.c'),
 ('RQ8.txt', 'us896.c'),
 ('RQ8.txt', 'us897.c'),
 ('RQ8.txt', 'us898.c'),
 ('RQ8.txt', 'us899.c'),
 ('RQ8.txt', 'us900.c'),
 ('RQ11.txt', 'us1159.c'),
 ('RQ11.txt', 'us1883.c'),
 ('R

In [None]:
len(ground_links)

352

In [None]:
#tst
df_x = word2vec.df_nonground_link
df_x.head()

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674


In [None]:
df_x['Source'].values

array(['test_data/LibEST_semeru_format/requirements/RQ50.txt',
       'test_data/LibEST_semeru_format/requirements/RQ16.txt',
       'test_data/LibEST_semeru_format/requirements/RQ1.txt',
       'test_data/LibEST_semeru_format/requirements/RQ18.txt',
       'test_data/LibEST_semeru_format/requirements/RQ31.txt'],
      dtype=object)

In [None]:
#tst
test_source = 'RQ50.txt'
test_target = 'us893.c'
# regex=False on both filters: the file names contain '.', which a regex would read as "any character"
df_x[( df_x["Source"].str.contains(test_source, regex=False) ) & (df_x["Target"].str.contains(test_target, regex=False))]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706


In [None]:
ground_links[0][0]

'RQ4.txt'

In [None]:
ground_links[0][1]

'us1864.c'

In [None]:
#tst
# Look up the first ground-truth link in the computed links; both names are matched as literal
# substrings (regex=False) so the '.' in the file names is not treated as a wildcard
df_x[( df_x["Source"].str.contains(ground_links[0][0], regex=False) ) & (df_x["Target"].str.contains(ground_links[0][1], regex=False))]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI


In [None]:
def find_index_gt( tuple_g ):
    '''Return the index labels of the rows of `df_x` that match one ground-truth link.

    tuple_g: (source, target) pair. The "Source" column must be equal to tuple_g[0];
             the "Target" column only has to contain tuple_g[1] as a literal substring.
    Reads the notebook-level DataFrame `df_x`.
    Returns a numpy array with the matching index labels (empty when nothing matches).'''
    # Series.eq does the element-wise equality test; the .str accessor has no `eq` method,
    # so the former `.str.eq(...)` raised AttributeError on every call
    source_matches = df_x["Source"].eq(tuple_g[0])
    target_matches = df_x["Target"].str.contains(tuple_g[1], regex=False)
    dist = df_x.loc[source_matches & target_matches]
    return dist.index.values
#dist

In [None]:
#Formatted for SACP
# Collects, for every ground-truth link read from the mappings, the result of findDistInDF for that link.
# NOTE(review): the recorded run of this cell raised KeyError: 'id_pr'; presumably the session was
# configured for LibEST rather than SACP mappings -- confirm before relying on the cells that use matchGT
matchGT = [ word2vec.findDistInDF( g , from_mappings=True ) for g in word2vec.ground_truth_processing(from_mappings=True)]
matchGT

KeyError: 'id_pr'

In [None]:
# One concatenate call joins all index arrays at once; folding pairwise with reduce
# re-copied the growing result for every element of the list
matchGT = np.concatenate(matchGT) #Concatenate indexes
matchGT

NameError: name 'matchGT' is not defined

In [None]:
new_column = pd.Series(np.full([len(matchGT)], 1 ), name=word2vec.params['names'][2], index = matchGT)

NameError: name 'matchGT' is not defined

In [None]:
new_column

In [None]:
#Some of the mappings are not found in the non-link list because the mappings contain all the ground truth of the issues;
#they might include files that were not taken into account in the non-links part
matchGT_ = [ (g,word2vec.findDistInDF( g , from_mappings=True )) for g in word2vec.ground_truth_processing(from_mappings=True)]

In [None]:
matchGT_

In [None]:
len(matchGT)

In [None]:
#[step 3]Saving Non-GroundTruth Links
word2vec.SaveLinks()

2020-12-16 01:41:24,508 : INFO : Saving in...../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-False-1608082884.501101].csv


In [None]:
#Loading Non-GroundTruth Links (change the timestamp to the one assigned in the previous step)
df_nonglinks = LoadLinks(timestamp=1608082884.501101, params=parameters)
df_nonglinks.head()

2020-12-16 01:41:52,578 : INFO : Loading computed links from... ../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-False-1608082884.501101].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674


In [None]:
#[step 4]GroundTruthMatching Testing
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
word2vec.df_ground_link

2020-12-16 01:42:21,703 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,706 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,710 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,713 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,716 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,719 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,722 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,725 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,727 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,729 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,732 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,734 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,736 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,738 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,741 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,743 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:21,745 

2020-12-16 01:42:22,027 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,029 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,031 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,034 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,036 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,038 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,040 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,042 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,045 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,047 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,049 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,052 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,055 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,058 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,060 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,063 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,065 

2020-12-16 01:42:22,332 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,334 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,335 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,337 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,339 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,341 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,343 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,345 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,349 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,351 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,354 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,357 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,360 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,363 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,365 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,367 : INFO : findDistInDF: semeru_format
2020-12-16 01:42:22,369 

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706,1.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712,1.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674,0.0


In [None]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings <----- Warning SACP
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

In [None]:
df_z = word2vec.df_ground_link
# Keep only the rows without NaN/inf/-inf in any column.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally in DataFrame.any
df_z[~df_z.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706,1.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712,1.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674,0.0


In [None]:
#debug
df_y = word2vec.df_ground_link.copy()
df_y

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706,1.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712,1.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674,0.0


In [None]:
#debug
df_y.update(new_column)

NameError: name 'new_column' is not defined

In [None]:
new_column

NameError: name 'new_column' is not defined

In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706,1.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712,1.0


In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1].shape #Positive Links

(2, 11)

In [None]:
#[optional]GroundTruth Direct Processing
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links[141] # A tuple

('RQ33.txt', 'us894.c')

In [None]:
#Inspecting Source
ground_links[141][0][:ground_links[141][0].find('.')] + '-'

'RQ33-'

In [None]:
#Inspecting Target
ground_links[141][1][:ground_links[141][1].find('.')]

'us894'

In [None]:
#[step 5]Saving GroundTruth Links
word2vec.SaveLinks(grtruth = True)

2020-12-16 01:45:52,310 : INFO : Saving in...../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-True-1608083152.307332].csv


In [None]:
#Loading GroundTruth Links (change the timestamp to the one assigned in the previous step)
df_glinks = LoadLinks(timestamp=1608083152.307332, params=parameters,grtruth = True)
df_glinks.head()

2020-12-16 01:46:08,271 : INFO : Loading computed links from... ../dvc-ds4se/se-benchmarking/traceability[libest-VectorizationType.word2vec-LinkType.req2src-True-1608083152.307332].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
0,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.054974,0.486624,0.792439,0.207561,4.892534,1.415386,8.075343,6.159706,1.0
1,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1060.c,0.99641,0.500899,0.776058,0.223942,4.272593,1.39561,7.754694,5.789712,1.0
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674,0.0


In [None]:
df_glinks[df_glinks["Linked?"] == 0]

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,EntropyMetric.MSI_I,EntropyMetric.MSI_X,EntropyMetric.JI,EntropyMetric.MI,Linked?
2,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us893.c,1.024607,0.493923,0.743049,0.256951,4.53135,1.397691,8.060381,5.936561,0.0
3,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us1005.c,0.977207,0.505764,0.629305,0.370695,3.756079,1.359441,7.460108,5.440845,0.0
4,test_data/LibEST_semeru_format/requirements/RQ...,test_data/LibEST_semeru_format/test/us2174.c,1.165569,0.461773,0.792941,0.207059,2.921928,1.335902,9.231067,3.376674,0.0
