# Neural Unsupervised Approaches for SE Traceability [approach]

> This module is dedicated to evaluating word2vec/doc2vec or any other neural unsupervised approach on traceability datasets. Consider copying the entire notebook for each new, separate empirical evaluation. 
> Implementing mutual information analysis
> Author: @danaderp April 2020
> Author: @danielrc Nov 2020

This copy is for Cisco purposes. It was adapted to process private GitHub data from Cisco. 

In [None]:
import numpy as np
import gensim
import pandas as pd
from itertools import product 
from random import sample 
import functools 
import os
from enum import Enum, unique, auto

In [None]:
#export
from datetime import datetime
import seaborn as sns

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
#export
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from pandas.plotting import lag_plot
import math as m
import random as r
import collections
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora

In [None]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
#export
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#export
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
from pathlib import Path

In [None]:
import ds4se as ds

In [None]:
from ds4se.mgmnt.prep.conv import *

# Artifacts Similarity with BasicSequenceVectorization

We test different similarities based on [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

In [None]:
#export
#@unique
class VectorizationType(Enum):
    '''Vectorization techniques an experiment can be configured with.'''
    word2vec = 1
    doc2vec = 2
    vsm2vec = 3

In [None]:
VectorizationType.word2vec

In [None]:
#export
#@unique
class LinkType(Enum):
    '''Kinds of traceability links (source artifact to target artifact).'''
    req2tc = 1
    req2src = 2
    issue2src = 3
    pr2src = 4

In [None]:
#export
#@unique
class DistanceMetric(Enum):
    '''Identifiers of the distance metrics used as metric keys and column labels.'''
    WMD = 1
    COS = 2
    SCM = 3
    EUC = 4
    MAN = 5

In [None]:
#export
#@unique
class SimilarityMetric(Enum):
    '''Identifiers of the similarity metrics used as metric keys and column labels.'''
    WMD_sim = 1
    COS_sim = 2
    SCM_sim = 3
    EUC_sim = 4
    MAN_sim = 5
    Pearson = 6

In [None]:
class EntropyMetric(Enum):
    '''Identifiers of the information-theoretic metrics.'''
    MSI_I = 1  # Minimum shared information: entropy
    MSI_X = 2  # Minimum shared information: extropy
    MI = 3     # Mutual information
    JI = 4     # Joint information

In [None]:
class SoftwareArtifacts(Enum):
    '''Software artifact kinds: requirements (REQ), test cases (TC) and source code (SRC).'''
    REQ = 1
    TC = 2
    SRC = 3

In [None]:
#export
#@unique
class Preprocessing(Enum):
    '''Corpus preprocessing flavours: conventional tokens (conv) or BPE sub-words (bpe).'''
    conv = 1
    bpe = 2

In [None]:
LinkType.req2tc

In [None]:
Preprocessing.bpe

# Experiments Set-up

In [None]:
path_data = '../dvc-ds4se/' #dataset path

In [None]:

path_to_trained_model = path_data+'models/wv/bpe128k/[word2vec-Java-Py-SK-500-20E-128k-1594873397.267055].model'
#path_to_trained_model = path_data/'models/wv/bpe128k/[word2vec-Java-Py-Wiki-SK-500-20E-128k[15]-1595189771.501188].model'

In [None]:
#Testing default params
def default_params():
    '''Return the baseline configuration used to smoke-test the set-up.

    The values come from the module-level `path_to_trained_model` and `path_data`.
    NOTE(review): this dictionary has no 'system_path_config', 'source_type' or
    'target_type' entry, all of which BasicSequenceVectorization.__init__ reads.'''
    config = dict(
        vectorizationType = VectorizationType.word2vec,
        linkType = LinkType.req2tc,
        system = 'sacp',
        path_to_trained_model = path_to_trained_model,
        source_path = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv',
        #target_path = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv',
        #system_path = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv',
        saving_path = path_data +'se-benchmarking/traceability',
        names = ['Source','Target','Linked?'],
    )
    return config

In [None]:
#Experiment 1 with Libest Conv preprocessing
def libest_params():
    '''Return the configuration for the LibEST experiment with conventional preprocessing.

    NOTE(review): 'linkType' is req2src while 'target_type' is SoftwareArtifacts.TC, and
    the artifact types are enum members here but plain strings ('req', 'tc') in
    libest_params_bpe. Confirm which form the corpus 'type' column expects.'''
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        "sep": '~',
        "names": ['ids','conv'],
        "prep": Preprocessing.conv
    }
    experiment = {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.req2src,
        "system": 'libest',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.REQ,
        "target_type": SoftwareArtifacts.TC,
        "system_path_config": corpus_config,
        "saving_path": path_data + 'se-benchmarking/traceability',
        "names": ['Source','Target','Linked?']
    }
    return experiment

In [None]:
#Experiment 2 with Libest BPE preprocessing
def libest_params_bpe():
    '''Return the configuration for the LibEST experiment with BPE (128k) preprocessing.'''
    corpus_config = {
        "system_path": path_data + 'se-benchmarking/traceability/cisco/libest_data/[libest-all-corpus-1596063103.098236].csv',
        "sep": '~',
        "names": ['ids','bpe128k'],
        "prep": Preprocessing.bpe
    }
    experiment = {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.req2src,
        "system": 'libest',
        "path_to_trained_model": path_to_trained_model,
        "source_type": 'req', #TODO Standardize the artifacts
        "target_type": 'tc',
        #"path_mappings": 'cisco/libest_data/sacp-pr-mappings.csv',
        "system_path_config": corpus_config,
        "saving_path": path_data + 'se-benchmarking/traceability',
        "names": ['Source','Target','Linked?']
    }
    return experiment

In [None]:
#CISCO GitHub Parameters
def sacp_params():
    '''Return the configuration for the Cisco sacp-python-common experiment
    with conventional preprocessing.

    Traces pull requests ('pr') to Python files ('py'); the ground truth is read
    from the CSV referenced by 'path_mappings'.'''
    return {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_data + 'models/wv/conv/[word2vec-Py-Java-Wiki-SK-500-20E[0]-1592979270.711115].model',
        "source_type": 'pr', #TODO Standardize the artifacts
        "target_type": 'py',
        "path_mappings": '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
        "system_path_config": {
            # The corpus file MUST also contain a 'bpe8k' column:
            # BasicSequenceVectorization.__init__ reads it to build the vocabulary.
            "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1596383717.992744].csv',
            "sep": '~',
            "names": ['ids','conv'],
            "prep": Preprocessing.conv
        },
        # path_data is a plain str, so it must be concatenated with '+';
        # the previous '/' operator raised TypeError (str / str).
        "saving_path": path_data + 'se-benchmarking/traceability/cisco/sacp',
        "names": ['Source','Target','Linked?']
    }

In [None]:
path_to_trained_model = path_data + 'models/wv/bpe8k/[word2vec-Java-Py-Wiki-SK-500-20E-8k[12]-1594546477.788739].model'

In [None]:
def sacp_params_bpe():
    '''Return the configuration for the Cisco sacp-python-common experiment
    with BPE (8k) preprocessing.

    'path_to_trained_model' is taken from the module-level variable of the same name.'''
    corpus_config = {
        "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1596383717.992744].csv',
        "sep": '~',
        "names": ['ids','bpe8k'],
        "prep": Preprocessing.bpe
    }
    experiment = {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_to_trained_model,
        "source_type": 'pr', #TODO Standardize the artifacts
        "target_type": 'py',
        "path_mappings": '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
        "system_path_config": corpus_config,
        "saving_path": path_data + 'se-benchmarking/traceability/cisco/sacp',
        "names": ['Source','Target','Linked?'],
        "model_prefix":path_data + 'models/bpe/sentencepiece/wiki_py_java_bpe_8k' #For BPE Analysis
    }
    return experiment

In [None]:
# Select the active experiment configuration: uncomment exactly one assignment.
#parameters = default_params()
#parameters = libest_params()
#parameters = sacp_params()
parameters = sacp_params_bpe()
#parameters = libest_params_bpe()
parameters

### Testing experiments set-up

In [None]:
#tst
parameters['system_path_config']['system_path']

In [None]:
#tst
parameters['system_path_config']['names'][1]

In [None]:
parameters['system_path_config']['sep'] #tst

In [None]:
#tst
df_all_system = pd.read_csv(
            parameters['system_path_config']['system_path'], 
            #names = params['system_path_config']['names'], #include the names into the files!!!
            header = 0, 
            index_col = 0, 
            sep = parameters['system_path_config']['sep'] 
        )

In [None]:
df_all_system.head(1)

In [None]:
#tst
tag = parameters['system_path_config']['names'][1]
[doc.split() for doc in df_all_system[df_all_system[tag].notnull()][tag].values]

In [None]:
len(df_all_system[tag].values) #tst

In [None]:
#tst
len(df_all_system[df_all_system[tag].notnull()]) #some files are _init_ thefore are empty

In [None]:
#tst
df_all_system[df_all_system[tag].notnull()][tag].values

In [None]:
#tst
df_all_system.loc[df_all_system['type'] == parameters['source_type']][parameters['system_path_config']['names']]

In [None]:
df_all_system.loc[df_all_system['type'] == parameters['target_type']][parameters['system_path_config']['names']]

### Defining BasicSequenceVectorization

In [None]:
#tst
print(list(VectorizationType), list(DistanceMetric), list(SimilarityMetric), list(LinkType))

In [None]:
#export
class BasicSequenceVectorization():
    '''Base class for sequence (vanilla) vectorization; other classes can inherit from it.

    It loads the system corpus, splits it into source and target artifacts and offers
    link sampling, vector distance helpers, link persistence and ground-truth matching.
    Subclasses must provide `distance(metric_list, link)`, which `computeDistanceMetric`
    calls for every link.'''

    def __init__(self, params):
        '''Load the corpus and prepare documents, dictionary and vocabulary.

        @params dictionary with the experiment configuration. The keys read here are
        'system_path_config' ('system_path', 'sep', 'names', 'prep'), 'source_type'
        and 'target_type'. `params` is also handed to ConventionalPreprocessing.
        Raises ValueError when 'prep' is not a supported Preprocessing member.'''

        self.params = params
        self.df_nonground_link = None
        self.df_ground_link = None
        self.prep = ConventionalPreprocessing(params, bpe = True)

        path_config = params['system_path_config']
        self.df_all_system = pd.read_csv(
            path_config['system_path'],
            #names = path_config['names'], #include the names into the files!!!
            header = 0,
            index_col = 0,
            sep = path_config['sep']
        )

        columns = path_config['names']
        #Use the configuration of THIS instance (it used to read the global `parameters`)
        tag = columns[1]
        artifact_type = self.df_all_system['type']
        #.copy() so the fillna below writes into independent frames, not into views of the corpus
        self.df_source = self.df_all_system.loc[artifact_type == params['source_type'], columns].copy()
        self.df_target = self.df_all_system.loc[artifact_type == params['target_type'], columns].copy()

        #NA verification
        self.df_source[tag] = self.df_source[tag].fillna("")
        self.df_target[tag] = self.df_target[tag].fillna("")

        prep_type = path_config['prep']
        if prep_type == Preprocessing.conv: #if conventional preprocessing
            not_null = self.df_all_system[tag].notnull()
            self.documents = [doc.split() for doc in self.df_all_system[not_null][tag].values] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            logging.info("conventional preprocessing documents and dictionary")
        elif prep_type == Preprocessing.bpe:
            #NOTE(security): eval() executes whatever the corpus file contains. Only use trusted
            #corpora; ast.literal_eval is the safe alternative for plain list literals.
            self.documents = [eval(doc) for doc in self.df_all_system[tag].values] #Preparing Corpus
            self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
            logging.info("bpe preprocessing documents and dictionary")
        else:
            #Fail early instead of leaving `documents` and `dictionary` undefined
            raise ValueError('Unsupported preprocessing: {}'.format(prep_type))

        ####INFO science params
        #The vocabulary is always taken from the 'bpe8k' column, so the corpus must have it.
        #Every cell holds the string form of a token list: it has to be parsed before building
        #the set, otherwise set(doc) collects single characters instead of tokens.
        abstracted_vocab = set()
        for doc in self.df_all_system[ 'bpe8k' ].values:
            if isinstance(doc, str): #missing cells (NaN) contribute no tokens
                abstracted_vocab.update(eval(doc)) #same trust caveat as above
        sp_bpe = self.prep.sp_bpe
        self.vocab = {sp_bpe.id_to_piece(piece_id): 0 for piece_id in range(sp_bpe.get_piece_size())}
        dict_abs_vocab = { elem : 0 for elem in abstracted_vocab - set(self.vocab) } #Ignored vocab by BPE
        self.vocab.update(dict_abs_vocab) #Updating

        #This can be extended for future metrics <---------------------
        #Maps every metric key to the labels of the values its function returns, in order
        self.dict_labels = {
            DistanceMetric.COS:[DistanceMetric.COS, SimilarityMetric.COS_sim],
            SimilarityMetric.Pearson:[SimilarityMetric.Pearson],
            DistanceMetric.EUC:[DistanceMetric.EUC, SimilarityMetric.EUC_sim],
            DistanceMetric.WMD:[DistanceMetric.WMD, SimilarityMetric.WMD_sim],
            DistanceMetric.SCM:[DistanceMetric.SCM, SimilarityMetric.SCM_sim],
            DistanceMetric.MAN:[DistanceMetric.MAN, SimilarityMetric.MAN_sim],
            EntropyMetric.MSI_I:[EntropyMetric.MSI_I, EntropyMetric.MSI_X],
            EntropyMetric.MI:[EntropyMetric.JI, EntropyMetric.MI]
        }

    def ground_truth_processing(self, path_to_ground_truth = '', from_mappings = False):
        '''Optional step when the corpus has ground truth: builds the list of (source, target) links.

        @path_to_ground_truth text file where each line is "<source> <target> <target> ...";
        only read when from_mappings is False
        @from_mappings when True the links come from the CSV at params['path_mappings']
        (columns 'id_pr' and 'doc_id')
        Returns a list of (source, target) tuples.'''

        if from_mappings:
            df_mapping = pd.read_csv(self.params['path_mappings'], header = 0, sep = ',')
            return list(zip(df_mapping['id_pr'].astype(str), df_mapping['doc_id']))

        ground_links = []
        with open(path_to_ground_truth, 'r') as ground_truth: #closed even if parsing fails
            for line in ground_truth:
                fields = line.split()
                if not fields: #blank lines carry no links
                    continue
                ground_links.extend( (fields[0], elem) for elem in fields[1:] )
        assert len(ground_links) == len(set(ground_links)), 'Redundant links in the ground truth file'
        return ground_links

    def samplingLinks(self, sampling = False, samples = 10, basename = False):
        '''Build the candidate links as the cartesian product source x target.

        @sampling when True only `samples` random links are returned
        @samples number of links to draw when sampling
        @basename when True the ids are reduced to their file basename'''

        source = self.df_source['ids'].values
        target = self.df_target['ids'].values
        if basename:
            source = [os.path.basename(elem) for elem in source]
            target = [os.path.basename(elem) for elem in target]

        links = list( product( source , target ) )
        if sampling:
            links = sample( links, samples )
        return links

    def cos_scipy(self, vector_v, vector_w):
        '''Return [cosine distance, cosine similarity] of two vectors.'''
        cos =  distance.cosine( vector_v, vector_w )
        return [cos, 1.-cos]

    def euclidean_scipy(self, vector_v, vector_w):
        '''Return [euclidean distance, similarity] of two vectors.'''
        dst = distance.euclidean(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def manhattan_scipy(self, vector_v, vector_w):
        '''Return [manhattan (cityblock) distance, similarity] of two vectors.'''
        dst = distance.cityblock(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity

    def pearson_abs_scipy(self, vector_v, vector_w):
        '''Return [|pearson correlation|] of two vectors.
        We are not sure that pearson correlation works well on doc2vec inference vectors'''
        logging.info("pearson_abs_scipy" + str(vector_v) + "__" + str(vector_w))
        corr, _ = pearsonr(vector_v, vector_w)
        return [abs(corr)] #Absolute value of the correlation

    def computeDistanceMetric(self, links, metric_list):
        '''Compute every metric of metric_list for every link.

        @links iterable of (source id, target id)
        @metric_list keys of self.dict_labels
        Returns (rows, labels): each row is [source, target, value_1, ...] and labels
        names the value columns in the same order.'''

        #Flattening with a comprehension also copes with an empty metric_list
        metric_labels = [ label for metric in metric_list for label in self.dict_labels[metric] ]
        distSim = [ [link[0], link[1]] + self.distance( metric_list, link ) for link in links ]
        return distSim, metric_labels

    def ComputeDistanceArtifacts(self, metric_list, sampling = False , samples = 10, basename = False):
        '''Activates Distance and Similarity Computations; the result is stored in self.df_nonground_link
        @metric_list metrics to compute (keys of self.dict_labels)
        @sampling is False by the default
        @samples is the number of samples (or links) to be generated
        @basename is forwarded to samplingLinks'''
        links_ = self.samplingLinks( sampling, samples, basename )

        docs, metric_labels = self.computeDistanceMetric( metric_list=metric_list, links=links_) #checkpoints
        link_columns = [self.params['names'][0], self.params['names'][1]]
        self.df_nonground_link = pd.DataFrame(docs, columns = link_columns + metric_labels) #Transforming into a Pandas
        logging.info("Non-groundtruth links computed")

    def SaveLinks(self, grtruth=False, sep=' ', mode='a'):
        '''Write the computed links to a timestamped CSV file.

        @grtruth when True self.df_ground_link is saved, otherwise self.df_nonground_link
        @sep column separator of the file
        @mode file mode handed to DataFrame.to_csv
        Raises ValueError when the requested links have not been computed yet.'''
        df_links = self.df_ground_link if grtruth else self.df_nonground_link
        if df_links is None:
            raise ValueError('There are no computed links to save (grtruth={})'.format(grtruth))

        timestamp = datetime.timestamp(datetime.now())
        #NOTE(review): no path separator is inserted between saving_path and the file name;
        #LoadLinks builds the path the same way, so both must be changed together.
        path_to_link = self.params['saving_path'] + '['+ self.params['system'] + '-' + str(self.params['vectorizationType']) + '-' + str(self.params['linkType']) + '-' + str(grtruth) + '-{}].csv'.format(timestamp)

        df_links.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)
        logging.info('Saving in...' + path_to_link)

    def findDistInDF(self, g_tuple, from_mappings=False, semeru_format=False):
        '''Return the index values of the rows of self.df_ground_link matching a ground-truth link.

        @g_tuple (source, target) of the ground truth
        @from_mappings the source must be equal to g_tuple[0] (to avoid matching number
        substrings) and the target must contain g_tuple[1]
        @semeru_format both source and target are matched by substring
        By default the text before the first '.' of each element is used: the source must
        contain "<stem>-" and the target must contain "<stem>".
        All substring matches are literal, the ids are never interpreted as regular expressions.'''

        sources = self.df_ground_link[self.params['names'][0]]
        targets = self.df_ground_link[self.params['names'][1]]

        if from_mappings:
            matched = sources.eq(g_tuple[0]) & targets.str.contains(g_tuple[1], regex=False)
            logging.info('findDistInDF: from_mappings')
        elif semeru_format:
            matched = sources.str.contains(g_tuple[0], regex=False) & targets.str.contains(g_tuple[1], regex=False)
            logging.info('findDistInDF: semeru_format')
        else:
            #partition keeps the whole id when it has no '.', slicing with find() dropped its last character
            source_stem = g_tuple[0].partition('.')[0]
            target_stem = g_tuple[1].partition('.')[0]
            matched = sources.str.contains(source_stem + '-', regex=False) & targets.str.contains(target_stem, regex=False)
            logging.info('findDistInDF: default')
        return self.df_ground_link.loc[matched].index.values

    def MatchWithGroundTruth(self, path_to_ground_truth='', from_mappings=False, semeru_format=False ):
        '''Label the computed links with the ground truth; the result is stored in self.df_ground_link.

        self.df_ground_link is a copy of self.df_nonground_link with one more column,
        params['names'][2], which is 1 for the links found in the ground truth and 0 otherwise.
        The arguments are forwarded to ground_truth_processing and findDistInDF.
        Raises ValueError when the non-groundtruth links have not been computed yet.'''
        if self.df_nonground_link is None:
            raise ValueError('Non-groundtruth links must be computed before matching the ground truth')

        linked = self.params['names'][2]
        self.df_ground_link = self.df_nonground_link.copy()
        self.df_ground_link[linked] = 0

        ground_links = self.ground_truth_processing(path_to_ground_truth, from_mappings)
        matchGT = [ self.findDistInDF( g , from_mappings=from_mappings, semeru_format=semeru_format ) for g in ground_links ]
        if matchGT: #an empty ground truth leaves every link as 0
            #A row can be matched by several ground-truth links: keep each index once
            matched_index = np.unique( np.concatenate(matchGT) )
            self.df_ground_link.loc[matched_index, linked] = 1

        logging.info("Groundtruth links computed")

### Testing BasicSequenceVectorization

In [None]:
general2vec =  BasicSequenceVectorization(params = parameters)

In [None]:
general2vec.vocab

In [None]:
general2vec.documents

In [None]:
general2vec.dictionary

In [None]:
general2vec.df_all_system.head(1)

In [None]:
general2vec.df_all_system.shape #data final tensor

In [None]:
#tst for libest
path_to_ground_truth = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'
general2vec.ground_truth_processing(path_to_ground_truth)

In [None]:
#tst for sacp
general2vec.ground_truth_processing(parameters['path_mappings'], from_mappings = True)

# Artifacts Similarity with Word2Vec

In [None]:
from collections import Counter
import dit

In [None]:
import math

In [None]:
#export
class Word2VecSeqVect(BasicSequenceVectorization):
    '''Sequence vectorization backed by a trained gensim word2vec model.

    It adds the sentence-level metrics (WMD, SCM, minimum shared information and
    mutual/joint information) and the `distance` entry point that the base class
    calls for every link.'''

    def __init__(self, params):
        '''Load the trained model and build the similarity structures.

        @params experiment configuration; 'path_to_trained_model' is read here, the
        base class reads the rest.'''
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        #Every function receives the two token lists of a link and returns a list of values.
        #NOTE(review): cos_scipy and pearson_abs_scipy are written for numeric vectors while
        #`distance` hands over token lists; confirm before using COS or Pearson here.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi,
            EntropyMetric.MI: self.mutual_info
        }

    def wmd_gensim(self, sentence_a, sentence_b ):
        '''Return [word mover's distance, associated similarity] of two token lists.'''
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        '''Map a distance into the similarity 1/(1+dist).'''
        return 1./( 1.+float( dist ) ) #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim; returns [1 - similarity, similarity]'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information
        Returns [entropy, extropy] of the distribution of the tokens shared by both
        sentences, or [nan, nan] when they share no token of the vocabulary.'''
        token_counts_1 = self.get_cnts(sentence_a, self.vocab)
        token_counts_2 = self.get_cnts(sentence_b, self.vocab)
        logging.info('token count processed')
        #Minimum Shared Tokens
        token_counts = { token: min(token_counts_1[token],token_counts_2[token]) for token in self.vocab }

        #Same iteration order as get_freqs, so every outcome is paired with its own frequency
        alphabet = list(token_counts)
        frequencies = self.get_freqs(token_counts)
        logging.info('frequencies processed')

        if not frequencies: #no shared information
            return [float('nan'), float('nan')]

        scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
        logging.info('scalar_distribution processed')

        entropies = dit.shannon.entropy( scalar_distribution )
        logging.info('entropies processed')

        extropies = dit.other.extropy( scalar_distribution )
        logging.info('extropies processed')
        return [entropies,extropies]

    def mutual_info(self, sentence_a, sentence_b):
        """Mutual information and Joint Information
        Returns [joint entropy, mutual information], where the mutual information is
        entropy(source) + entropy(target) - joint entropy. When one of the distributions
        has no tokens the result is [nan, nan]."""
        token_counts_1 = self.get_cnts(sentence_a, self.vocab)
        token_counts_2 = self.get_cnts(sentence_b, self.vocab)
        logging.info('token count processed')

        #Alphabets follow the order get_freqs iterates, keeping outcomes and frequencies aligned
        alphabet_source = list(token_counts_1)
        logging.info('alphabet_source #'+ str(len(alphabet_source)))
        alphabet_target = list(token_counts_2)
        logging.info('alphabet_target #'+ str(len(alphabet_target)))

        logging.info('vocab #'+ str(len(self.vocab.keys())))
        logging.info('diff #'+ str(set(token_counts_1.keys()) - set(token_counts_2.keys())))

        #Joint counts
        token_counts = { token: (token_counts_1[token] + token_counts_2[token]) for token in self.vocab }
        alphabet = list(token_counts)
        logging.info('alphabet #'+ str(len(alphabet)))

        frequencies_source = self.get_freqs( token_counts_1 )
        frequencies_target = self.get_freqs( token_counts_2 )
        frequencies = self.get_freqs(token_counts)
        #An empty document yields no frequencies and no valid distribution can be built
        if not (frequencies_source and frequencies_target and frequencies):
            return [float('nan'), float('nan')]

        #Computing Self-Information (or Entropy)
        entropy_source = dit.shannon.entropy( dit.ScalarDistribution(alphabet_source, frequencies_source) )
        entropy_target = dit.shannon.entropy( dit.ScalarDistribution(alphabet_target, frequencies_target) )

        #Computing Joint-information
        joint_entropy = dit.shannon.entropy( dit.ScalarDistribution(alphabet, frequencies) )

        mutual_information = entropy_source + entropy_target - joint_entropy
        return [joint_entropy, mutual_information]

    def _artifact_tokens(self, df_artifacts, artifact_id):
        '''Return the token list of the artifact `artifact_id` of `df_artifacts`.'''
        path_config = self.params['system_path_config'] #configuration of THIS instance
        ids = path_config['names'][0]
        txt = path_config['names'][1]

        #Prefer the exact id: a substring search for '1' would also hit '10' or '21'.
        #The literal substring search remains as a fallback for links built from basenames.
        rows = df_artifacts[df_artifacts[ids] == artifact_id]
        if rows.empty:
            rows = df_artifacts[df_artifacts[ids].str.contains(artifact_id, regex=False)]
        if rows.empty:
            raise IndexError('No artifact matches the id {!r}'.format(artifact_id))
        text = rows[txt].values[0]

        if path_config['prep'] == Preprocessing.conv: #if conventional preprocessing
            return text.split()
        if path_config['prep'] == Preprocessing.bpe:
            #NOTE(security): eval() executes the cell content, only use trusted corpora
            return eval(text)
        raise ValueError('Unsupported preprocessing: {}'.format(path_config['prep']))

    def distance(self, metric_list,link):
        '''Compute the metrics of metric_list for one link.

        @metric_list keys of self.dict_distance_dispatcher
        @link (source id, target id)
        Returns the flat list of values of every metric, in the order of metric_list.'''
        sentence_a = self._artifact_tokens(self.df_source, link[0])
        sentence_b = self._artifact_tokens(self.df_target, link[1])

        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist, []) #Always return a list

    #################################3TODO substitute this block in the future by importing information science module
    def get_cnts(self, toks, vocab):
        '''@danaderp
        Counts tokens within ONE document
        Every token of vocab is present in the result, with 0 when it is not in toks.'''
        cnt = Counter(vocab)
        cnt.update(toks)
        return cnt

    def get_freqs(self, dict_token_counts):
        '''Return the relative frequency of every token, in the iteration order of the
        dictionary, or [] when the counts add up to zero.'''
        num_tokens = sum( dict_token_counts.values() ) #number of subwords inside the document
        if num_tokens == 0:
            logging.info('---------------> NO SHARED INFORMATION <-------------------------')
            return []
        return [ count/num_tokens for count in dict_token_counts.values() ]
    #################################3
    #################################3


### Testing BasicSequenceVectorization

In [None]:
#export
def LoadLinks(timestamp, params, grtruth=False, sep=' ' ):
    '''Read back a link computation that was saved at a given timestamp.
    @timestamp is the version of the model for a given system
    @params experiment configuration ('saving_path', 'system', 'vectorizationType', 'linkType')
    @grtruth selects the ground-truth (True) or the non-ground-truth (False) file
    @sep column separator of the saved file
    Returns a pandas DataFrame indexed by the first column of the file.'''

    name_parts = [
        params['system'],
        str(params['vectorizationType']),
        str(params['linkType']),
        str(grtruth),
        '{}'.format(timestamp),
    ]
    links_file = params['saving_path'] + '[' + '-'.join(name_parts) + '].csv'

    logging.info("Loading computed links from... "+ links_file)

    df_links = pd.read_csv(links_file, header=0, index_col=0, sep=sep)
    return df_links

### Testing Word2Vec SequenceVectorization

In [None]:
#hide
#tst
metric_list = ['a','b']
A = [[1,3,4],[4,5],[1,8,9,7]]
B = ((1,3,4),(4,5),(1,8,9,7))
functools.reduce(lambda a,b : a+b, B)
dist_sim_T = [([12,13],['metric1','metric2']),([12,13],['metric1','metric2'])]
dist_sim_T
separated_merged_list_a = functools.reduce(lambda a,b : a[1]+b[1], dist_sim_T)
separated_merged_list_a

In [None]:
#[step 1]Creating the Vectorization Class
word2vec = Word2VecSeqVect( params = parameters )

In [None]:
len(word2vec.new_model.wv.vocab)

In [None]:
word2vec.df_source['ids'][0]

In [None]:
ids = parameters['system_path_config']['names'][0]
txt = parameters['system_path_config']['names'][1]
print(ids,txt)

In [None]:
idss = word2vec.df_source[ids][35] #Selecting an ID
idss = word2vec.df_source[ids] == idss #Search for an specific ID
list(word2vec.df_source[idss][txt])[0].split() #Retrieving text and splitting

In [None]:
word2vec.df_source.head()

In [None]:
word2vec.df_target.head()

In [None]:
links = word2vec.samplingLinks(sampling=True, samples = 2)
links

In [None]:
print( len(links), word2vec.df_source.shape, word2vec.df_target.shape )

In [None]:
links[0][0]

In [None]:
#tst
#word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0].split() #conventioanal
eval(word2vec.df_source[word2vec.df_source[ids].str.contains(links[0][0])][txt].values[0]) #BPE

In [None]:
#tst
word2vec.df_target[word2vec.df_target[ids].str.contains(links[0][1])][txt].values[0].split()

### Running Example and Experimentation

In [None]:
#metric_list = [DistanceMetric.WMD,DistanceMetric.SCM,EntropyMetric.MSI_I]
metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]

In [None]:
#[optional] computeDistanceMetric Testing [WARNING!] Time Consuming!!
computeDistanceMetric = word2vec.computeDistanceMetric(links, metric_list = metric_list )
computeDistanceMetric

In [None]:
#[step 2]NonGroundTruth Computation
word2vec.ComputeDistanceArtifacts( sampling=False, samples = 5, metric_list = metric_list )
word2vec.df_nonground_link.head()

In [None]:
word2vec.df_nonground_link.head()

In [None]:
#tst
#df_mapping = pd.read_csv(parameters['path_mappings'], header = 0, sep = ',')
#from_mappings is a boolean flag; the string 'True' only worked because any non-empty string is truthy
ground_links = word2vec.ground_truth_processing(from_mappings=True)
ground_links

In [None]:
len(ground_links)

In [None]:
#tst
df_x = word2vec.df_nonground_link
df_x.head()

In [None]:
ground_links[0][0]

In [None]:
#tst
df_x[(df_x["Source"].eq(ground_links[0][0]) ) & (df_x["Target"].str.contains(ground_links[0][1], regex=False))]

In [None]:
def find_index_gt( tuple_g ):
    '''Return the index values of the rows of the notebook-level `df_x` matching a ground-truth link.

    The source must be equal to tuple_g[0] and the target must contain tuple_g[1] (literal match).'''
    #Series.eq compares element-wise; the .str accessor has no `eq` method and raised AttributeError
    is_match = df_x["Source"].eq(tuple_g[0]) & df_x["Target"].str.contains(tuple_g[1], regex=False)
    return df_x.loc[is_match].index.values
#dist

In [None]:
matchGT = [ word2vec.findDistInDF( g , from_mappings=True ) for g in word2vec.ground_truth_processing(from_mappings=True)]
matchGT

In [None]:
matchGT = functools.reduce(lambda a,b : np.concatenate([a,b]), matchGT) #Concatenate indexes
matchGT

In [None]:
new_column = pd.Series(np.full([len(matchGT)], 1 ), name=word2vec.params['names'][2], index = matchGT)

In [None]:
new_column

In [None]:
#Some of the mappings are not found in the non-ground-truth link list because the mappings hold the whole ground truth of the issues,
#so they might include files that were not taken into account when the non-ground-truth links were computed
matchGT_ = [ (g,word2vec.findDistInDF( g , from_mappings=True )) for g in word2vec.ground_truth_processing(from_mappings=True)]

In [None]:
matchGT_

In [None]:
print(matchGT_)

In [None]:
len(matchGT)

In [None]:
#[step 3]Saving Non-GroundTruth Links
word2vec.SaveLinks()

In [None]:
# Loading Non-GroundTruth Links (replace the timestamp with the one assigned in the previous saving step)
df_nonglinks = LoadLinks(timestamp=1596416340.728148, params=parameters)
df_nonglinks.head()

In [None]:
#[step 4]GroundTruthMatching Testing
path_to_ground_truth = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
word2vec.df_ground_link

In [None]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

In [None]:
df_z = word2vec.df_ground_link
# Keep only the rows that hold no NaN/inf value in any column.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
df_z[~df_z.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [None]:
#debug
df_y = word2vec.df_ground_link.copy()
df_y

In [None]:
#debug
df_y.update(new_column)

In [None]:
new_column

In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1]

In [None]:
word2vec.df_ground_link[word2vec.df_ground_link['Linked?'] == 1].shape #Positive Links

In [None]:
#[optional]GroundTruth Direct Processing
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links[141] # A tuple

In [None]:
#Inspecting Source
ground_links[141][0][:ground_links[141][0].find('.')] + '-'

In [None]:
#Inspecting Target
ground_links[141][1][:ground_links[141][1].find('.')]

In [None]:
#[step 5]Saving GroundTruth Links
word2vec.SaveLinks(grtruth = True)

In [None]:
# Loading GroundTruth Links (grtruth = True); replace the timestamp with the one assigned in the previous saving step
df_glinks = LoadLinks(timestamp=1596426181.318831, params=parameters,grtruth = True)
df_glinks.head()

# Software Traceability with Artifacts Representation 
We are employing two techniques for analyzing software artifacts without groundtruth:
- Prototypes and Criticisms for Paragraph Vectors 
- Information Theory for Software Traceability (Shared Information and Mutual Information)

# Approach Evaluation and Interpretation (word2vec)
Classification/evaluation metrics for highly imbalanced data [(see Forum)](https://stats.stackexchange.com/questions/222558/classification-evaluation-metrics-for-highly-imbalanced-data).

In [None]:
#export
class VectorEvaluation():
    """Base for the evaluations and interpretations (statistical analysis) common to the approaches."""

    def __init__(self, sequenceVectorization):
        # Hold on to the vectorization object that is being evaluated.
        vectorization = sequenceVectorization
        self.seqVect = vectorization

In [None]:
%matplotlib inline

In [None]:
#export
class SupervisedVectorEvaluation(VectorEvaluation):
    '''Ground-truth based evaluation (PRG, precision-recall, ROC) of similarity metrics.

    Labels are read from the 'Linked?' column of the vectorization's
    `df_ground_link` frame; scores are read from one column per entry of `sim_list`.
    '''
    def __init__(self, sequenceVectorization, sim_list):
        '''
        sequenceVectorization: object exposing `df_ground_link` (DataFrame) and `params`.
        sim_list: labels of the `df_ground_link` columns to use as scores.
        '''
        super().__init__(sequenceVectorization)
        self.sim_list = sim_list

        # Discard every row that holds a NaN or infinite value in any column.
        # `axis` is passed by keyword because pandas 2.x rejects it positionally.
        ground_links = sequenceVectorization.df_ground_link
        invalid_rows = ground_links.isin([np.nan, np.inf, -np.inf]).any(axis=1)
        self.df_filtered = ground_links[~invalid_rows]

        #CreateFilters Here

        self.y_test = self.df_filtered['Linked?'].values
        self.y_score = [self.df_filtered[sim].values for sim in sim_list]  # one array per similarity
        self.title = str(sequenceVectorization.params['vectorizationType'])

    def _no_skill(self):
        '''Proportion of the positive class; used as the no-skill precision baseline.'''
        return len(self.y_test[self.y_test==1]) / len(self.y_test)

    def Compute_precision_recall_gain(self):
        '''One might choose PRG if there is little interest in identifying false negatives '''
        # NOTE(review): `prg` is not imported in the visible part of the notebook -- confirm it is in scope.
        for sim, scores in zip(self.sim_list, self.y_score):
            prg_curve = prg.create_prg_curve(self.y_test, scores)
            auprg = prg.calc_auprg(prg_curve)
            prg.plot_prg(prg_curve)
            logging.info('auprg:  %.3f' %  auprg)
            logging.info("compute_precision_recall_gain Complete: "+str(sim))

    def Compute_avg_precision(self):
        '''Plot one precision-recall curve per similarity (a separate figure each) and log AP and AUC.'''

        # calculate the no skill line as the proportion of the positive class
        no_skill = self._no_skill()

        for sim, scores in zip(self.sim_list, self.y_score):
            plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') #reference curve
            precision, recall, _ = precision_recall_curve(self.y_test, scores) #compute precision-recall curve
            plt.plot(recall, precision, marker='.', label = str(sim)) #plot model curve
            # The instance has no `label` attribute; `title` is the one set in __init__
            # and used by the other plotting methods.
            plt.title(self.title)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.legend() #show the legend
            plt.show() #show the plot

            average_precision = average_precision_score(self.y_test, scores)
            auc_score = auc(recall, precision)
            logging.info('Average precision-recall score: {0:0.2f}'.format(average_precision))
            logging.info('Precision-Recall AUC: %.3f' % auc_score)

    def Compute_avg_precision_same_plot(self):
        '''Plot the precision-recall curves of all similarities on a single figure and log AP and AUC.'''

        # calculate the no skill line as the proportion of the positive class
        no_skill = self._no_skill()
        plt.plot([0, 1], [no_skill, no_skill], linewidth=0.5, linestyle='--', label='No Skill [{0:0.2f}]'.format(no_skill)) #reference curve

        for sim, scores in zip(self.sim_list, self.y_score):
            precision, recall, _ = precision_recall_curve(self.y_test, scores) #compute precision-recall curve
            average_precision = average_precision_score(self.y_test, scores)
            auc_score = auc(recall, precision)
            logging.info('Average precision-recall score: {0:0.2f}'.format(average_precision))
            logging.info('Precision-Recall AUC: %.2f' % auc_score)

            plt.plot(recall, precision, linewidth=1, label = str(sim)+  ' [auc:{0:0.2f}]'.format(auc_score)) #plot model curve

        plt.title(self.title)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend(fontsize=9) #show the legend
        plt.show() #show the plot

    def Compute_roc_curve(self):
        '''Plot the ROC curves of all similarities on a single figure and log each ROC AUC.'''

        plt.plot([0, 1], [0, 1],  linewidth=0.5, linestyle='--', label='No Skill') #reference curve
        for sim, scores in zip(self.sim_list, self.y_score):
            fpr, tpr, _ = roc_curve(self.y_test, scores) #compute roc curve
            roc_auc = roc_auc_score(self.y_test, scores)
            logging.info('ROC AUC %.2f' % roc_auc)

            plt.plot(fpr, tpr,  linewidth=1, label = str(sim)+  ' [auc:{0:0.2f}]'.format(roc_auc)) #plot model curve
        plt.title(self.title)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(fontsize=9) #show the legend
        plt.show() #show the plot

In [None]:
# Similarity columns to evaluate; the evaluator stores one score array per entry, in this order.
similarities = [SimilarityMetric.SCM_sim, SimilarityMetric.WMD_sim]
supevisedEval = SupervisedVectorEvaluation(word2vec, sim_list = similarities ) #<---- Parameter 
#supevisedEval = SupervisedVectorEvaluation(word2vec, sim_list=[SimilarityMetric.WMD_sim])

In [None]:
supevisedEval.y_test

In [None]:
supevisedEval.y_score

## Confusion Matrix

In [None]:
##TODO Move the confusion matrix to SupervisedVectorEvaluation
# y_score holds one score array per similarity metric, so a single metric must be
# selected before thresholding: comparing a whole array with `<=` inside the
# conditional expression is ambiguous and fails for arrays with more than one element.
score_threshold = 0.8  # hard-coded decision threshold
selected_scores = supevisedEval.y_score[0]  # scores of the first entry of `similarities`
y_score_threshold = [0 if elem <= score_threshold else 1 for elem in selected_scores]

In [None]:
#TODO a Variation threshold analysis
tn, fp, fn, tp = confusion_matrix(supevisedEval.y_test, y_score_threshold).ravel()

In [None]:
(tn, fp, fn, tp)

## Precision-Recall-Gain
Based on the library here: [link](https://github.com/meeliskull/prg/tree/master/Python_package). 
The area under traditional PR curves can easily favour models with lower expected F1 score than others, and so the use of Precision-Recall-Gain curves will result in better model selection [(Flach & Kull, 2015)](http://people.cs.bris.ac.uk/~flach//PRGcurves/).
One might choose PRG if there is little interest in identifying false negatives [(from Blog)](https://medium.com/@alexabate/i-did-something-boring-so-you-dont-have-to-9140ca46c84d).

In [None]:
supevisedEval.Compute_precision_recall_gain()

## Compute the average precision score
Precision is a metric that quantifies the number of correct positive predictions made.

Recall is a metric that quantifies the number of correct positive predictions made out of all positive predictions that could have been made.

In [None]:
supevisedEval.Compute_avg_precision_same_plot()

## Compute ROC Curve
An ROC curve (or receiver operating characteristic curve) is a plot that summarizes the performance of a binary classification model on the positive class [(see Blog)](https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-imbalanced-classification/).

Use ROC when detecting both classes is equally important: when we want to give equal weight to the prediction ability on both classes, we should look at the ROC curve [link](https://towardsdatascience.com/what-metrics-should-we-use-on-imbalanced-data-set-precision-recall-roc-e2e79252aeba).

In [None]:
supevisedEval.Compute_roc_curve()

## Compute distribution of similarities word2vec

In [None]:
#Basic Statistics
filter_metrics = supevisedEval.df_filtered #word2vec.df_ground_link
filter_metrics.describe()

In [None]:
filter_metrics.shape

In [None]:
scatter_matrix(filter_metrics, alpha=0.2, figsize=(12, 12), diagonal='kde')

Lag plots are used to check if a data set or time series is random. Random data should not exhibit any structure in the lag plot. Non-random structure implies that the underlying data are not random. The lag argument may be passed, and when lag=1 the plot is essentially data[:-1] vs. data[1:].

In [None]:
lag_plot(filter_metrics[[SimilarityMetric.WMD_sim]])

In [None]:
lag_plot(filter_metrics[DistanceMetric.WMD])

In [None]:
# calculate model precision-recall curve
sim = np.array(filter_metrics[SimilarityMetric.SCM_sim]) #SimilarityMetric.SCM_sim #SimilarityMetric.WMD_sim

In [None]:
filter_metrics.hist(column=[SimilarityMetric.WMD_sim,DistanceMetric.WMD,SimilarityMetric.SCM_sim,
                            DistanceMetric.SCM],color='k',bins=50,figsize=[10,5],alpha=0.5)

In [None]:
errors = filter_metrics[[SimilarityMetric.WMD_sim,DistanceMetric.WMD,SimilarityMetric.SCM_sim,
                            DistanceMetric.SCM]].std()
print(errors)
filter_metrics[[SimilarityMetric.WMD_sim,DistanceMetric.WMD,SimilarityMetric.SCM_sim,
                            DistanceMetric.SCM]].plot.kde()

In [None]:
filter_metrics[SimilarityMetric.WMD_sim].plot.kde()
filter_metrics[SimilarityMetric.WMD_sim].plot.hist(density=True) # Histogram will now be normalized

In [None]:
filter_metrics[SimilarityMetric.SCM_sim].plot.kde()
filter_metrics[SimilarityMetric.SCM_sim].plot.hist(density=True) # Histogram will now be normalized

In [None]:
filter_metrics[DistanceMetric.WMD].plot.kde()
filter_metrics[DistanceMetric.WMD].plot.hist(density=True)

In [None]:
filter_metrics[DistanceMetric.SCM].plot.kde()
filter_metrics[DistanceMetric.SCM].plot.hist(density=True)

In [None]:
filter_metrics.hist(by='Linked?',column=SimilarityMetric.WMD_sim ,figsize=[10, 5],bins=80)

In [None]:
filter_metrics.hist(by='Linked?',column=SimilarityMetric.SCM_sim ,figsize=[10, 5],bins=80)

In [None]:
filter_metrics.hist(by='Linked?',column=DistanceMetric.WMD,figsize=[10, 5],bins=80)

In [None]:
filter_metrics.hist(by='Linked?',column=DistanceMetric.SCM,figsize=[10, 5],bins=80)

In [None]:
boxplot = filter_metrics.boxplot(by='Linked?',column=[SimilarityMetric.WMD_sim,DistanceMetric.WMD,SimilarityMetric.SCM_sim,
                            DistanceMetric.SCM],figsize=[7, 7])

## Entropy Plots

In [None]:
# Work on a copy so that dropping the rows with missing values leaves `filter_metrics` untouched.
filter_metrics_01 = filter_metrics.copy()
filter_metrics_01.dropna(inplace=True)

In [None]:
filter_metrics_01[EntropyMetric.MSI_I]

In [None]:
def compute_spearman_corr(filter_metrics_01, columns = None ):
    '''Return the Spearman rank correlation between two columns of a frame.

    filter_metrics_01: DataFrame holding the metric columns.
    columns: the two column labels to correlate; when omitted it falls back to
        [EntropyMetric.MSI_I, SimilarityMetric.SCM_sim].
    Returns the off-diagonal entry of the 2x2 correlation matrix.
    '''
    if columns is None:
        # Resolved at call time, so no list object is shared between calls.
        columns = [EntropyMetric.MSI_I, SimilarityMetric.SCM_sim]
    # Column selection and corr() both build new frames; no defensive copy is needed.
    correlation = filter_metrics_01[columns].corr(method='spearman')
    # Row 1 of the first column is the correlation between columns[0] and columns[1].
    return correlation[columns[0]].values[1]

In [None]:
# Minimum Shared Entropy and Word Distance
# The correlation shown in the title is computed on the same two columns that are
# plotted (entropy vs. WMD similarity) rather than on the SCM default columns.
x1 = filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.WMD_sim, 
    c='DarkBlue', 
    s=1,
    title = 'WMD-Entropy Correlation {%.2f}' % compute_spearman_corr(filter_metrics_01,[EntropyMetric.MSI_I,SimilarityMetric.WMD_sim] )
)

In [None]:
# Minimum Shared Extropy and Word Distance
# The correlation shown in the title is computed on the same two columns that are
# plotted (extropy vs. WMD similarity).
x1 = filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_X,
    y=SimilarityMetric.WMD_sim, 
    c='DarkBlue', 
    s=1,
    title = 'WMD-Extropy Correlation {%.2f}' % compute_spearman_corr(filter_metrics_01,[EntropyMetric.MSI_X,SimilarityMetric.WMD_sim] )
)

In [None]:
filter_metrics_linked = filter_metrics_01[filter_metrics_01['Linked?'] == 1].copy()
filter_metrics_nonlinked = filter_metrics_01[filter_metrics_01['Linked?'] == 0].copy()


In [None]:
# Linked pairs only: shared entropy vs. SCM similarity. The plot reuses
# `filter_metrics_linked`, the same subset the correlation is computed on.
x2 = filter_metrics_linked.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.SCM_sim, 
    c='Red',
    s=1,
    title = 'Linked SCM-Entropy Correlation {%.2f}' % compute_spearman_corr(filter_metrics_linked)
)
#x2.text(0,0,'test')

In [None]:
x2_ = filter_metrics_nonlinked.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.SCM_sim, 
    c='DarkBlue',
    s=1,
    title = 'non-Linked SCM-Entropy Correlation {%.2f}' % compute_spearman_corr(filter_metrics_nonlinked)
)

In [None]:
#Information levels vs semantics
fig, ax = plt.subplots()
filter_metrics_01.plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.SCM_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions SCM',
    colormap = 'viridis',
    ax = ax,
    s=1
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
#Separated by ground truth Links!
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01['Linked?'] == 1].plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.SCM_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions SCM Linked',
    colormap = 'viridis',
    ax = ax,
    s=1
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
#Separated by ground truth NonLinked!
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01['Linked?'] == 0].plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.SCM_sim,
    #figsize = [6, 5],
    title = 'Information-Semantic Interactions SCM non-Linked',
    colormap = 'viridis',
    ax = ax,
    s=1
)

ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
ax7 = filter_metrics_01.plot.scatter(
    x = EntropyMetric.MSI_X,
    y = EntropyMetric.MSI_I,
    c = SimilarityMetric.SCM_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions SCM',
    colormap = 'viridis',
    s=1
)
ax7.set_xlabel("Minimum Shared Extropy")
ax7.set_ylabel("Minimum Shared Entropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01.plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.WMD_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions WMD',
    colormap = 'viridis',
    ax = ax
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01['Linked?'] == 1].plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.WMD_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions WMD Linked',
    colormap = 'viridis',
    ax = ax
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01['Linked?'] == 0].plot.scatter(
    x = EntropyMetric.MSI_I,
    y = EntropyMetric.MSI_X,
    c = SimilarityMetric.WMD_sim,
    #figsize = [12, 6],
    title = 'Information-Semantic Interactions WMD non-Linked',
    colormap = 'viridis',
    ax = ax
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
filter_metrics.head()

# Artifacts Similarity with Doc2Vec

Try to reproduce the empirical evaluation described here: [link](https://arxiv.org/pdf/1507.07998.pdf). Pay attention to:
- Accuracy vs. Dimensionality (we can replace accuracy for false positive rate or true positive rate)
- Visualize paragraph vectors using t-sne
- Computing Cosine Distance and Similarity. More about similarity [link](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html)

In [None]:
#path_to_trained_model": 'test_data/models/pv/conv/[doc2vec-Py-Java-PVDBOW-500-20E-1592609630.689167].model',
#"path_to_trained_model": 'test_data/models/pv/conv/[doc2vec-Py-Java-Wiki-PVDBOW-500-20E[15]-1592941134.367976].model',
path_to_trained_model = 'test_data/models/[doc2vec-Py-Java-PVDBOW-500-20E-8k-1594572857.17191].model'

In [None]:
def doc2vec_params():
    '''Assemble the parameter dictionary of the doc2vec run on the libest req2tc testbed.'''
    settings = {}
    # What is evaluated
    settings["vectorizationType"] = VectorizationType.doc2vec
    settings["linkType"] = LinkType.req2tc
    settings["system"] = 'libest'
    # Where the model and the corpora are read from
    settings["path_to_trained_model"] = path_to_trained_model
    settings["source_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv'
    settings["target_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv'
    settings["system_path"] = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv'
    # Where results are written and how the link columns are named
    settings["saving_path"] = 'test_data/'
    settings["names"] = ['Source','Target','Linked?']
    return settings

In [None]:
# Materialize the parameters. NOTE: this rebinds the name `doc2vec_params` from the
# function to the dict it returns, so re-running this cell on its own fails
# ('dict' object is not callable) until the defining cell is executed again.
doc2vec_params = doc2vec_params()
doc2vec_params

In [None]:
#Export
class Doc2VecSeqVect(BasicSequenceVectorization):
    '''Sequence vectorization that uses vectors inferred by a pre-trained gensim Doc2Vec model.

    Typical use: call InferDoc2Vec() first to fill `df_inferred_src` /
    `df_inferred_trg`; distance() reads those frames.
    '''

    def __init__(self, params):
        '''params: dict that must provide 'path_to_trained_model' (it is also passed to the base class).'''
        super().__init__(params)
        self.new_model = gensim.models.Doc2Vec.load( params['path_to_trained_model'] )
        # NOTE(review): init_sims is deprecated in gensim 4 -- confirm the installed version.
        self.new_model.init_sims(replace=True)  # Normalizes the vectors of the loaded model.
        self.df_inferred_src = None  # set by InferDoc2Vec
        self.df_inferred_trg = None  # set by InferDoc2Vec

        # Maps each supported metric to the method that computes it.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.EUC: self.euclidean_scipy,
            DistanceMetric.MAN: self.manhattan_scipy
        }

    def distance(self, metric_list, link):
        '''Compute every metric of `metric_list` for one (source id, target id) link.

        Returns the per-metric results joined with `+`.
        '''
        # Ids are matched as literal substrings (regex=False), so ids that contain
        # regex metacharacters such as '.' or '[' are not interpreted as patterns.
        src_rows = self.df_inferred_src['ids'].str.contains(link[0], regex=False)
        trg_rows = self.df_inferred_trg['ids'].str.contains(link[1], regex=False)
        inferred_source = list(self.df_inferred_src[src_rows]['inf-doc2vec'])
        inferred_target = list(self.df_inferred_trg[trg_rows]['inf-doc2vec'])

        dist = [ self.dict_distance_dispatcher[metric](inferred_source,inferred_target) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    def computeDistanceMetric(self, links, metric_list):
        '''Compute the requested metrics for every link.

        Returns a pair: the rows [source, target, *metric values] and the metric
        labels looked up in `self.dict_labels`, joined with `+`.
        '''
        metric_labels = [ self.dict_labels[metric] for metric in metric_list] #tracking of the labels
        distSim = [[link[0], link[1], self.distance( metric_list, link )] for link in links] #Link plus its metric values
        distSim = [[elem[0], elem[1]] + elem[2] for elem in distSim] #Flatten each row into one list

        return distSim, functools.reduce(lambda a,b : a+b, metric_labels)

    def InferDoc2Vec(self, steps=200):
        '''Infer a doc2vec vector for every artifact of the source and target corpora.

        The vectors are stored in the 'inf-doc2vec' column of copies of
        `df_source` / `df_target`.
        '''
        self.df_inferred_src = self.df_source.copy()
        self.df_inferred_trg = self.df_target.copy()

        # NOTE(review): `steps` is the gensim 3.x keyword (gensim 4 names it `epochs`) -- confirm the installed version.
        self.df_inferred_src['inf-doc2vec'] =  [self.new_model.infer_vector(artifact.split(),steps=steps) for artifact in self.df_inferred_src['text'].values]
        self.df_inferred_trg['inf-doc2vec'] =  [self.new_model.infer_vector(artifact.split(),steps=steps) for artifact in self.df_inferred_trg['text'].values]

        logging.info("Infer Doc2Vec on Source and Target Complete")
    

### Testing Doc2Vec SequenceVectorization

In [None]:
doc2vec = Doc2VecSeqVect(params = doc2vec_params)

In [None]:
#[step1]Apply Doc2Vec Inference
doc2vec.InferDoc2Vec(steps=200)

In [None]:
doc2vec.df_inferred_src.head(2)

In [None]:
#test_inferDoc2Vec_trg = inferDoc2Vec(df_target)
#test_inferDoc2Vec_trg.head()
doc2vec.df_inferred_trg.head(2)

In [None]:
# Sanity check: Pearson correlation of the first inferred target vector with itself.
pearsonr(doc2vec.df_inferred_trg['inf-doc2vec'][0], doc2vec.df_inferred_trg['inf-doc2vec'][0])

In [None]:
#[step 2]NonGroundTruth Computation
metric_l = [DistanceMetric.EUC,DistanceMetric.COS,DistanceMetric.MAN]# , SimilarityMetric.Pearson]
doc2vec.ComputeDistanceArtifacts( sampling=False, samples = 50, metric_list = metric_l )
doc2vec.df_nonground_link.head()

In [None]:
#[step 3]Saving Non-GroundTruth Links
doc2vec.SaveLinks()

In [None]:
# Loading Non-GroundTruth Links (replace the timestamp with the one assigned in the previous saving step)
df_nonglinks_doc2vec = LoadLinks(timestamp=1594653325.258415, params=doc2vec_params)
df_nonglinks_doc2vec.head()

In [None]:
#[step 4]GroundTruthMatching Testing
path_to_ground_truth = '/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt'
doc2vec.MatchWithGroundTruth(path_to_ground_truth)
doc2vec.df_ground_link

In [None]:
#[step 5]Saving GroundTruth Links
doc2vec.SaveLinks(grtruth = True)

In [None]:
# Loading GroundTruth Links (grtruth = True); replace the timestamp with the one assigned in the previous saving step
df_glinks_doc2vec = LoadLinks(timestamp=1594653350.19946, params=doc2vec_params, grtruth = True)
df_glinks_doc2vec.head()

# Approach Evaluation and Interpretation (doc2vec)

In [None]:
#supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, sim_list=[SimilarityMetric.EUC_sim])
#supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, sim_list=[SimilarityMetric.COS_sim])
# The constructor takes a list of similarities through `sim_list`; it has no `similarity` keyword.
supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, sim_list=[SimilarityMetric.MAN_sim])

In [None]:
supervisedEvalDoc2vec.y_test

In [None]:
supervisedEvalDoc2vec.y_score

In [None]:
supervisedEvalDoc2vec.Compute_precision_recall_gain()

In [None]:
supervisedEvalDoc2vec.Compute_avg_precision()

In [None]:
supervisedEvalDoc2vec.Compute_roc_curve()

## Compute distribution of similarities doc2vec

In [None]:
#Basic Statistics
# NOTE(review): this uses the raw ground-truth frame, whereas the word2vec section
# uses the NaN/inf-filtered `df_filtered` of the evaluator -- confirm this is intended.
filter_doc2vec = doc2vec.df_ground_link
filter_doc2vec.describe()

In [None]:
lag_plot(filter_doc2vec[[SimilarityMetric.EUC_sim]])

In [None]:
lag_plot(filter_doc2vec[DistanceMetric.EUC])

In [None]:
filter_doc2vec.hist(column=[SimilarityMetric.EUC_sim,DistanceMetric.EUC],color='k',bins=50,figsize=[10,5],alpha=0.5)

In [None]:
#Separate distance from similarity analysis here
errors = filter_doc2vec[[SimilarityMetric.EUC_sim,DistanceMetric.EUC]].std()
print(errors)
filter_doc2vec[[SimilarityMetric.EUC_sim,DistanceMetric.EUC]].plot.kde()

In [None]:
filter_doc2vec.hist(by='Linked?',column=SimilarityMetric.EUC_sim,figsize=[10, 5],bins=80)

In [None]:
filter_doc2vec.hist(by='Linked?',column=DistanceMetric.EUC,figsize=[10, 5],bins=80)

In [None]:
#separate the distance from the similarity plot
boxplot = filter_doc2vec.boxplot(by='Linked?',column=[SimilarityMetric.EUC_sim,DistanceMetric.EUC],figsize=[10, 5])

In [None]:
boxplot = filter_doc2vec.boxplot(by='Linked?',column=[SimilarityMetric.EUC_sim],figsize=[10, 5])

## Combining Doc2vec and Word2vec
Please check this post for further details: [link](https://stats.stackexchange.com/questions/217614/intepreting-doc2vec-cosine-similarity-between-doc-vectors-and-word-vectors)

In [None]:
! nbdev_build_docs #<-------- [Activate when stable]

In [None]:
! nbdev_build_lib

In [None]:
from nbdev.export import notebook2script
notebook2script()

In [None]:
#! pip install -e .