In [None]:
# default_exp experiment.mining.ir.unsupervised.w2v

# Experimenting Neural Unsupervised Approaches for Software Information Retrieval [w2v]

> Just Paper. Full Experimentation. This module is dedicated to experimenting with word2vec. Consider copying the entire notebook for a new and separate empirical evaluation.
> Implementing mutual information analysis
> Author: @danaderp April 2020
> Author: @danielrc Nov 2020

This copy is for Cisco purposes. It was adapted to process private GitHub data from Cisco.

In [None]:
from ds4se.mining.ir import *

In [None]:
from prg import prg

In [None]:
import ds4se as ds

In [None]:
import numpy as np

In [None]:
import logging
# Configure the root logger with a timestamped format; only ERROR (or more severe) records pass this level.
# NOTE(review): the captured output of step 1 further down shows INFO records, so it was
# likely produced under a different logging level — confirm before relying on it.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Artifacts Similarity with BasicSequenceVectorization

We test different similarities based on [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

## Experiments Set-up

In [None]:
# Relative root of the dataset checkout. Keep the trailing '/': later cells build
# sub-paths from it by plain string concatenation.
path_data = '../dvc-ds4se/' #dataset path

In [None]:
#Experiments 1.1.2 <<-- word2vec
# Both locations are built by appending to path_data, which already ends with '/'.
# Prefix of the BPE (sentencepiece) model files.
path_model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
# Trained word2vec model. No leading '/' on the suffix: the extra separator used to
# produce '../dvc-ds4se//models/...'.
path_to_trained_model = path_data+'models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model'
def sacp_params():
    """Return the experiment configuration for the 'sacp-python-common' system.

    The dictionary selects word2vec vectorization with the issue2src link
    type, with PR artifacts as source and PY artifacts as target. The model
    and output locations are read from the module-level variables
    `path_to_trained_model`, `path_model_prefix` and `path_data`; the corpus
    and mappings CSV locations are fixed absolute paths.

    Returns:
        dict: a freshly built parameter dictionary on every call.
    """
    # Corpus file description (presumably how the CSV is parsed: separator,
    # column names and preprocessing flavour — confirm against the loader).
    corpus_config = {
        "system_path": '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
        "sep": '~',
        "names": ['ids','bpe32k'],
        "prep": Preprocessing.bpe,
    }

    settings = {
        "vectorizationType": VectorizationType.word2vec,
        "linkType": LinkType.issue2src,
        "system": 'sacp-python-common',
        "path_to_trained_model": path_to_trained_model,
        "source_type": SoftwareArtifacts.PR.value,
        "target_type": SoftwareArtifacts.PY.value,
        "system_path_config": corpus_config,
        "path_mappings": "/tf/data/cisco/sacp_data/sacp-pr-mappings.csv",
        "saving_path": path_data + 'metrics/traceability/experiments1.1.x/',
        "names": ['Source','Target','Linked?'],
        "model_prefix": path_model_prefix,
    }
    return settings

In [None]:
# Build the experiment configuration once; later cells pass `parameters` to the
# vectorizer and to the link loaders. The bare name on the last line displays it.
parameters = sacp_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.issue2src: 3>,
 'system': 'sacp-python-common',
 'path_to_trained_model': '../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model',
 'source_type': 'pr',
 'target_type': 'py',
 'system_path_config': {'system_path': '/tf/data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv',
  'sep': '~',
  'names': ['ids', 'bpe32k'],
  'prep': <Preprocessing.bpe: 2>},
 'path_mappings': '/tf/data/cisco/sacp_data/sacp-pr-mappings.csv',
 'saving_path': '../dvc-ds4se/metrics/traceability/experiments1.1.x/',
 'names': ['Source', 'Target', 'Linked?'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_32k'}

# Artifacts Similarity with Word2Vec

In [None]:
#[step 1]Creating the Vectorization Class
# The standard `logging` module itself is handed over as the logger object.
word2vec = ds.mining.ir.Word2VecSeqVect( params = parameters, logging = logging )

2021-01-26 03:00:08,973 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-01-26 03:00:09,051 : INFO : built Dictionary(3010 unique tokens: ['28', '29', '3', '4)', '7']...) from 362 documents (total 171602 corpus positions)
2021-01-26 03:00:09,290 : INFO : Ignored vocab by BPE{'γ', '@', '```', '^', '`', '\r\n\r\n@', '\\', '\r\n\r\n', '\t', '\r\n', '\r\n\r\n\r\n'}
2021-01-26 03:00:09,292 : INFO : bpe preprocessing documents, dictionary, and vocab for the test corpus
2021-01-26 03:00:09,294 : INFO : loading Word2Vec object from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model
2021-01-26 03:00:10,536 : INFO : loading wv recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py-Java-SK-500-20E-32k-1593748814.350487].model.wv.* with mmap=None
2021-01-26 03:00:10,539 : INFO : setting ignored attribute vectors_norm to None
2021-01-26 03:00:10,542 : INFO : loading vocabulary recursively from ../dvc-ds4se//models/wv/bpe32k/[word2vec-Py

In [None]:
#[step 2]NonGroundTruth Computation
# Two distance metrics followed by two entropy metrics.
metric_list = [
    DistanceMetric.WMD,
    DistanceMetric.SCM,
    EntropyMetric.MSI_I,
    EntropyMetric.MI,
]
# Entropy-only alternative, kept for quick switching:
#metric_list = [EntropyMetric.MSI_I,EntropyMetric.MI]
word2vec.ComputeDistanceArtifacts(
    sampling=False,
    samples=100,
    metric_list=metric_list,
)
# Preview the first rows of the computed links.
word2vec.df_nonground_link.head()

In [None]:
# Preview the first rows of the non-ground-truth links held by the vectorizer.
word2vec.df_nonground_link.head()

In [None]:
# Spot-check a single value of the 'Target' column (entry 1).
word2vec.df_nonground_link['Target'][1]

In [None]:
#[step 3]Saving Non-GroundTruth Links
# NOTE(review): the loading cell that follows needs the timestamp assigned by this
# step — presumably SaveLinks reports it; confirm where it is shown.
word2vec.SaveLinks()

In [None]:
#Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
# The hard-coded timestamp pins one specific saved run; it goes stale whenever links are saved again.
df_nonglinks = ds.mining.ir.LoadLinks(timestamp=1610579170.341825, params=parameters, logging=logging)
df_nonglinks.head()

In [None]:
# Only relevant when resuming from saved links: puts the reloaded links on the vectorizer,
# presumably so the following steps use them instead of a fresh step-2 computation.
word2vec.df_nonground_link = df_nonglinks # Only to load links from file

In [None]:
#[step 4]GroundTruthMatching Testing
#TODO change the path for a param
# The ground-truth file location is taken from the experiment configuration.
path_to_ground_truth =  parameters['path_mappings']
word2vec.MatchWithGroundTruth(path_to_ground_truth, semeru_format=True)
# Display the resulting ground-truth links.
word2vec.df_ground_link

In [None]:
# Keep only the rows whose 'Linked?' value equals 1.
linked_mask = word2vec.df_ground_link['Linked?'] == 1
word2vec.df_ground_link[linked_mask]

In [None]:
# Spot-check a single value of the 'Source' column (entry 0).
word2vec.df_ground_link['Source'][0]

## 4.1 Only SACP

In [None]:
#[step 4.1]GroundTruthMatching Testing For CISCO Mappings
# No path is passed here; with from_mappings=True the mappings location presumably
# comes from parameters['path_mappings'] — TODO confirm.
word2vec.MatchWithGroundTruth(from_mappings=True)
word2vec.df_ground_link

In [None]:
#[optional]GroundTruth Direct Processing
# Relies on path_to_ground_truth assigned in the step-4 cell, so that cell must have run first.
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links # A tuple

In [None]:
#[step 5]Saving GroundTruth Links
# Same call as step 3, but with grtruth=True so the ground-truth links are the ones saved.
word2vec.SaveLinks(grtruth = True)

In [None]:
#Loading GroundTruth Links (change the timestamp with the assigned in the previous step)
# grtruth=True makes this the ground-truth counterpart of the earlier non-ground-truth load.
df_glinks = ds.mining.ir.LoadLinks(timestamp=1610579318.97542, params=parameters,grtruth = True, logging=logging)
df_glinks.head()

### Generating Documentation

In [None]:
! nbdev_build_docs #<-------- [Activate when stable]

In [None]:
! nbdev_build_lib

In [None]:
from nbdev.export import notebook2script
# Programmatic nbdev conversion of notebooks into library modules; presumably
# equivalent to the `nbdev_build_lib` shell command in the cell above — confirm.
notebook2script()

In [None]:
#! pip install -e .