In [None]:
# default_exp example_mining.unsupervised.traceability.approach.cisco

# Neural Unsupervised Approaches for SE Traceability [approach]

> This module is dedicated to evaluating word2vec/doc2vec or any other neural unsupervised approach on traceability datasets. Consider copying the entire notebook for each new, separate empirical evaluation. 
>
> Author: @danaderp April 2020

This copy is for Cisco purposes. It was adapted to process private GitHub data from Cisco. 

In [None]:
# TODO
# http://www.ashukumar27.io/similarity_functions/
# https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html
# https://towardsdatascience.com/importance-of-distance-metrics-in-machine-learning-modelling-e51395ffe60d
# https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html

In [None]:
# hide
#! pip install seaborn
#! pip install sklearn
#!pip install pyprg
!pip install pyemd

Collecting pyemd
  Downloading pyemd-0.5.1.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 1.6 MB/s eta 0:00:01
Building wheels for collected packages: pyemd
  Building wheel for pyemd (setup.py) ... [?25ldone
[?25h  Created wheel for pyemd: filename=pyemd-0.5.1-cp38-cp38-macosx_10_9_x86_64.whl size=76416 sha256=8e91b72bf1e97b88088d650e1417344a58bbfb4969069a333ee82aec2d737d60
  Stored in directory: /Users/robertfrigerio/Library/Caches/pip/wheels/a2/a5/34/f960a47ca5c06b0e91b6f48117a79a66f53a879f8fac9529bf
Successfully built pyemd
Installing collected packages: pyemd
Successfully installed pyemd-0.5.1


In [None]:
# export
# Imports
import numpy as np
import gensim
import pandas as pd
from itertools import product
from random import sample
import functools
import os
from enum import Enum, unique, auto

In [None]:
# export
from datetime import datetime
import seaborn as sns

In [None]:
# export
import logging

logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)

In [None]:
# export
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from prg import prg
from pandas.plotting import scatter_matrix
from pandas.plotting import lag_plot
import math as m
import random as r
import collections
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'prg'

In [None]:
# export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora

In [None]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
# export
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
# export
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
import ds4se as ds

## Confusion Matrix

In [None]:
##TODO Move the confusion matrix to SupervisedVectorEvaluation
# Binarize the scores: values above 0.8 become 1, everything else becomes 0.
# NOTE(review): `supevisedEval` is not assigned anywhere in the notebook as shown —
# confirm where it is created before relying on Restart & Run All.
y_score_threshold = [
    0 if elem <= 0.8 else 1 for elem in supevisedEval.y_score
]  # Hardcoded 0.8 threshold

In [None]:
# TODO a Variation threshold analysis
tn, fp, fn, tp = confusion_matrix(supevisedEval.y_test, y_score_threshold).ravel()

In [None]:
(tn, fp, fn, tp)

## Precision-Recall-Gain
Based on the library here: [link](https://github.com/meeliskull/prg/tree/master/Python_package). 
The area under traditional PR curves can easily favour models with lower expected F1 score than others, and so the use of Precision-Recall-Gain curves will result in better model selection [(Flach & Kull, 2015)](http://people.cs.bris.ac.uk/~flach//PRGcurves/).
One might choose PRG if there is little interest in identifying false negatives [(from Blog)](https://medium.com/@alexabate/i-did-something-boring-so-you-dont-have-to-9140ca46c84d).

In [None]:
supevisedEval.Compute_precision_recall_gain()

## Compute the average precision score
Precision is a metric that quantifies the number of correct positive predictions made.

Recall is a metric that quantifies the number of correct positive predictions made out of all positive predictions that could have been made.

In [None]:
supevisedEval.Compute_avg_precision_same_plot()

## Compute ROC Curve
An ROC curve (or receiver operating characteristic curve) is a plot that summarizes the performance of a binary classification model on the positive class [(see Blog)](https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-imbalanced-classification/).

Use ROC when both classes detection is equally important — When we want to give equal weight to both classes prediction ability we should look at the ROC curve [link](https://towardsdatascience.com/what-metrics-should-we-use-on-imbalanced-data-set-precision-recall-roc-e2e79252aeba).

In [None]:
supevisedEval.Compute_roc_curve()

## Compute distribution of similarities word2vec

In [None]:
# Basic Statistics
filter_metrics = supevisedEval.df_filtered  # word2vec.df_ground_link
filter_metrics.describe()

In [None]:
filter_metrics.shape

In [None]:
scatter_matrix(filter_metrics, alpha=0.2, figsize=(12, 12), diagonal="kde")

Lag plots are used to check if a data set or time series is random. Random data should not exhibit any structure in the lag plot. Non-random structure implies that the underlying data are not random. The lag argument may be passed, and when lag=1 the plot is essentially data[:-1] vs. data[1:].

In [None]:
lag_plot(filter_metrics[[SimilarityMetric.WMD_sim]])

In [None]:
lag_plot(filter_metrics[DistanceMetric.WMD])

In [None]:
# calculate model precision-recall curve
sim = np.array(
    filter_metrics[SimilarityMetric.SCM_sim]
)  # SimilarityMetric.SCM_sim #SimilarityMetric.WMD_sim

In [None]:
filter_metrics.hist(
    column=[
        SimilarityMetric.WMD_sim,
        DistanceMetric.WMD,
        SimilarityMetric.SCM_sim,
        DistanceMetric.SCM,
    ],
    color="k",
    bins=50,
    figsize=[10, 5],
    alpha=0.5,
)

In [None]:
errors = filter_metrics[
    [
        SimilarityMetric.WMD_sim,
        DistanceMetric.WMD,
        SimilarityMetric.SCM_sim,
        DistanceMetric.SCM,
    ]
].std()
print(errors)
filter_metrics[
    [
        SimilarityMetric.WMD_sim,
        DistanceMetric.WMD,
        SimilarityMetric.SCM_sim,
        DistanceMetric.SCM,
    ]
].plot.kde()

In [None]:
filter_metrics[SimilarityMetric.WMD_sim].plot.kde()
filter_metrics[SimilarityMetric.WMD_sim].plot.hist(
    density=True
)  # Histogram will now be normalized

In [None]:
filter_metrics[SimilarityMetric.SCM_sim].plot.kde()
filter_metrics[SimilarityMetric.SCM_sim].plot.hist(
    density=True
)  # Histogram will now be normalized

In [None]:
filter_metrics[DistanceMetric.WMD].plot.kde()
filter_metrics[DistanceMetric.WMD].plot.hist(density=True)

In [None]:
filter_metrics[DistanceMetric.SCM].plot.kde()
filter_metrics[DistanceMetric.SCM].plot.hist(density=True)

In [None]:
filter_metrics.hist(
    by="Linked?", column=SimilarityMetric.WMD_sim, figsize=[10, 5], bins=80
)

In [None]:
filter_metrics.hist(
    by="Linked?", column=SimilarityMetric.SCM_sim, figsize=[10, 5], bins=80
)

In [None]:
filter_metrics.hist(by="Linked?", column=DistanceMetric.WMD, figsize=[10, 5], bins=80)

In [None]:
filter_metrics.hist(by="Linked?", column=DistanceMetric.SCM, figsize=[10, 5], bins=80)

In [None]:
boxplot = filter_metrics.boxplot(
    by="Linked?",
    column=[
        SimilarityMetric.WMD_sim,
        DistanceMetric.WMD,
        SimilarityMetric.SCM_sim,
        DistanceMetric.SCM,
    ],
    figsize=[7, 7],
)

## Entropy Plots

In [None]:
# Work on a NaN-free frame for the entropy plots. dropna() returns a new DataFrame,
# so `filter_metrics` itself is left untouched and this cell can be re-run safely.
filter_metrics_01 = filter_metrics.dropna()

In [None]:
filter_metrics_01[EntropyMetric.MSI_I]

In [None]:
def compute_spearman_corr(
    filter_metrics_01, columns=[EntropyMetric.MSI_I, SimilarityMetric.SCM_sim]
):
    """Return the Spearman rank correlation between two columns of a DataFrame.

    Parameters
    ----------
    filter_metrics_01 : pandas.DataFrame
        Frame holding the metric columns. It is only read, never modified.
    columns : list
        Column labels to correlate. The returned coefficient is the one between
        ``columns[0]`` and ``columns[1]``. The default list is never mutated here.

    Returns
    -------
    The entry of the Spearman correlation matrix for ``columns[0]`` vs.
    ``columns[1]``.
    """
    # Indexing with a list of labels already yields a new frame, so there is no
    # need to copy the whole DataFrame before selecting the columns.
    correlation = filter_metrics_01[columns].corr(method="spearman")
    # Take column columns[0] of the matrix and read its second row, i.e. the
    # coefficient of columns[0] against columns[1].
    return correlation[columns[0]].values[1]

In [None]:
# Minimum Shared Entropy and Word Distance
# NOTE(review): the y-axis plots SimilarityMetric.WMD_sim, but the title says "SCM"
# and the coefficient comes from compute_spearman_corr's default columns
# (EntropyMetric.MSI_I vs. SimilarityMetric.SCM_sim) — confirm which metric this
# figure is meant to report and align the axis, title and coefficient.
x1 = filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.WMD_sim,
    c="DarkBlue",
    s=1,
    title="SCM-Entropy Correlation {%.2f}" % compute_spearman_corr(filter_metrics_01),
)

In [None]:
# Scatter of EntropyMetric.MSI_X against a similarity column, titled with a
# Spearman coefficient.
# NOTE(review): the y-axis plots SimilarityMetric.WMD_sim, but the title says "SCM"
# and the coefficient is computed for MSI_X vs. SimilarityMetric.SCM_sim — confirm
# which metric this figure is meant to report.
x1 = filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_X,
    y=SimilarityMetric.WMD_sim,
    c="DarkBlue",
    s=1,
    title="SCM-Extropy Correlation {%.2f}"
    % compute_spearman_corr(
        filter_metrics_01, [EntropyMetric.MSI_X, SimilarityMetric.SCM_sim]
    ),
)

In [None]:
filter_metrics_linked = filter_metrics_01[filter_metrics_01["Linked?"] == 1].copy()
filter_metrics_nonlinked = filter_metrics_01[filter_metrics_01["Linked?"] == 0].copy()

In [None]:
# Linked pairs only: SCM similarity against EntropyMetric.MSI_I, titled with the
# Spearman coefficient computed on the same subset. Reuses `filter_metrics_linked`
# (built in the previous cell) so the plotted rows and the coefficient come from
# one frame, mirroring the non-linked plot below.
x2 = filter_metrics_linked.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.SCM_sim,
    c="Red",
    s=1,
    title="Linked SCM-Entropy Correlation {%.2f}"
    % compute_spearman_corr(filter_metrics_linked),
)

In [None]:
x2_ = filter_metrics_nonlinked.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=SimilarityMetric.SCM_sim,
    c="DarkBlue",
    s=1,
    title="non-Linked SCM-Entropy Correlation {%.2f}"
    % compute_spearman_corr(filter_metrics_nonlinked),
)

In [None]:
# Information levels vs semantics
fig, ax = plt.subplots()
filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.SCM_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions SCM",
    colormap="viridis",
    ax=ax,
    s=1,
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
# Separated by ground truth Links!
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01["Linked?"] == 1].plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.SCM_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions SCM Linked",
    colormap="viridis",
    ax=ax,
    s=1,
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
# Separated by ground truth NonLinked!
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01["Linked?"] == 0].plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.SCM_sim,
    # figsize = [6, 5],
    title="Information-Semantic Interactions SCM non-Linked",
    colormap="viridis",
    ax=ax,
    s=1,
)

ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
ax7 = filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_X,
    y=EntropyMetric.MSI_I,
    c=SimilarityMetric.SCM_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions SCM",
    colormap="viridis",
    s=1,
)
ax7.set_xlabel("Minimum Shared Extropy")
ax7.set_ylabel("Minimum Shared Entropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01.plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.WMD_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions WMD",
    colormap="viridis",
    ax=ax,
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01["Linked?"] == 1].plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.WMD_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions WMD Linked",
    colormap="viridis",
    ax=ax,
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
fig, ax = plt.subplots()
filter_metrics_01[filter_metrics_01["Linked?"] == 0].plot.scatter(
    x=EntropyMetric.MSI_I,
    y=EntropyMetric.MSI_X,
    c=SimilarityMetric.WMD_sim,
    # figsize = [12, 6],
    title="Information-Semantic Interactions WMD non-Linked",
    colormap="viridis",
    ax=ax,
)
ax.set_xlabel("Minimum Shared Entropy")
ax.set_ylabel("Minimum Shared Extropy")

In [None]:
filter_metrics.head()

# Artifacts Similarity with Doc2Vec

Try to reproduce the same empirical evaluation as in this paper: [link](https://arxiv.org/pdf/1507.07998.pdf). Pay attention to:
- Accuracy vs. Dimensionality (we can replace accuracy for false positive rate or true positive rate)
- Visualize paragraph vectors using t-sne
- Computing Cosine Distance and Similarity. More about similarity [link](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html)

In [None]:
# path_to_trained_model": 'test_data/models/pv/conv/[doc2vec-Py-Java-PVDBOW-500-20E-1592609630.689167].model',
# "path_to_trained_model": 'test_data/models/pv/conv/[doc2vec-Py-Java-Wiki-PVDBOW-500-20E[15]-1592941134.367976].model',
path_to_trained_model = (
    "test_data/models/[doc2vec-Py-Java-PVDBOW-500-20E-8k-1594572857.17191].model"
)

In [None]:
def doc2vec_params():
    """Assemble the parameter dictionary used to build the doc2vec vectorizer.

    Returns a new dict on every call. ``path_to_trained_model`` is read from the
    notebook-level variable of the same name at call time.
    """
    # What to run and with which trained model.
    settings = {
        "vectorizationType": VectorizationType.doc2vec,
        "linkType": LinkType.req2tc,
        "system": "libest",
        "path_to_trained_model": path_to_trained_model,
    }
    # Where the corpora live and where results are written.
    settings.update(
        {
            "source_path": "/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv",
            "target_path": "/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv",
            "system_path": "/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv",
            "saving_path": "test_data/",
            "names": ["Source", "Target", "Linked?"],
        }
    )
    return settings

In [None]:
# Resolve the parameter dict once. This cell rebinds the name `doc2vec_params` from
# the factory function to its result, so running it a second time used to fail with
# "'dict' object is not callable". The callable() guard makes the cell re-runnable.
if callable(doc2vec_params):
    doc2vec_params = doc2vec_params()
doc2vec_params

In [None]:
# Export
class Doc2VecSeqVect(BasicSequenceVectorization):
    """Sequence vectorization backed by a pre-trained gensim Doc2Vec model.

    Typical use: construct with the parameter dict, call ``InferDoc2Vec`` to infer
    one vector per source/target artifact, then score links through
    ``computeDistanceMetric``. ``distance`` reads ``df_inferred_src`` and
    ``df_inferred_trg``, which stay ``None`` until ``InferDoc2Vec`` has run.
    """

    def __init__(self, params):
        """Load the Doc2Vec model named by ``params["path_to_trained_model"]``.

        ``params`` is also forwarded unchanged to the base-class constructor.
        """
        super().__init__(params)
        self.new_model = gensim.models.Doc2Vec.load(params["path_to_trained_model"])
        self.new_model.init_sims(
            replace=True
        )  # Normalizes the vectors in the word2vec class.
        # Filled in by InferDoc2Vec().
        self.df_inferred_src = None
        self.df_inferred_trg = None

        # Metric -> scoring callable. The callables are not defined in this class;
        # presumably they are inherited from BasicSequenceVectorization — verify.
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.EUC: self.euclidean_scipy,
            DistanceMetric.MAN: self.manhattan_scipy,
        }

    def _inferred_vectors(self, df_inferred, artifact_id):
        """Return the inferred vectors of every row whose id contains ``artifact_id``."""
        # regex=False: the id is matched as literal text, so characters such as
        # '.', '[' or '+' in an artifact id are not interpreted as regex syntax.
        matches = df_inferred["ids"].str.contains(artifact_id, regex=False)
        return list(df_inferred[matches]["inf-doc2vec"])

    def _infer_frame(self, df_corpus, steps):
        """Return a copy of ``df_corpus`` with an added "inf-doc2vec" vector column."""
        df_inferred = df_corpus.copy()
        df_inferred["inf-doc2vec"] = [
            self.new_model.infer_vector(artifact.split(), steps=steps)
            for artifact in df_inferred["text"].values
        ]
        return df_inferred

    def distance(self, metric_list, link):
        """Apply every metric in ``metric_list`` to one (source id, target id) link.

        ``link[0]`` is looked up in the inferred source frame and ``link[1]`` in the
        inferred target frame. The per-metric results are concatenated with ``+``
        in the order of ``metric_list``. ``InferDoc2Vec`` must have been called.
        """
        source_vectors = self._inferred_vectors(self.df_inferred_src, link[0])
        target_vectors = self._inferred_vectors(self.df_inferred_trg, link[1])

        dist = [
            self.dict_distance_dispatcher[metric](source_vectors, target_vectors)
            for metric in metric_list
        ]
        # Lazy %-formatting: the message is only built if INFO logging is enabled.
        logging.info("Computed distances or similarities %s%s", link, dist)
        return functools.reduce(lambda a, b: a + b, dist)  # Always return a list

    def computeDistanceMetric(self, links, metric_list):
        """Score every link with every metric in ``metric_list``.

        Returns a pair ``(rows, labels)``: each row is
        ``[link[0], link[1]]`` followed by the values from ``distance``; ``labels``
        is the concatenation of ``self.dict_labels[metric]`` for each metric
        (``dict_labels`` is not defined in this class — presumably inherited).
        """
        # Resolve the labels first so an unknown metric fails before any scoring.
        metric_labels = [self.dict_labels[metric] for metric in metric_list]
        distSim = [
            [link[0], link[1]] + self.distance(metric_list, link) for link in links
        ]

        return distSim, functools.reduce(lambda a, b: a + b, metric_labels)

    def InferDoc2Vec(self, steps=200):
        """Infer a Doc2Vec vector for every source and target artifact.

        Stores copies of ``self.df_source`` and ``self.df_target`` with an extra
        "inf-doc2vec" column in ``df_inferred_src`` / ``df_inferred_trg``. Each
        artifact's "text" is split on whitespace and passed to ``infer_vector``
        with the given ``steps``.
        """
        self.df_inferred_src = self._infer_frame(self.df_source, steps)
        self.df_inferred_trg = self._infer_frame(self.df_target, steps)

        logging.info("Infer Doc2Vec on Source and Target Complete")

### Testing Doc2Vec SequenceVectorization

In [None]:
doc2vec = Doc2VecSeqVect(params=doc2vec_params)

In [None]:
# [step1]Apply Doc2Vec Inference
doc2vec.InferDoc2Vec(steps=200)

In [None]:
doc2vec.df_inferred_src.head(2)

In [None]:
# test_inferDoc2Vec_trg = inferDoc2Vec(df_target)
# test_inferDoc2Vec_trg.head()
doc2vec.df_inferred_trg.head(2)

In [None]:
# Pearson correlation of the first inferred target vector with itself.
# NOTE(review): both arguments are the same vector — confirm whether this is a
# deliberate sanity check or whether a source/target pair was intended.
pearsonr(
    doc2vec.df_inferred_trg["inf-doc2vec"][0], doc2vec.df_inferred_trg["inf-doc2vec"][0]
)

In [None]:
# [step 2]NonGroundTruth Computation
metric_l = [
    DistanceMetric.EUC,
    DistanceMetric.COS,
    DistanceMetric.MAN,
]  # , SimilarityMetric.Pearson]
doc2vec.ComputeDistanceArtifacts(sampling=False, samples=50, metric_list=metric_l)
doc2vec.df_nonground_link.head()

In [None]:
# [step 3]Saving Non-GroundTruth Links
doc2vec.SaveLinks()

In [None]:
# Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
df_nonglinks_doc2vec = LoadLinks(timestamp=1594653325.258415, params=doc2vec_params)
df_nonglinks_doc2vec.head()

In [None]:
# [step 4]GroundTruthMatching Testing
path_to_ground_truth = "/tf/main/benchmarking/traceability/testbeds/groundtruth/english/[libest-ground-req-to-tc].txt"
doc2vec.MatchWithGroundTruth(path_to_ground_truth)
doc2vec.df_ground_link

In [None]:
# [step 5]Saving GroundTruth Links
doc2vec.SaveLinks(grtruth=True)

In [None]:
# Loading GroundTruth Links (grtruth=True); change the timestamp to the one
# assigned when the ground-truth links were saved in the previous step.
df_glinks_doc2vec = LoadLinks(
    timestamp=1594653350.19946, params=doc2vec_params, grtruth=True
)
df_glinks_doc2vec.head()

# Approach Evaluation and Interpretation (doc2vec)

In [None]:
# supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, similarity=SimilarityMetric.EUC_sim)
# supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, similarity=SimilarityMetric.COS_sim)
supervisedEvalDoc2vec = SupervisedVectorEvaluation(
    doc2vec, similarity=SimilarityMetric.MAN_sim
)

In [None]:
supervisedEvalDoc2vec.y_test

In [None]:
supervisedEvalDoc2vec.y_score

In [None]:
supervisedEvalDoc2vec.Compute_precision_recall_gain()

In [None]:
supervisedEvalDoc2vec.Compute_avg_precision()

In [None]:
supervisedEvalDoc2vec.Compute_roc_curve()

## Compute distribution of similarities doc2vec

In [None]:
# Basic Statistics
filter_doc2vec = doc2vec.df_ground_link
filter_doc2vec.describe()

In [None]:
lag_plot(filter_doc2vec[[SimilarityMetric.EUC_sim]])

In [None]:
lag_plot(filter_doc2vec[DistanceMetric.EUC])

In [None]:
filter_doc2vec.hist(
    column=[SimilarityMetric.EUC_sim, DistanceMetric.EUC],
    color="k",
    bins=50,
    figsize=[10, 5],
    alpha=0.5,
)

In [None]:
# Separate distance from similarity analysis here
errors = filter_doc2vec[[SimilarityMetric.EUC_sim, DistanceMetric.EUC]].std()
print(errors)
filter_doc2vec[[SimilarityMetric.EUC_sim, DistanceMetric.EUC]].plot.kde()

In [None]:
filter_doc2vec.hist(
    by="Linked?", column=SimilarityMetric.EUC_sim, figsize=[10, 5], bins=80
)

In [None]:
filter_doc2vec.hist(by="Linked?", column=DistanceMetric.EUC, figsize=[10, 5], bins=80)

In [None]:
# separate the distance from the similarity plot
boxplot = filter_doc2vec.boxplot(
    by="Linked?", column=[SimilarityMetric.EUC_sim, DistanceMetric.EUC], figsize=[10, 5]
)

In [None]:
boxplot = filter_doc2vec.boxplot(
    by="Linked?", column=[SimilarityMetric.EUC_sim], figsize=[10, 5]
)

## Combining Doc2vec and Word2vec
Please check this post for further details [link](https://stats.stackexchange.com/questions/217614/intepreting-doc2vec-cosine-similarity-between-doc-vectors-and-word-vectors)

In [None]:
! nbdev_build_docs #<-------- [Activate when stable]

In [None]:
! nbdev_build_lib

In [None]:
from nbdev.export import notebook2script

notebook2script()

In [None]:
#! pip install -e .

In [None]:
from ds4se.mgmnt.prep.conv import *