In [1]:
import os
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tensorflow import keras 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Make the `python-packages/` directory located two levels above the current
# working directory importable, so the local modules below can be found.
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'python-packages/'))
# NOTE(review): wildcard imports hide where helper names used later in this
# notebook (index_frame, levenshtein_df, rowIndex, pad_sequences,
# convert_to_hamming, near_duplicates_for_runtime) come from — presumably one
# of these three modules; confirm and consider importing them explicitly.
from deep_hashing_models import *
from similarities import *
from lsh_search import *

# 1. Data Preparation

In [3]:
# The data directory is `data/` located three directory levels above the
# current working directory (trailing slash kept: paths are built by string
# concatenation further down).
working_dir = os.getcwd()
three_levels_up = os.path.dirname(os.path.dirname(os.path.dirname(working_dir)))
data_repo = os.path.join(three_levels_up, 'data/')

In [4]:
def _clean_trace(raw_trace):
    """Drop every '\\r', then turn each newline-followed-by-'a' into a bare newline."""
    return raw_trace.replace('\r', '').replace('\na', '\n')


def _split_trace(trace):
    """Flatten newlines to spaces, trim both ends and split on single spaces."""
    return trace.replace('\n', ' ').strip().split(' ')


# Load the stack traces (first CSV column is the index), clean the raw text
# and derive the token list for each trace.
df = pd.read_csv(data_repo + 'stack_traces.csv', index_col = [0])
df['stackTraceCusto'] = df['stackTraceCusto'].apply(_clean_trace)
df['listStackTrace'] = df['stackTraceCusto'].apply(_split_trace)

In [5]:
def _tokenize_distinct_trace(trace):
    """Flatten newlines to spaces, trim both ends and split on single spaces."""
    flattened = trace.replace('\n', ' ')
    return flattened.strip().split(' ')


# Load the frequent stack traces (first CSV column is the index) and derive
# the token list for each of them.
df_distinct_stacks = pd.read_csv(data_repo + 'frequent_stack_traces.csv', index_col = [0])
df_distinct_stacks['listStackTrace'] = df_distinct_stacks['stackTraceCusto'].apply(_tokenize_distinct_trace)

In [6]:
# Load the similarity-measures CSV from the data directory; its first column
# becomes the index. (Presumably pairwise similarity values — confirm against
# the file.)
measures_csv = data_repo + 'similarity-measures-pairs.csv'
df_measures = pd.read_csv(measures_csv, index_col = [0])

In [7]:
# Number of rows in df_distinct_stacks, displayed as the cell output.
n_stacks = len(df_distinct_stacks.index)
n_stacks

1000

In [8]:
# Upper bound on how many rows of `df` are used in the runtime comparison
# (every timed cell below slices `df[:limit]`).
limit = 100_000

In [9]:
# Collect every distinct token ("frame") occurring in the frequent stack traces.
corpus = df_distinct_stacks['listStackTrace'].tolist()
# sorted(): iteration order of a set of strings changes between interpreter
# sessions (hash randomisation), which previously made the position of the 1
# in each one-hot 'embedding' vector differ from one kernel run to the next.
frames = pd.Series(sorted({elt for l in corpus for elt in l}))

# Build the one-hot table once (it was previously computed twice). After the
# transpose each row corresponds to one frame; reset_index() moves the frame
# name into the first column, which is why the embedding below skips column 0.
one_hot = pd.get_dummies(frames).T.reset_index().rename(columns={'index': 'frame'})

df_frames = pd.DataFrame()
df_frames['frame'] = one_hot['frame']
df_frames['embedding'] = one_hot.apply(lambda x : x[1:].values, axis = 1)

# Map every stack trace of `df` through index_frame against the vocabulary.
# NOTE(review): presumably yields the frames' integer positions in df_frames
# (the result is later fed to pad_sequences) — confirm in index_frame.
df['rankFrames'] = df['listStackTrace'].apply(lambda x : index_frame(x, df_frames))

# 2. Load DeepLSH and baseline models

In [10]:
# Load the two saved Keras models (DeepLSH first, then the baseline) from the
# local `Models/` directory.
intermediate_model_deeplsh, intermediate_model_baseline = (
    keras.models.load_model(model_path)
    for model_path in ('Models/model-deep-lsh.model', 'Models/model-baseline.model')
)



# 3. Runtime comparison

## 3.1. Brute force method 

In [15]:
%%time
sim_levensh = df[:limit].apply(lambda x : levenshtein_df(x['listStackTrace'], df_distinct_stacks['listStackTrace'], rowIndex(x)), axis = 1)

CPU times: user 1h 21min 51s, sys: 6.74 s, total: 1h 21min 58s
Wall time: 1h 22min 10s


## 3.2. DeepLSH

In [11]:
# Restore the pickled DeepLSH hash tables from disk.
# pickle.load can execute arbitrary code: only load files you trust.
with open('Hash-Tables/hash_tables_deeplsh.pkl', mode = 'rb') as pickled_tables:
    hash_tables_deeplsh = pickle.load(pickled_tables)

In [12]:
%%time
prediction_deeplsh = intermediate_model_deeplsh.predict(pad_sequences(df[:limit]['rankFrames'], padding = 'post', truncating = 'post', maxlen = 29)) 
hash_vectors_deeplsh = convert_to_hamming(prediction_deeplsh)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(4, 16, 8, x, hash_vectors_deeplsh, hash_tables_deeplsh))

CPU times: user 28.9 s, sys: 2.63 s, total: 31.5 s
Wall time: 41.1 s


## 3.3. Baseline

In [13]:
# Restore the pickled baseline hash tables from disk.
# pickle.load can execute arbitrary code: only load files you trust.
with open('Hash-Tables/hash_tables_baseline.pkl', mode = 'rb') as pickled_tables:
    hash_tables_baseline = pickle.load(pickled_tables)

In [14]:
%%time
prediction_baseline = intermediate_model_baseline.predict(pad_sequences(df[:limit]['rankFrames'], padding = 'post', truncating = 'post', maxlen = 29)) 
hash_vectors_baseline = convert_to_hamming(prediction_baseline)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(4, 16, 8, x, hash_vectors_baseline, hash_tables_baseline))

CPU times: user 29.5 s, sys: 2.07 s, total: 31.6 s
Wall time: 42.5 s
