In [1]:
import os
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tensorflow import keras 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Make the project's local helper packages importable: they are looked up in
# `python-packages/`, two directory levels above the current working directory.
packages_parent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(os.path.join(packages_parent_dir, 'python-packages/'))
# Wildcard imports. Helpers used further down (lerch_df, rowIndex,
# convert_to_hamming, near_duplicates_for_runtime) are not defined in this
# notebook, so they presumably come from these modules - TODO confirm and
# replace with explicit imports.
from deep_hashing_models import *
from similarities import *
from lsh_search import *

# 1. Data Preparation

In [3]:
# The data directory lives three levels above the working directory.
# The trailing slash matters: later cells build file paths with plain string
# concatenation (data_repo + '<file>.csv').
data_parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
data_repo = os.path.join(data_parent_dir, 'data/')

In [4]:
# Full set of stack traces; the first CSV column is used as the index.
df = pd.read_csv(data_repo + 'stack_traces.csv', index_col = [0])
# Clean each raw trace in a single pass: drop carriage returns, then remove a
# literal 'a' that immediately follows a newline ('\na' -> '\n').
df['stackTraceCusto'] = df['stackTraceCusto'].apply(
    lambda trace : trace.replace('\r', '').replace('\na', '\n')
)
# Token-list view of each trace: newlines become spaces, surrounding whitespace
# is stripped and the text is split on single spaces.
df['listStackTrace'] = df['stackTraceCusto'].apply(
    lambda trace : trace.replace('\n', ' ').strip().split(' ')
)

In [5]:
# Reference set of frequent stack traces; the first CSV column is the index.
df_distinct_stacks = pd.read_csv(
    data_repo + 'frequent_stack_traces.csv',
    index_col = [0],
)
# Same token-list representation as the one built for `df`: newline -> space,
# strip, split on single spaces.
df_distinct_stacks['listStackTrace'] = df_distinct_stacks['stackTraceCusto'].apply(
    lambda trace : trace.replace('\n', ' ').strip().split(' ')
)

In [6]:
# Table read from 'similarity-measures-pairs.csv' (first column as index).
# It is not referenced again in the visible part of this notebook.
df_measures = pd.read_csv(
    os.path.join(data_repo, 'similarity-measures-pairs.csv'),
    index_col = [0],
)

In [7]:
# Row count of the frequent-stack-trace table, displayed as the cell output.
n_stacks = len(df_distinct_stacks.index)
n_stacks

1000

In [8]:
# Bag-of-frames vocabulary, fitted on the frequent stack traces only.
# A token is a run of at least two characters from [a-zA-Z0-9_.].
vectorizer_bag_of_frames = CountVectorizer(token_pattern = r"(?u)\b[a-zA-Z0-9_.]{2,}\b")
# Clean-up before tokenising: newlines become spaces and the characters
# '$', '/', '<' and '>' are removed, so the text on either side of them is
# joined into a single token.
s = df_distinct_stacks['stackTraceCusto'].apply(
    lambda x : x.replace('\n', ' ').replace('$', '').replace('/', '').replace('<', '').replace('>', '')
)
X_bag_of_frames = vectorizer_bag_of_frames.fit_transform(list(s)).toarray()
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on older installations.
try:
    feature_names = vectorizer_bag_of_frames.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer_bag_of_frames.get_feature_names()
# Dense count matrix: one row per frequent trace, one column per vocabulary token.
df_bag_of_frames = pd.DataFrame(data = X_bag_of_frames, columns = feature_names)

In [9]:
# Benchmark size: at most 100000 traces, capped at the number of rows actually
# loaded so that later cells iterating over np.arange(limit) stay in bounds.
limit = min(100000, len(df))
# Apply the same character clean-up that is applied to the frequent traces
# before the vectorizer is fitted (newline -> space; '$', '/', '<', '>'
# removed). Without it the same frame tokenises differently at transform time
# (e.g. 'Foo$Bar' or 'Foo.<init>') and misses its vocabulary entry.
test_traces = df['stackTraceCusto'][:limit].apply(
    lambda x : x.replace('\n', ' ').replace('$', '').replace('/', '').replace('<', '').replace('>', '')
)
# Sparse count matrix over the fitted vocabulary: one row per test trace.
data_test = vectorizer_bag_of_frames.transform(test_traces)
data_test.shape

(100000, 2249)

In [10]:
# Per-frame weight 1 + ln(N / c), where N is the number of frequent traces and
# c is the column sum of the count matrix. Computed with numpy (already
# imported) rather than `math`, which this notebook never imports explicitly.
# NOTE(review): c is the total number of occurrences of the frame, not the
# number of traces containing it, so c can exceed N and the weight can drop
# below 1 - confirm this is the intended IDF definition.
dict_idf_frames = (
    1 + np.log(df_bag_of_frames.shape[0] / df_bag_of_frames.sum(axis = 0))
).to_dict()

# 2. Load DeepLSH and baseline models

In [11]:
# Load the two saved Keras models, DeepLSH first and then the baseline, from
# the `Models/` directory (path relative to the working directory).
intermediate_model_deeplsh, intermediate_model_baseline = (
    keras.models.load_model(saved_model_path)
    for saved_model_path in ('Models/model-deep-lsh.model', 'Models/model-baseline.model')
)



# 3. Runtime comparison

## 3.1. Brute-force method

In [11]:
# Brute-force reference timing: for each of the first `limit` rows of `df`,
# call lerch_df with the row's token list, the token lists of all frequent
# traces, rowIndex(row) and the per-frame weights.
# -n1 -r1 runs the statement exactly once (it is very slow); -o makes %timeit
# return a TimeitResult, which is stored in `result`.
# lerch_df and rowIndex are not defined in this notebook - presumably provided
# by the wildcard imports; confirm their semantics there.
result = %timeit -n1 -r1 -o df[:limit].apply(lambda x : lerch_df(x['listStackTrace'], df_distinct_stacks['listStackTrace'], rowIndex(x), dict_idf_frames), axis = 1)

51min 58s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [12]:
# Display the TimeitResult captured by the brute-force timing cell.
result

<TimeitResult : 51min 58s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

## 3.2. DeepLSH

In [12]:
# Restore the previously serialised DeepLSH hash tables.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('Hash-Tables/hash_tables_deeplsh.pkl', mode = 'rb') as pickle_file:
    hash_tables_deeplsh = pickle.load(pickle_file)

In [13]:
%%time
# Timed DeepLSH pipeline: (1) run the loaded model on the vectorised test
# traces, (2) pass its output through convert_to_hamming, (3) call
# near_duplicates_for_runtime once per test index against the loaded tables.
# convert_to_hamming presumably binarises the model output into hash codes -
# TODO confirm in the helper module.
# The literals 4, 16, 16 are passed positionally; their meaning is not visible
# here - TODO confirm against near_duplicates_for_runtime's signature.
# The lookup results are discarded (bound to `_`): only the elapsed time matters.
# NOTE(review): binding `_` shadows IPython's "last output" variable.
prediction_deeplsh = intermediate_model_deeplsh.predict(data_test)
hash_vectors_deeplsh = convert_to_hamming(prediction_deeplsh)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(4, 16, 16, x, hash_vectors_deeplsh, hash_tables_deeplsh))

CPU times: user 23.5 s, sys: 1.92 s, total: 25.5 s
Wall time: 25.9 s


## 3.3. Baseline

In [None]:
# Restore the previously serialised baseline hash tables.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('Hash-Tables/hash_tables_baseline.pkl', mode = 'rb') as pickle_file:
    hash_tables_baseline = pickle.load(pickle_file)

In [None]:
%%time
# Timed baseline pipeline, same three steps as the DeepLSH cell: model
# prediction on the vectorised test traces, convert_to_hamming on the output,
# then one near_duplicates_for_runtime call per test index.
# The positional literals differ from the DeepLSH cell (8, 8, 16 instead of
# 4, 16, 16); their meaning is not visible here - TODO confirm against
# near_duplicates_for_runtime's signature.
# The lookup results are discarded (bound to `_`): only the elapsed time matters.
# NOTE(review): binding `_` shadows IPython's "last output" variable.
prediction_baseline = intermediate_model_baseline.predict(data_test)
hash_vectors_baseline = convert_to_hamming(prediction_baseline)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(8, 8, 16, x, hash_vectors_baseline, hash_tables_baseline))