In [1]:
import os
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tensorflow import keras 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [3]:
# Make the project's local helper packages importable: the `python-packages/`
# folder is looked up two directory levels above the current working directory.
_packages_parent = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(os.path.join(_packages_parent, 'python-packages/'))

# Star imports pull every public name of the helper modules into the notebook
# namespace. NOTE(review): helpers used later without a visible definition
# (cosine_similarity_df, rowIndex, convert_to_hamming,
# near_duplicates_for_runtime) presumably come from these modules - confirm
# which one provides each and consider importing them explicitly.
from deep_hashing_models import *
from similarities import *
from lsh_search import *

# 1. Data Preparation

In [4]:
# The data folder is looked up three directory levels above the current
# working directory; the trailing slash lets later cells append file names
# with plain string concatenation.
_two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
data_repo = os.path.join(os.path.dirname(_two_levels_up), 'data/')

In [5]:
# Load the full stack-trace corpus; the first CSV column becomes the index.
df = pd.read_csv(data_repo + 'stack_traces.csv', index_col = [0])


def _normalize_line_breaks(trace):
    """Remove carriage returns, then drop an 'a' that directly follows a newline."""
    # NOTE(review): the '\na' -> '\n' step presumably strips a leftover per-line
    # prefix; it would also eat the first letter of any frame that genuinely
    # starts with 'a' - confirm against the raw data.
    return trace.replace('\r','').replace('\na','\n')


def _split_into_frames(trace):
    """Turn newlines into spaces, trim both ends and split on single spaces."""
    return trace.replace('\n', ' ').strip().split(' ')


df['stackTraceCusto'] = df['stackTraceCusto'].apply(_normalize_line_breaks)
df['listStackTrace'] = df['stackTraceCusto'].apply(_split_into_frames)

In [6]:
# Load the frequent stack traces (first CSV column is the index) and add a
# column holding each trace as a list of space-separated tokens.
df_distinct_stacks = pd.read_csv(data_repo + 'frequent_stack_traces.csv', index_col = [0])
df_distinct_stacks['listStackTrace'] = [
    trace.replace('\n', ' ').strip().split(' ')
    for trace in df_distinct_stacks['stackTraceCusto']
]

In [7]:
# Load the similarity measures computed for pairs of stack traces; the first
# CSV column is used as the index. This frame is not referenced again in the
# visible part of the notebook.
df_measures = pd.read_csv(
    data_repo + 'similarity-measures-pairs.csv',
    index_col = [0],
)

In [8]:
# Number of rows in the frequent stack-trace frame, echoed as the cell output.
n_stacks = len(df_distinct_stacks)
n_stacks

1000

In [9]:
# Bag-of-frames (token count) representation of the frequent stack traces.
# Tokens are runs of at least two letters, digits, '_' or '.'.
vectorizer_bag_of_frames = CountVectorizer(token_pattern = r"(?u)\b[a-zA-Z0-9_.]{2,}\b")

# Clean every trace in a single pass: newlines become spaces and the
# characters '$', '/', '<' and '>' are removed. This is equivalent to the
# chain of single-character str.replace calls it supersedes.
_bag_cleanup_table = str.maketrans({'\n': ' ', '$': None, '/': None, '<': None, '>': None})
s = df_distinct_stacks['stackTraceCusto'].apply(lambda x : x.translate(_bag_cleanup_table))
X_bag_of_frames = vectorizer_bag_of_frames.fit_transform(list(s)).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer_bag_of_frames, 'get_feature_names_out'):
    _bag_of_frames_columns = vectorizer_bag_of_frames.get_feature_names_out()
else:
    _bag_of_frames_columns = vectorizer_bag_of_frames.get_feature_names()
df_bag_of_frames = pd.DataFrame(data = X_bag_of_frames, columns = _bag_of_frames_columns)

In [10]:
# Number of stack traces (taken from the top of `df`) used as runtime queries.
limit = 100000

# Apply the same cleaning that was used when the vectorizer was fitted
# (newline -> space; '$', '/', '<', '>' removed). Without it a frame such as
# 'Foo$Bar' is tokenised as two tokens here but was learnt as 'FooBar', so it
# would silently miss the fitted vocabulary.
_query_cleanup_table = str.maketrans({'\n': ' ', '$': None, '/': None, '<': None, '>': None})
_query_traces = df['stackTraceCusto'][:limit].apply(lambda x : x.translate(_query_cleanup_table))

data_test = vectorizer_bag_of_frames.transform(_query_traces)
data_test_df = pd.DataFrame.sparse.from_spmatrix(data_test)
data_test.shape

(100000, 2249)

In [11]:
# TF-IDF representation of the frequent stack traces, using the same token
# pattern as the bag-of-frames vectorizer (runs of at least two letters,
# digits, '_' or '.').
vectorizer_tf_idf = TfidfVectorizer(token_pattern = r"(?u)\b[a-zA-Z0-9_.]{2,}\b")

# Clean every trace in a single pass: newlines become spaces and the
# characters '$', '/', '<' and '>' are removed. This is equivalent to the
# chain of single-character str.replace calls it supersedes.
_tfidf_cleanup_table = str.maketrans({'\n': ' ', '$': None, '/': None, '<': None, '>': None})
s = df_distinct_stacks['stackTraceCusto'].apply(lambda x : x.translate(_tfidf_cleanup_table))
X_tf_idf = vectorizer_tf_idf.fit_transform(list(s)).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer_tf_idf, 'get_feature_names_out'):
    _tf_idf_columns = vectorizer_tf_idf.get_feature_names_out()
else:
    _tf_idf_columns = vectorizer_tf_idf.get_feature_names()
df_tf_idf = pd.DataFrame(data = X_tf_idf, columns = _tf_idf_columns)

In [12]:
# Apply the same cleaning that was used when the TF-IDF vectorizer was fitted
# (newline -> space; '$', '/', '<', '>' removed). Without it a frame such as
# 'Foo$Bar' is tokenised as two tokens here but was learnt as 'FooBar', so it
# would silently miss the fitted vocabulary.
_tfidf_query_cleanup_table = str.maketrans({'\n': ' ', '$': None, '/': None, '<': None, '>': None})
_tfidf_query_traces = df['stackTraceCusto'][:limit].apply(lambda x : x.translate(_tfidf_query_cleanup_table))

data_test_tfidf = vectorizer_tf_idf.transform(_tfidf_query_traces)
data_test_tfidf_df = pd.DataFrame.sparse.from_spmatrix(data_test_tfidf)
data_test_tfidf.shape

(100000, 2249)

# 2. Load DeepLSH and baseline models

In [13]:
# Restore the two saved Keras models (DeepLSH first, then the baseline) from
# the local `Models/` folder; both are used later to produce predictions that
# are converted into hash vectors.
intermediate_model_deeplsh, intermediate_model_baseline = (
    keras.models.load_model(model_path)
    for model_path in ('Models/model-deep-lsh.model', 'Models/model-baseline.model')
)



# 3. Runtime comparison

## 3.1. Brute force method 

In [18]:
%%time
sim_tfidf = data_test_df[:limit].apply(lambda x : cosine_similarity_df(x, df_tf_idf, rowIndex(x)), axis = 1)

CPU times: user 2h 21min 50s, sys: 6.15 s, total: 2h 21min 56s
Wall time: 2h 22min 32s


## 3.2. DeepLSH

In [14]:
# Restore the pre-built DeepLSH hash tables from disk.
# pickle.load can execute arbitrary code, so only load files you produced
# yourself or otherwise trust.
with open('Hash-Tables/hash_tables_deeplsh.pkl', 'rb') as deeplsh_pickle:
    hash_tables_deeplsh = pickle.load(deeplsh_pickle)

In [15]:
%%time
prediction_deeplsh = intermediate_model_deeplsh.predict(data_test_tfidf_df.values)
hash_vectors_deeplsh = convert_to_hamming(prediction_deeplsh)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(8, 8, 8, x, hash_vectors_deeplsh, hash_tables_deeplsh))

CPU times: user 15.3 s, sys: 1.57 s, total: 16.9 s
Wall time: 19 s


## 3.3. Baseline

In [16]:
# Restore the pre-built baseline hash tables from disk.
# pickle.load can execute arbitrary code, so only load files you produced
# yourself or otherwise trust.
with open('Hash-Tables/hash_tables_baseline.pkl', 'rb') as baseline_pickle:
    hash_tables_baseline = pickle.load(baseline_pickle)

In [17]:
%%time
prediction_baseline = intermediate_model_baseline.predict(data_test_tfidf_df.values)
hash_vectors_baseline = convert_to_hamming(prediction_baseline)
_ = pd.Series(np.arange(limit)).apply(lambda x : near_duplicates_for_runtime(8, 8, 8, x, hash_vectors_baseline, hash_tables_baseline))

CPU times: user 16.4 s, sys: 1.3 s, total: 17.7 s
Wall time: 17.5 s
