## In this section we implement MinHash LSH using shingles instead of indiviadual words 

In [11]:
import numpy as np
import csv
import pandas
from datasketch import MinHash, MinHashLSH

In [12]:
# Shingle generators 
# Arguments : Message string, shingle size {in words}
# Returns : All shingles formed with k words
def shingle_generator(message, k):
    message = message.split(' ')
    item_set = []
    for i in range(0, len(message)-k):
        shingle = ""
        for j in range(0, k):
            shingle = shingle + message[i+j] + " "
        item_set.append(shingle)
    return item_set

In [31]:
# dump the CSV file into a numpy array. 
csv_reader = pandas.read_csv('data/string_study.csv', delimiter=',', header=None)
final_arr = csv_reader.values
final_arr = np.array(final_arr)

In [32]:
# extract all the raw_messsages from the error logs
raw_msgs = final_arr[0:, -1]
raw_msgs.shape
# raw_msgs[1:2] 

(34445,)

### The shingles test case is shown on the "shingle.ipynb" notebook

In [75]:
# Generate the set of words of all raw messages
message_set = []
%time

for item in raw_msgs:
#     set_of_words = set(item.split(' '))
    # Shingle size is set to 3 words
    set_of_words = set(shingle_generator(item, 2))
    message_set.append(set_of_words)
message_set[1:2]
len(message_set)

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 17.9 µs


34445

In [84]:
# Generate MinHash instances for all messages
m = []
for i in range (0, len(message_set)):
    m_temp = MinHash(num_perm=128)
    m.append(m_temp)
len(m)

34445

In [85]:
i = 0
for message in message_set:
    for word in message:
        m[i].update(word.encode('utf-8'))
    i = i + 1

In [93]:
# Create an LSH index 
# %time
lsh = MinHashLSH(threshold=0.01, num_perm=128)
count = 0
for i in range (0, len(m)):
    string = "m" + str(i)
    lsh.insert(string, m[i])
    count += 1
print count

34445


In [94]:
# Find all the messages similar to the fist message with a specified threshold
%time
result = lsh.query(m[1])
print raw_msgs[1]
len(result)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs
Rally tired waiting 1440.00 seconds for Server rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161 to become ('ACTIVE') current status BUILD


7717

In [95]:
%time
len(result)

CPU times: user 13 µs, sys: 0 ns, total: 13 µs
Wall time: 23.8 µs


7717

In [100]:
with open('min_hash_lsh_shingles_128_pointzero1.txt', 'w') as f:
    for item in result:
        f.write("%s\n" % item)

In [101]:
"m0" in result

False

In [102]:
"m1" in result

True

In [103]:
"m14037" in result

True