In [1]:
import pandas
import csv 
import numpy as np
from datasketch import MinHash, MinHashLSH

In [2]:
# Load the study CSV without treating any row as a header, then keep the
# cell values as a plain numpy array.
csv_reader = pandas.read_csv(
    'data/string_study.csv',
    delimiter=',',
    header=None,
)
final_arr = np.array(csv_reader.values)

In [3]:
# Peek at rows 1-3 of the loaded array. Index 0 is skipped -- presumably it
# holds the CSV header row, since the file is read with header=None (confirm).
final_arr[1:4]

array([['1524387149611', 'attach-volume', 'wig_project_003',
        "2018-04-22 10:52:29.611 17979 ERROR rallytester.rallytester [-] [nova attach-volume wig_project_003] Task failed:  Rally tired waiting 1440.00 seconds for Server rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161 to become ('ACTIVE') current status BUILD",
        '2018-04-22 08:00:00', "waiting for Server to become ('ACTIVE')",
        "Rally tired waiting 1440.00 seconds for Server rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161 to become ('ACTIVE') current status BUILD"],
       ['1524387185889', 'boot-from-volume-linux', 'gva_shared_016',
        '2018-04-22 10:53:05.889 25667 ERROR rallytester.rallytester [-] [nova boot-from-volume-linux gva_shared_016] Task failed:  Quota exceeded for cores, instances: Requested 1, 1, but already used 10, 10 of 10, 10 cores, instances (HTTP 403) (Request-ID: req-23b5fe81-0bab-4f42-84e9-97e3e6847c04)',
        '2018-04-22 08:00:00',
        'Quota exceeded for cores, inst

In [4]:
# Extract the raw message of every error-log row: it is the last column.
raw_msgs = final_arr[:, -1]
# Only a cell's last expression is displayed, so the sample message at
# index 1 is kept as the single trailing expression.
raw_msgs[1:2]

array(["Rally tired waiting 1440.00 seconds for Server rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161 to become ('ACTIVE') current status BUILD"],
      dtype=object)

In [5]:
# Turn every raw message into the set of its space-separated tokens
message_set = [set(text.split(' ')) for text in raw_msgs]
message_set[1:2]

[{"('ACTIVE')",
  '1440.00',
  'BUILD',
  'Rally',
  'Server',
  'become',
  'current',
  'for',
  'rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161',
  'seconds',
  'status',
  'tired',
  'to',
  'waiting'}]

In [6]:
# One empty MinHash signature (128 permutations) per message, in message order
m = [MinHash(num_perm=128) for _words in message_set]

In [7]:
# Feed each message's tokens into the MinHash signature at the same position
for signature, words in zip(m, message_set):
    for token in words:
        signature.update(token.encode('utf-8'))

In [19]:
# Create an LSH index 
%time
lsh = MinHashLSH(threshold=.01, num_perm=128)
for i in range (1, len(m)):
    string = "m" + str(i)
    lsh.insert(string, m[i])

CPU times: user 30 µs, sys: 1 µs, total: 31 µs
Wall time: 33.1 µs


In [20]:
# Find all indexed messages similar to the message at index 1, using the
# similarity threshold the LSH index was built with
result = lsh.query(m[1])
len(result)

20610

In [21]:
# Persist the matching keys, one per line
with open('min_hash_lsh_128_pointzero1.txt', 'w') as out_file:
    out_file.writelines("%s\n" % key for key in result)

In [22]:
# Sanity check: the query signature m[1] was itself inserted under key "m1",
# so it should appear among its own matches.
"m1" in result

True

In [23]:
# Check whether the message inserted under key "m5217" was returned as a match.
"m5217" in result

True

In [24]:
# Compare the query message with another message from the data set.
# Single-argument print(...) produces the same output on Python 2 and 3.
# NOTE(review): the preceding cell tests key "m5217", yet index 14037 is
# printed here -- confirm which message was intended.
print(raw_msgs[1])
print(raw_msgs[14037])


Rally tired waiting 1440.00 seconds for Server rally-9db3-aPln:176b6fd3-8a2d-4b60-a5bd-73575c4ad161 to become ('ACTIVE') current status BUILD
Rally tired waiting for Volume rally-a7d8-aHsP:889aec9f-0088-4cff-b6e2-a43e1715534a to become ('AVAILABLE') current status CREATING
