In [117]:
import pandas as pd
from Reader import Reader
from Parser import Parser
from logmining import LogCountVectorizer, LfdfhfTransformer, LogScorer

reader = Reader()
parser = Parser()

# Gather every log matched by the glob pattern into one list.
logs = reader.read_dir('../log_tfidf/*')

# Parse logs into Date-Hostname-Message
parsed_logs = parser.parse_all(logs)

# Log Count Vectorizer: turn the parsed logs into a count frame.
lcv = LogCountVectorizer()
df_count = lcv.fit_transform(parsed_logs)

# lfdfhf Transformer: derive the three frames (lf, df, hf) from the counts.
lfdfhf = LfdfhfTransformer()
df_lf, df_df, df_hf = lfdfhf.lfdfhf_transformer(df_count)

# Log Scorer: melt the three frames, merge them, then run the lfidf and hfidf
# scoring steps. Later cells sort df_merged by "lfidf" and "hfidf", so both
# steps must have run before this cell ends.
ls = LogScorer()
df_lf_melt, df_df_melt, df_hf_melt = ls.melt_lfdfhf(df_lf, df_df, df_hf)
df_merged = ls.merge_df_lf_hf_df(df_lf_melt, df_df_melt, df_hf_melt)
df_merged = ls.compute_lfidf(df_merged)
df_merged = ls.compute_hfidf(df_merged)

../log_tfidf/qvidbkartn05.20170428.log
../log_tfidf/qvidbkartn04.20170407.log
../log_tfidf/qvidbkartn03.20170407.log
../log_tfidf/qvidbkartn03.20170412.log
../log_tfidf/qvidbkartn03.20170406.log
../log_tfidf/qvidbkartn04.20170412.log
../log_tfidf/qvidbkartn04.20170406.log
../log_tfidf/qvidbkartn05.20170429.log
../log_tfidf/qvidbkartn04.20170404.log
../log_tfidf/qvidbkartn03.20170405.log
../log_tfidf/qvidbkartn04.20170405.log
../log_tfidf/qvidbkartn05.20170412.log
../log_tfidf/qvidbkartn04.20170401.log
../log_tfidf/qvidbkartn04.20170429.log
../log_tfidf/qvidbkartn03.20170401.log
../log_tfidf/qvidbkartn03.20170429.log
../log_tfidf/qvidbkartn03.20170428.log
../log_tfidf/qvidbkartn04.20170428.log
../log_tfidf/qvidbkartn03.20170402.log
../log_tfidf/qvidbkartn03.20170403.log
../log_tfidf/qvidbkartn02.20170404.log
../log_tfidf/qvidbkarro01.20170426.log
../log_tfidf/qvidblogrl01.20170422.log
../log_tfidf/qvidbkarro02.20170412.log
../log_tfidf/qvidblogrl01.20170423.log
../log_tfidf/qvidbmqtst01

In [118]:
# Two rankings of the same scored frame, highest score first:
# one ordered by the "lfidf" column, the other by the "hfidf" column.
df_lfidf = df_merged.sort_values(by="lfidf", ascending=False)
df_hfidf = df_merged.sort_values(by="hfidf", ascending=False)

In [119]:
# How many of the highest-ranked rows are kept from each ranking.
N_TOP_MESSAGES = 1000

# Take the "Message" index level of each ranked frame and keep only the first
# N_TOP_MESSAGES entries. Slicing before converting to a list avoids building
# a Python list of the whole index just to discard most of it.
anomalous_top_lfidf = list(
    df_lfidf.index.get_level_values("Message")[:N_TOP_MESSAGES])
anomalous_top_hfidf = list(
    df_hfidf.index.get_level_values("Message")[:N_TOP_MESSAGES])

In [125]:
from difflib import SequenceMatcher
import itertools
import multiprocessing
import numpy as np

def similar(a, b):
    """Return difflib's similarity ratio between the sequences `a` and `b`.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def log_similarity(pair_s):
    """Score how alike two log messages are, token position by token position.

    Parameters
    ----------
    pair_s : sequence of two str
        ``pair_s[0]`` and ``pair_s[1]`` are the messages to compare. Taking a
        single pair argument lets the function be passed to ``pool.map``.

    Returns
    -------
    numpy floating scalar
        Mean of ``similar(w1, w2)`` over the tokens that share a position
        after splitting each message on single spaces.

    Notes
    -----
    Only the first ``min(len(tokens_a), len(tokens_b))`` positions are
    compared, so extra trailing tokens of the longer message do not affect
    the score.
    """
    tokens_a = pair_s[0].split(' ')
    tokens_b = pair_s[1].split(' ')

    # zip() stops at the shorter token list, which is the same truncation the
    # explicit min-length index loop performed.
    scores = [similar(w1, w2) for w1, w2 in zip(tokens_a, tokens_b)]
    return np.mean(scores)

In [124]:
# Every ordered pair of selected messages (self-pairs included), materialised
# as a list. The Cartesian product of a list with itself is written here with
# `repeat=2`; the pair order is unchanged.
crossed_anomalous_top_lfidf = list(
    itertools.product(anomalous_top_lfidf, repeat=2))
crossed_anomalous_top_hfidf = list(
    itertools.product(anomalous_top_hfidf, repeat=2))

In [126]:
# Score every message pair in parallel. The context manager tears the worker
# processes down even if one of the map calls raises, so a failed run no
# longer leaves a live pool behind. pool.map blocks until all results are
# collected, so both vectors are complete before the pool is released.
with multiprocessing.Pool() as pool:
    similarity_vect_lfidf = np.array(
        pool.map(log_similarity, crossed_anomalous_top_lfidf))
    similarity_vect_hfidf = np.array(
        pool.map(log_similarity, crossed_anomalous_top_hfidf))

ValueError: cannot reshape array of size 1000000 into shape (1000000,1000000)

In [127]:
# Fold each flat vector of pair scores back into a square matrix with one
# row and one column per selected message. Each matrix is sized from its own
# message list, so the hfidf matrix no longer depends on the lfidf list.
similarity_matrix_lfidf = similarity_vect_lfidf.reshape(
    len(anomalous_top_lfidf), len(anomalous_top_lfidf))
similarity_matrix_hfidf = similarity_vect_hfidf.reshape(
    len(anomalous_top_hfidf), len(anomalous_top_hfidf))

In [130]:
# Map each lfidf-selected message to the column-wise mean of
# (1 - similarity), i.e. its average dissimilarity to the selected messages.
x = dict(zip(anomalous_top_lfidf,np.mean(1-similarity_matrix_lfidf,axis=0)))
# Order messages from most to least dissimilar. A plain key function is used
# instead of operator.itemgetter because `operator` is not imported in the
# visible cells, which made this cell raise NameError on a fresh kernel.
sorted_x = dict(sorted(x.items(), key=lambda item: item[1], reverse=True))
# NOTE(review): [1:] drops the top-ranked message; the reason is not evident
# from this notebook — confirm it is intentional.
list(sorted_x.keys())[1:]

['0.0.0.0 0613 03 spike_detect +0.276669 s',
 '0.0.0.0 0613 03 spike_detect +0.260255 s',
 '0.0.0.0 0613 03 spike_detect +4.687027 s',
 '0.0.0.0 0613 03 spike_detect +0.270265 s',
 '0.0.0.0 0613 03 spike_detect +0.285843 s',
 '0.0.0.0 0613 03 spike_detect +0.725244 s',
 '0.0.0.0 0613 03 spike_detect +0.224266 s',
 '0.0.0.0 0613 03 spike_detect +0.234536 s',
 '0.0.0.0 0613 03 spike_detect +0.244262 s',
 '0.0.0.0 0613 03 spike_detect -0.914971 s',
 '0.0.0.0 0613 03 spike_detect +0.477189 s',
 '0.0.0.0 0613 03 spike_detect +0.920149 s',
 '0.0.0.0 0615 05 clock_sync',
 '0.0.0.0 0628 08 no_sys_peer',
 '0.0.0.0 0618 08 no_sys_peer',
 '0.0.0.0 c615 05 clock_sync',
 'mark --',
 '0.0.0.0 0614 04 freq_mode',
 '3975 2247 - storage service  the controller battery is charging.:  battery 0 controller 0',
 'level dtlb entries: 4kb 512, 2mb 32, 4mb 32',
 '3975 2358 - storage service  the battery charge cycle is complete.:  battery 0 controller 0',
 '0.0.0.0 061c 0c clock_step +4.686983 s',
 '0.0.0.0 0

In [131]:
# Map each hfidf-selected message to the column-wise mean of
# (1 - similarity), i.e. its average dissimilarity to the selected messages.
x = dict(zip(anomalous_top_hfidf,np.mean(1-similarity_matrix_hfidf,axis=0)))
# Order messages from most to least dissimilar. A plain key function is used
# instead of operator.itemgetter because `operator` is not imported in the
# visible cells, which made this cell raise NameError on a fresh kernel.
sorted_x = dict(sorted(x.items(), key=lambda item: item[1], reverse=True))
# NOTE(review): [1:] drops the top-ranked message; the reason is not evident
# from this notebook — confirm it is intentional.
list(sorted_x.keys())[1:]

['bonjour',
 '    root : tty=unknown ; pwd=/tmp ; user=root ; command=list',
 '0.0.0.0 0615 05 clock_sync',
 '0.0.0.0 0613 03 spike_detect +0.353743 s',
 '0.0.0.0 0613 03 spike_detect +0.348603 s',
 'pci 0000:00:17.4:   bridge window [mem 0xfc300000-0xfc3fffff]',
 '0.0.0.0 061c 0c clock_step +0.329993 s',
 '0.0.0.0 061c 0c clock_step +0.359610 s',
 'pci 0000:00:17.4: [15ad:07a0] type 01 class 0x060400',
 '0.0.0.0 c618 08 no_sys_peer',
 'pci 0000:00:17.3: res[13]=[io  0x1000-0x0fff] get_res_add_size add_size 1000',
 'pci 0000:00:17.4:   bridge window [mem 0xea800000-0xea8fffff 64bit pref]',
 '2017/04/07 14:50:09 seeked /var/log/messages - &{offset:0 whence:2}',
 'pci 0000:00:17.3: system wakeup disabled by acpi',
 '2017/04/07 14:50:09 seeked /var/log/yum.log - &{offset:0 whence:2}',
 'shutting down',
 'pci 0000:00:17.4: bar 13: failed to assign [io  size 0x1000]',
 'pci 0000:00:17.4: bridge window [io  0x1000-0x0fff] to [bus 17] add_size 1000',
 "job `cron.monthly' started",
 "will run 