PFR Air France KLM
# Analyse des messages et extration des anomalies
_____

In [395]:
import pandas as pd
import numpy as np
from Reader import Reader
from Parser import Parser
from Analyzer import Analyzer
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import linear_kernel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Import et parsing des logs

In [361]:
reader = Reader()

pool = multiprocessing.Pool()

# Gathering all logs in one list
logs = reader.read_dir('../log_tfidf/*.log')


def parse(log):
    return [log.split(' ', 3)[0][:10], log.split(' ', 3)[1], log.split(' ', 3)[3].replace('\n','')]


parsed_logs = list(pool.map(parse, logs))

pool.close()
pool.join()

../log_tfidf/qvidbkartn05.20170428.log
../log_tfidf/qvidbkartn04.20170407.log
../log_tfidf/qvidbkartn03.20170407.log
../log_tfidf/qvidbkartn03.20170412.log
../log_tfidf/qvidbkartn03.20170406.log
../log_tfidf/qvidbkartn04.20170412.log
../log_tfidf/qvidbkartn04.20170406.log
../log_tfidf/qvidbkartn05.20170429.log
../log_tfidf/qvidbkartn04.20170404.log
../log_tfidf/qvidbkartn03.20170405.log
../log_tfidf/qvidbkartn04.20170405.log
../log_tfidf/qvidbkartn05.20170412.log
../log_tfidf/qvidbkartn04.20170401.log
../log_tfidf/qvidbkartn04.20170429.log
../log_tfidf/qvidbkartn03.20170401.log
../log_tfidf/qvidbkartn03.20170429.log
../log_tfidf/qvidbkartn03.20170428.log
../log_tfidf/qvidbkartn04.20170428.log
../log_tfidf/qvidbkartn03.20170402.log
../log_tfidf/qvidbkartn03.20170403.log
../log_tfidf/qvidbkartn02.20170404.log
../log_tfidf/qvidbkarro01.20170426.log
../log_tfidf/qvidblogrl01.20170422.log
../log_tfidf/qvidbkarro02.20170412.log
../log_tfidf/qvidblogrl01.20170423.log
../log_tfidf/qvidbmqtst01

## TF-IDF pour chaque server à traver le temps

In [329]:
features = {}
tfidf = {}

df_parsed_logs = pd.DataFrame(data=parsed_logs, columns=[
                              'Date', 'Hostname', 'Message'])

df_grouped_by_hostname = df_parsed_logs.groupby('Hostname')

for name_h, group_h in df_grouped_by_hostname:
    df_grouped_by_date = group_h.groupby('Date')
    messages = []
    for name_d, group_d in df_grouped_by_date:
        messages.append('\n'.join(group_d['Message'].values))
    
    vect = CountVectorizer(token_pattern='(.*)\n')
    tr = TfidfTransformer()
    
    vect_hostname = vect.fit_transform(messages)
    features[hostname] = vect.get_feature_names()
    tfidf[hostname] = tr.fit_transform(vect_hostname)

In [330]:
def print_tfidf_matrix(tfidf, features):
    return pd.DataFrame(tfidf.A, columns=features)

In [331]:
server = list(tfif.keys())[0]
print(server)
print_tfidf_matrix(tfidf[server], features[server])

qvidbmwadh01


Unnamed: 0,root : tty=unknown ; pwd=/tmp ; user=root ; command=list,tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/tech/local/backup/jobs/xobackup,tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar1y_l /var/log/traces/ /var/log/traces/sar/sudo.log*,tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar3m_l /var/log/traces/sar/bsm_log/* /var/log/traces/sar/audit.log*,tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=list,(root) cmd (/etc/telegraf/scripts/telegraf-configure.sh),(root) cmd (/exploit/local/bin/sar_secondary_group.sh > /dev/null 2>&1),(root) cmd (/tech/local/bin/sssd_local_accounts.sh > /dev/null 2>&1),(root) cmd (/tech/local/sbin/servinfo-gen),(root) cmd (/tech/unix/uid/collecte_uid_gid_pjid.sh > /dev/null 2>&1),...,will run job `cron.daily' in 32 min.,will run job `cron.daily' in 37 min.,will run job `cron.daily' in 41 min.,will run job `cron.daily' in 47 min.,will run job `cron.daily' in 48 min.,will run job `cron.daily' in 49 min.,will run job `cron.daily' in 8 min.,will run job `cron.monthly' in 75 min.,will run job `cron.weekly' in 57 min.,will run job `cron.weekly' in 59 min.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.003502,0.003502,0.003502,0.010506,0.003502,0.004949,0.0,0.007246,0.007004,...,0.009209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.005289,0.005289,0.005289,0.015866,0.005289,0.007474,0.0,0.0,0.010577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013907,0.0,0.0
6,0.0,0.005167,0.005167,0.005167,0.0155,0.005167,0.007301,0.0,0.01069,0.010333,...,0.0,0.0,0.0,0.013586,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.005313,0.005313,0.005313,0.01594,0.005313,0.007509,0.0,0.0,0.010627,...,0.0,0.01223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.004982,0.004982,0.004982,0.014946,0.004982,0.0,0.0,0.0,0.009964,...,0.0,0.0,0.0,0.0,0.0,0.013101,0.0,0.0,0.0,0.013101
9,0.0,0.005418,0.005418,0.005418,0.016255,0.005418,0.0,0.0,0.0,0.010837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get top rare messages

In [332]:
def top_rare_feats(Xtr, features, top_n=25):
    scores = np.max(Xtr - Xtr.mean(axis = 0), axis=0).tolist()[0]
    topn_ids = np.argsort(scores)[::-1][:top_n]
    top_feats = [(features[i], scores[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'scarcity']
    return df

In [333]:
server = list(tfidf.keys())[0]
top_rare_feats(tfidf[server], features[server], top_n=25)

Unnamed: 0,feature,scarcity
0,bonjour,0.555109
1,vmciutil: updating context id from 0xf35e3198 ...,0.538601
2,0.0.0.0 0628 08 no_sys_peer,0.3904
3,0.0.0.0 0615 05 clock_sync,0.382669
4,0.0.0.0 061c 0c clock_step +0.744415 s,0.362636
5,0.0.0.0 0613 03 spike_detect +0.725244 s,0.362636
6,0.0.0.0 0613 03 spike_detect +4.687027 s,0.362636
7,0.0.0.0 061c 0c clock_step +4.686983 s,0.362636
8,0.0.0.0 0613 03 spike_detect +0.234536 s,0.307006
9,0.0.0.0 061c 0c clock_step +0.239434 s,0.307006


## Extract anomalous logs

In [362]:
threshold = 0.5

df_features = top_rare_feats(tfidf[server], features[server], top_n=25)

anormal_messages = df_features[df_features['scarcity'] > threshold]['feature'].values

df_parsed_log_server = df_parsed_logs[df_parsed_logs['Hostname'] == server]

df_parsed_log_server[df_parsed_log_server['Message'].str.lower().isin(anormal_messages)]

Unnamed: 0,Date,Hostname,Message
22582,2017-04-01,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47549,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47555,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47561,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47562,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47563,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47565,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
47575,2017-04-04,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
62435,2017-04-05,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...
62441,2017-04-05,qvidbmwadh01,VMCIUtil: Updating context id from 0xf35e3198 ...


In [363]:
features[server]

['    root : tty=unknown ; pwd=/tmp ; user=root ; command=list',
 '  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/tech/local/backup/jobs/xobackup',
 '  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar1y_l /var/log/traces/ /var/log/traces/sar/sudo.log*',
 '  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar3m_l /var/log/traces/sar/bsm_log/* /var/log/traces/sar/audit.log*',
 '  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=list',
 '(root) cmd (/etc/telegraf/scripts/telegraf-configure.sh)',
 '(root) cmd (/exploit/local/bin/sar_secondary_group.sh > /dev/null 2>&1)',
 '(root) cmd (/tech/local/bin/sssd_local_accounts.sh > /dev/null 2>&1)',
 '(root) cmd (/tech/local/sbin/servinfo-gen)',
 '(root) cmd (/tech/unix/uid/collecte_uid_gid_pjid.sh > /dev/null 2>&1)',
 '(root) cmd (/usr/lib64/sa/sa1 1 1)',
 '(root) cmd (/us

In [397]:
test = features[server]



tagged_docs=[]
for i, doc in enumerate(features[server]):
    tagged_docs.append(TaggedDocument(words = doc, tags = [i]))
    
    
model = Doc2Vec(tagged_docs, size=200)

In [399]:
tagged_docs

[TaggedDocument(words='    root : tty=unknown ; pwd=/tmp ; user=root ; command=list', tags=[0]),
 TaggedDocument(words='  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/tech/local/backup/jobs/xobackup', tags=[1]),
 TaggedDocument(words='  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar1y_l /var/log/traces/ /var/log/traces/sar/sudo.log*', tags=[2]),
 TaggedDocument(words='  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=/usr/bin/dsmc archive -archmc=ar3m_l /var/log/traces/sar/bsm_log/* /var/log/traces/sar/audit.log*', tags=[3]),
 TaggedDocument(words='  tsmbck : tty=unknown ; pwd=/opt/tivoli/tsm/hometsmbck ; user=root ; command=list', tags=[4]),
 TaggedDocument(words='(root) cmd (/etc/telegraf/scripts/telegraf-configure.sh)', tags=[5]),
 TaggedDocument(words='(root) cmd (/exploit/local/bin/sar_secondary_group.sh > /dev/null 2>&1)', tags=[6]),
 TaggedDocument(words='(ro

In [401]:
model.most_similar(['    root : tty=unknown ; pwd=/tmp ; user=root ; command=list'])

KeyError: "word '    root : tty=unknown ; pwd=/tmp ; user=root ; command=list' not in vocabulary"