# Configuration
## Import needed libraries and dependencies

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict
from collections import Counter

In [2]:
pip install pip install logpai

Note: you may need to restart the kernel to use updated packages.


## Downloading raw data
We read the log data from a GitHub repository (the zipped archives themselves are hosted on Zenodo; see the URLs in the next cell).

In [6]:
# Download the zipped HDFS and BGL log archives from Zenodo and store them in the
# runtime (e.g. Colab). The commands are commented out; uncomment them to fetch the data.
# HDFS_log_file_zipped = 'https://zenodo.org/record/8196385/files/HDFS_v1.zip?download=1'
# BGL_log_file_zipped = 'https://zenodo.org/record/8196385/files/BGL.zip?download=1'
# !wget $HDFS_log_file_zipped
# !wget $BGL_log_file_zipped

In [7]:
# Decompress zipped files into dataset/ (commented out; uncomment to run).
# NOTE(review): both commands read the archives from dataset/ -- make sure the
# downloaded zip files are located there before running them.
# !unzip -q dataset/HDFS_v1.zip -d dataset/ -y
# !unzip -q dataset/BGL.zip -d dataset/ -y

## Parse raw data

In [2]:
from logparser.Drain import LogParser

# Input log location and the directory that receives the parsing results.
input_dir = 'dataset/HDFS'
output_dir = 'result/HDFS'
log_file = 'HDFS.log'

# Field layout of one HDFS log line.
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'

# Regular expressions handed to the parser through its `rex` argument.
regex = [
    r'blk_(|-)[0-9]+',  # Block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
]

# Parser settings: similarity threshold and depth of all leaf nodes.
st = 0.5
depth = 4

parser = LogParser(
    log_format,
    indir=input_dir,
    outdir=output_dir,
    depth=depth,
    st=st,
    rex=regex,
)
parser.parse(log_file)

Parsing file: dataset/HDFS/HDFS.log
Total lines:  11175629
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3

In [7]:
# Paths of the structured log written by the parser and of the per-block labels.
struct_log_file = 'result/HDFS/HDFS.log_structured.csv'
label_file = 'dataset/HDFS/preprocessed/anomaly_label.csv'

# Both files are read with identical options: C engine, no NaN detection,
# memory-mapped file access.
struct_log, label_data = (
    pd.read_csv(csv_path, engine='c', na_filter=False, memory_map=True)
    for csv_path in (struct_log_file, label_file)
)

## Exploring the data

In [8]:
# Type and dimensions of the structured log, then a preview of its first rows.
print(type(struct_log), struct_log.shape, sep='\n')
struct_log.head(5)

<class 'pandas.core.frame.DataFrame'>
(11175629, 10)


Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,ParameterList
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.19.102:5..."
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,['/mnt/hadoop/mapred/system/job_200811092030_0...
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.10.6:405..."
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.14.224:4..."
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,d38aa58d,PacketResponder <*> for block <*> <*>,"['1', 'blk_-1608999687919862906 terminating']"


In [9]:
# Type and dimensions of the label table, then a preview of its first rows.
print(type(label_data), label_data.shape, sep='\n')
label_data.head(5)

<class 'pandas.core.frame.DataFrame'>
(575061, 2)


Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [10]:
# Summary statistics of the EventId column.
struct_log['EventId'].describe()
# Alternative views of the same column:
#struct_log.EventId.nunique()
#struct_log.EventId.value_counts()

count     11175629
unique          48
top       09a53393
freq       1723232
Name: EventId, dtype: object

# Preprocessing

## Extracting the event sequence for each block ID

Using a regular expression to find the block IDs in each log line, then producing a list of events (i.e., an event sequence) for each block ID.

In [10]:
# Group the EventIds of all log lines by the block ID(s) mentioned in each line.
# Only the two required columns are iterated (via zip) instead of calling
# DataFrame.iterrows(), which builds a Series per row and is very slow on a
# frame with millions of rows.
blk_id_pattern = re.compile(r'(blk_-?\d+)')
data_dict = OrderedDict()
for content, event_id in zip(struct_log['Content'], struct_log['EventId']):
    # OrderedDict.fromkeys removes duplicate IDs within a line while keeping
    # their first-seen order, so the order in which new block IDs enter
    # data_dict is the same on every run (iterating a set of strings is not).
    for blk_Id in OrderedDict.fromkeys(blk_id_pattern.findall(content)):
        data_dict.setdefault(blk_Id, []).append(event_id)
data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

print(type(data_df))
print(data_df.shape)
data_df.head(5)

<class 'pandas.core.frame.DataFrame'>
(575061, 2)


Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
1,blk_7503483334202473044,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa..."
2,blk_-3544583377289625738,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
3,blk_-9073992586687739851,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
4,blk_7854771516489510256,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa..."


## Merging the label with the event sequence data
Merging the event sequence data with the label data by matching the block IDs.

In [12]:
# Look up the label of every block by its BlockId and encode it as
# 1 for 'Anomaly' and 0 for any other label.
label_data_indexed = label_data.set_index('BlockId')
label_dict = label_data_indexed['Label'].to_dict()

# The plain dict lookup raises KeyError for a block without a label, so a
# missing label is never silently encoded as 0.
is_anomaly = data_df['BlockId'].map(lambda blk_id: label_dict[blk_id] == 'Anomaly')
data_df['Label'] = is_anomaly.astype('int64')

data_df.to_csv('result/HDFS/HDFS_ReadToExploitData.csv', index=False)

# The preview must be the last expression of the cell to be rendered.
data_df.head(5)

## Splitting the data into training and testing subsets
We split the data into 70% training data and 30% testing data.

In [13]:
def _split_data(x_data, y_data, train_ratio=0.5):
    pos_idx = y_data > 0
    x_pos = x_data[pos_idx]
    y_pos = y_data[pos_idx]
    x_neg = x_data[~pos_idx]
    y_neg = y_data[~pos_idx]
    train_pos = int(train_ratio * x_pos.shape[0])
    train_neg = int(train_ratio * x_neg.shape[0])
    x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
    y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
    x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
    y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])

    return (x_train, y_train), (x_test, y_test)

Shuffle and split the data into 70% training and 30% testing data.

In [14]:
# Shuffle the data with a fixed seed so that the permutation is reproducible
# across kernel restarts.
RANDOM_SEED = 42
data_df = data_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-6365688489215642303,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa...",0
1,blk_373277261817055471,"[3d91fa85, 09a53393, 09a53393, 09a53393, d38aa...",0
2,blk_-6560606358175300596,"[09a53393, 09a53393, 09a53393, 3d91fa85, d38aa...",0
3,blk_89728051842830064,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...",0
4,blk_-5476875544148884281,"[09a53393, 09a53393, 09a53393, 3d91fa85, d38aa...",0


In [15]:
# Split train and test data: 70% of each class for training, the rest for testing.
train_ratio = 0.7
sequence_values = data_df['EventSequence'].values
label_values = data_df['Label'].values
(x_train, y_train), (x_test, y_test) = _split_data(sequence_values, label_values, train_ratio)

In [16]:
# Number of samples in each split and how many of them carry a positive (anomaly) label.
num_train, num_test = x_train.shape[0], x_test.shape[0]
num_total = num_train + num_test
num_train_pos, num_test_pos = sum(y_train), sum(y_test)
num_pos = num_train_pos + num_test_pos

print('Total: {} instances, {} anomaly, {} normal'.format(
    num_total, num_pos, num_total - num_pos))
print('Train: {} instances, {} anomaly, {} normal'.format(
    num_train, num_train_pos, num_train - num_train_pos))
print('Test: {} instances, {} anomaly, {} normal\n'.format(
    num_test, num_test_pos, num_test - num_test_pos))

#print(type(x_train))
# Preview of the first five training sequences and their labels.
print('====== x_train (first five lines) ======')
print(x_train[:5])

print('====== y_train (first five lines) ======')
print(y_train[:5])

Total: 575061 instances, 16838 anomaly, 558223 normal
Train: 402542 instances, 11786 anomaly, 390756 normal
Test: 172519 instances, 5052 anomaly, 167467 normal

[list(['3d91fa85', '09a53393', '09a53393', '09a53393', '5d5de21c', '5d5de21c', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '728076ac', '5d5de21c', '40651754', '5d5de21c', '73c2ec69', 'd6b7b743', '09a53393', 'dba996ef', '626085d5', '32777b38', '32777b38', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', 'dba996ef'])
 list(['09a53393', '09a53393', '09a53393', '3d91fa85', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', 'dba996ef', '2e68ccc3'])
 list(['09a53393', '3d91fa85', '09a53393', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', 'dba996ef', 