# Configuration
## Import needed libraries and dependencies

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict
from collections import Counter

In [2]:
pip install pip install logpai

Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting logpai
  Downloading logpai-1.0.0-py3-none-any.whl (125 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.6/125.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: install, logpai
Successfully installed install-1.3.5 logpai-1.0.0
Note: you may need to restart the kernel to use updated packages.


## Downloading raw data
We read the log data from a GitHub repository (the zipped archives are fetched from the URLs listed in the next cell).

In [3]:
# Read zipped data files and store them in Colab
# HDFS_log_file_zipped = 'https://zenodo.org/record/8196385/files/HDFS_v1.zip?download=1'
# BGL_log_file_zipped = 'https://zenodo.org/record/8196385/files/BGL.zip?download=1'
# !wget $HDFS_log_file_zipped
# !wget $BGL_log_file_zipped

In [None]:
# Decompress zipped files
# !unzip -q dataset/HDFS_v1.zip -d dataset/ -y
# !unzip -q dataset/BGL.zip -d dataset/ -y

## Parse raw data

In [3]:
# Parse the raw HDFS log with the Drain log parser.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError because
# 'dataset/HDFS.log' did not exist (the download/unzip cells above are commented out).
from logparser.Drain import LogParser

input_dir = 'dataset/' # Directory that holds the raw log file
output_dir = 'result/'  # Directory that receives the parsing results
log_file = 'HDFS.log'  # Name of the raw log file inside input_dir
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
# log_format = '<Date> <Time> <Level>:<Content>' # Alternative format string, kept for reference (unused)
# Regular expressions for optional preprocessing, passed to the parser as `rex` (default: [])

regex      = [
    r'blk_(|-)[0-9]+' , # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$', # Numbers
]

st = 0.5  # Similarity threshold
depth = 4  # Depth of all leaf nodes

# Build the parser from the settings above and run it on the log file.
parser = LogParser(log_format, indir=input_dir, outdir=output_dir,  depth=depth, st=st, rex=regex)
parser.parse(log_file)

Parsing file: dataset/HDFS.log


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/HDFS.log'

In [7]:
# Use files in Colab
# Locations of the structured log and of the anomaly label file (files kept in Colab)
struct_log_file = 'result/HDFS/HDFS.log_structured.csv'
label_file = 'result/HDFS/HDFS.anomaly_label.csv'

# Both CSVs are read with the C engine, NA detection disabled and memory mapping on.
struct_log = pd.read_csv(
    struct_log_file,
    engine='c',
    na_filter=False,
    memory_map=True,
)
label_data = pd.read_csv(
    label_file,
    engine='c',
    na_filter=False,
    memory_map=True,
)

FileNotFoundError: [Errno 2] No such file or directory: 'result/HDFS/HDFS.anomaly_label.csv'

## Exploring the data

In [8]:
# Quick look at the structured log: container type, dimensions, then the first rows.
print(type(struct_log), struct_log.shape, sep='\n')
struct_log.head(5)

<class 'pandas.core.frame.DataFrame'>
(11175629, 10)


Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,ParameterList
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.19.102:5..."
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,['/mnt/hadoop/mapred/system/job_200811092030_0...
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.10.6:405..."
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-1608999687919862906', '/10.250.14.224:4..."
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,d38aa58d,PacketResponder <*> for block <*> <*>,"['1', 'blk_-1608999687919862906 terminating']"


In [16]:
# Quick look at the label table: container type, dimensions, then the first rows.
print(type(label_data), label_data.shape, sep='\n')
label_data.head(5)

<class 'pandas.core.frame.DataFrame'>
(575061, 2)


Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [10]:
# Summary of the EventId column (count, number of distinct values, most frequent value).
# Related views of the same column: .nunique() and .value_counts().
struct_log['EventId'].describe()

count     11175629
unique          48
top       09a53393
freq       1723232
Name: EventId, dtype: object

# Preprocessing

## Extracting the event sequence for each block ID

Using a regular expression to find the block IDs in each log line, then producing a list of events (i.e., an event sequence) for each block ID.

In [11]:
# Group the parsed events by block ID: every log line contributes its EventId
# to the sequence of each block ID mentioned in its Content.
# The two needed columns are walked with zip() rather than DataFrame.iterrows(),
# which builds a Series object per row and is very slow on a log of this size.
blk_pattern = re.compile(r'(blk_-?\d+)')
data_dict = OrderedDict()
for content, event_id in zip(struct_log['Content'], struct_log['EventId']):
    # set(): a block ID repeated within one line is recorded once for that line
    for blk_Id in set(blk_pattern.findall(content)):
        data_dict.setdefault(blk_Id, []).append(event_id)
data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

print(type(data_df))
print(data_df.shape)
data_df.head(5)

<class 'pandas.core.frame.DataFrame'>
(575061, 2)


Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
1,blk_7503483334202473044,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa..."
2,blk_-3544583377289625738,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
3,blk_-9073992586687739851,"[09a53393, 3d91fa85, 09a53393, 09a53393, d38aa..."
4,blk_7854771516489510256,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa..."


## Merging the label with the event sequence data
Merging the event sequence data with the label data by matching the block IDs.

In [17]:
# Look-up table BlockId -> label string, built from the label file.
label_data_indexed = label_data.set_index('BlockId')
label_dict = label_data_indexed['Label'].to_dict()

# Binary target: 1 for blocks labelled 'Anomaly', 0 for everything else.
# A block ID missing from the label file raises KeyError.
data_df['Label'] = [
    1 if label_dict[block_id] == 'Anomaly' else 0
    for block_id in data_df['BlockId']
]

data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1697891163467046499,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...",0
1,blk_-112721580735521816,"[09a53393, 09a53393, 09a53393, 3d91fa85, d38aa...",1
2,blk_-2465335033010119183,"[3d91fa85, 09a53393, 09a53393, 09a53393, 5d5de...",0
3,blk_-7247291679155656679,"[09a53393, 09a53393, 09a53393, 3d91fa85, d38aa...",0
4,blk_630336298767309315,"[3d91fa85, 09a53393, 09a53393, 09a53393, 5d5de...",0


## Splitting the data into training and testing subsets
We split the data into 70% training data and 30% testing data.

In [12]:
def _split_data(x_data, y_data, train_ratio=0.5):
    pos_idx = y_data > 0
    x_pos = x_data[pos_idx]
    y_pos = y_data[pos_idx]
    x_neg = x_data[~pos_idx]
    y_neg = y_data[~pos_idx]
    train_pos = int(train_ratio * x_pos.shape[0])
    train_neg = int(train_ratio * x_neg.shape[0])
    x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
    y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
    x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
    y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])

    return (x_train, y_train), (x_test, y_test)

Shuffle and split the data into 70% training and 30% testing data.

In [19]:
# Shuffle the data
# Shuffle the data
# A fixed random_state makes the shuffle reproducible across kernel restarts,
# and with it the train/test split derived from the row order.
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)
data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_7848861781780967267,"[09a53393, 09a53393, 09a53393, 3d91fa85, d38aa...",0
1,blk_3972504696374393326,"[3d91fa85, 09a53393, 09a53393, 09a53393, 5d5de...",0
2,blk_-7938189790233556774,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...",0
3,blk_-4282979133728615678,"[3d91fa85, 09a53393, 09a53393, 09a53393, 5d5de...",0
4,blk_-3806429750568355549,"[09a53393, 09a53393, 3d91fa85, 09a53393, d38aa...",0


In [20]:
# Split train and test data
# Fraction of each class (anomaly / normal) that goes into the training set
train_ratio = 0.7
(x_train, y_train), (x_test, y_test) = _split_data(
    data_df['EventSequence'].values,
    data_df['Label'].values,
    train_ratio,
)

In [21]:
# Instance counts per split
num_train, num_test = x_train.shape[0], x_test.shape[0]
num_total = num_train + num_test
# Labels are 0/1, so summing them counts the anomalies
num_train_pos, num_test_pos = sum(y_train), sum(y_test)
num_pos = num_train_pos + num_test_pos

print('Total: {} instances, {} anomaly, {} normal'
      .format(num_total, num_pos, num_total - num_pos))
print('Train: {} instances, {} anomaly, {} normal'
      .format(num_train, num_train_pos, num_train - num_train_pos))
print('Test: {} instances, {} anomaly, {} normal\n'
      .format(num_test, num_test_pos, num_test - num_test_pos))

# Peek at the first five training sequences and their labels
print('====== x_train (first five lines) ======')
print(x_train[:5])

print('====== y_train (first five lines) ======')
print(y_train[:5])

Total: 575061 instances, 16838 anomaly, 558223 normal
Train: 402542 instances, 11786 anomaly, 390756 normal
Test: 172519 instances, 5052 anomaly, 167467 normal

[list(['09a53393', '09a53393', '3d91fa85', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '2e68ccc3', 'dba996ef'])
 list(['09a53393', '3d91fa85', '09a53393', '09a53393', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', '5d5de21c', '5d5de21c', '5d5de21c', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '2e68ccc3', 'dba996ef'])
 list(['3d91fa85', '09a53393', '09a53393', '09a53393', '5d5de21c', '5d5de21c', '5d5de21c', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3df2680', 'd63ef163', 'd63ef163', 'd63ef163', 'dba996ef', 'dba996ef', '2e68ccc3', 'dba996ef'])
 list(['09a53393', '09a53393', '09a53393', '3d91fa85', 'd38aa58d', 'e3df2680', 'd38aa58d', 'e3d

## Feature extraction

In [22]:
def transform_train_data(X_seq):
    """Turn event sequences into an event-count matrix.

    Args:
        X_seq: iterable of event sequences, e.g. a 1-D object array or a
            list whose items are lists of event IDs.

    Returns:
        A tuple ``(X, events)``. ``X`` is a 2-D numpy array with one row per
        sequence and one column per distinct event, holding how often the
        event occurs in the sequence (0 when absent). ``events`` is the
        pandas Index of column labels, in the column order of ``X``.
    """
    # One Counter per sequence; pandas aligns them into columns and leaves
    # NaN where an event does not occur, which is then replaced by 0.
    X_df = pd.DataFrame([Counter(seq) for seq in X_seq]).fillna(0)
    return (X_df.values, X_df.columns)

transformed = transform_train_data(x_train)
# Unpack the count matrix and the events seen in the training data.
# The events are used later to ignore events in the test data that are
# unseen in the training data.
x_train, events = transformed

print('====== Transformed train data summary ======')
print('Train data shape: {}-by-{}\n'.format(*x_train.shape))
print(x_train[:5])

Train data shape: 402542-by-47

[[3. 1. 3. 3. 3. 3. 3. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 1. 3. 3. 3. 3. 3. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 1. 3. 3. 3. 3. 3. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [4. 1. 3. 3. 4. 4. 4. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [4. 1. 3. 3. 4. 3. 4. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [23]:
def transform_test_data(X_seq, events):
    """Turn event sequences into a count matrix laid out like the training data.

    Args:
        X_seq: iterable of event sequences, e.g. a 1-D object array or a
            list whose items are lists of event IDs.
        events: column labels (event IDs) of the training matrix, in order.

    Returns:
        A 2-D numpy array with one row per sequence and one column per entry
        of ``events``, in that order. Events in ``events`` that never occur
        in ``X_seq`` get an all-zero column; events that occur in ``X_seq``
        but are not listed in ``events`` are dropped.
    """
    X_df = pd.DataFrame([Counter(seq) for seq in X_seq]).fillna(0)
    # A single reindex selects the training columns in order and fills the
    # missing ones with 0, replacing the column-by-column insertion loop.
    return X_df.reindex(columns=events, fill_value=0).values

# Encode the test sequences with the columns of the training matrix
x_test = transform_test_data(x_test, events)

print('====== Transformed test data summary ======')
print('Test data shape: {}-by-{}\n'.format(*x_test.shape))
print(x_test[:5])

Test data shape: 172519-by-47

[[5. 1. 3. 3. 7. 3. 5. 0. 2. 2. 2. 2. 4. 2. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [5. 1. 3. 3. 7. 3. 5. 0. 2. 2. 2. 2. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


## Constructing a logistic regression model

In [26]:
# L2-penalised logistic regression trained on the event-count features.
# NOTE(review): the recorded run stopped at the iteration limit (max_iter=1000)
# and emitted a convergence warning suggesting more iterations or feature scaling.
lr_classifier = LogisticRegression(
    penalty='l2',
    C=100,
    tol=0.01,
    class_weight=None,
    max_iter=1000,
)

lr_classifier.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=100, max_iter=1000, tol=0.01)

## Evaluate the performance of the logistic regression model

We evaluate the model's performance on the testing set.

In [None]:
## Precision, recall, f1-score, and precission-recall cu