In [69]:
from pathlib import Path
import re

import pandas as pd
from logparser import Drain

from raw_log_reader import log_read, file_exists


In [42]:
# Root folder of the raw datasets, plus the sub-folder that the per-dataset
# structured output directories are created under.
data_dir = Path("data")
structured_dir = data_dir.joinpath("structured")


In [43]:
# Create the structured-output root (and any missing parents); exist_ok=True
# makes the cell safe to re-run.
structured_dir.mkdir(parents=True, exist_ok=True)

## HDFS 2k

In [70]:

# Names identifying the HDFS 2k sample: its dataset folder and raw log file.
hdfs2k_dataset_name = "HDFS_2k"
hdfs2k_log_name = "HDFS_2k.log"

# Raw dataset directory, located under the shared data root.
hdfs2k_dir = data_dir.joinpath(hdfs2k_dataset_name)

In [71]:
# Check that the raw HDFS 2k log is present at the path handed to the parser
# (`file_exists` is a helper from the local `raw_log_reader` module).
file_exists(hdfs2k_dir / hdfs2k_log_name)

True

In [72]:
# Output directory for the HDFS 2k sample, named after the dataset.
structured_hdfs2k = structured_dir.joinpath(hdfs2k_dataset_name)

In [73]:
# Create the HDFS 2k output directory; exist_ok=True makes the cell re-runnable.
structured_hdfs2k.mkdir(parents=True, exist_ok=True)

In [74]:
# Sanity check that the output directory is in place. `Path.is_dir()` already
# returns False for a path that does not exist, so no separate `exists()` call
# is needed.
structured_hdfs2k.is_dir()

True

In [75]:
# HDFS log format: each <Field> placeholder names one part of a raw log line.
hdfs_logline_format = "<Date> <Time> <NodeID> <Level> <NodeName>: <Content>"

In [76]:
# Drain settings for the HDFS 2k sample.
hdfs2k_settings = {
    # Regular expression list for optional preprocessing (default: [])
    "regex": [
        # block id
        r'blk_(|-)[0-9]+',
        # IP
        r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
        # Numbers
        r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
    ],
    # Similarity threshold. Default: 0.4
    "st": 0.5,
    # Depth of all leaf nodes. Default: 4
    "depth": 4,
}


In [None]:

# Configure the Drain parser for the HDFS 2k sample: raw dataset directory as
# input, structured directory as output, tuning values from `hdfs2k_settings`.
drain_parser_hdfs2k = Drain.LogParser(
    log_format=hdfs_logline_format,
    indir=hdfs2k_dir,
    outdir=structured_hdfs2k,
    depth=hdfs2k_settings["depth"],
    st=hdfs2k_settings["st"],
    rex=hdfs2k_settings["regex"],
)


In [78]:
# Derive the (column names, compiled named-group regex) pair from the parser's
# log-format template.
hdfs2k_named_regex = drain_parser_hdfs2k.generate_logformat_regex(drain_parser_hdfs2k.log_format)

In [79]:
# Inspect the derived column names and the line-matching regex.
hdfs2k_named_regex

(['Date', 'Time', 'NodeID', 'Level', 'NodeName', 'Content'],
 regex.Regex('^(?P<Date>.*?)\\s+(?P<Time>.*?)\\s+(?P<NodeID>.*?)\\s+(?P<Level>.*?)\\s+(?P<NodeName>.*?):\\s+(?P<Content>.*?)$', flags=regex.V0))

In [80]:
# Run Drain over the HDFS 2k log file; progress and elapsed time are printed.
drain_parser_hdfs2k.parse(hdfs2k_log_name)

Parsing file: data\HDFS_2k\HDFS_2k.log
Total lines:  2000
Processed 50.0% of log lines.
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.734015]


In [81]:
# Work on a copy of the parser's parsed-log frame rather than the parser's own
# attribute.
hdfs2k_df = drain_parser_hdfs2k.df_log.copy()

In [82]:
# Preview the first three parsed rows (original fields plus event id, template
# and parameter list).
hdfs2k_df.head(3)

Unnamed: 0,LineId,Date,Time,NodeID,Level,NodeName,Content,EventId,EventTemplate,ParameterList
0,1,81109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...,dc2c74b7,PacketResponder <*> for block <*> terminating,"[1, blk_38865049064139660]"
1,2,81109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...,dc2c74b7,PacketResponder <*> for block <*> terminating,"[0, blk_-6952295868487656571]"
2,3,81109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,"[10.251.73.220:50010, blk_7128370237687728475,..."


## HDFS v1

In [44]:

# Names identifying the full HDFS v1 dataset: its folder and raw log file.
# NOTE(review): `dataset_name` is generic compared with the prefixed names used
# in the other sections; it is kept because later cells reference it.
dataset_name = "HDFS_v1"
hdfs1_log_name = "HDFS.log"

# Raw dataset directory, located under the shared data root.
hdfs1_dir = data_dir.joinpath(dataset_name)

In [45]:
# Check that the raw HDFS v1 log is present at the path handed to the parser
# (`file_exists` is a helper from the local `raw_log_reader` module).
file_exists(hdfs1_dir / hdfs1_log_name)

True

In [46]:
# Output directory for HDFS v1, named after the dataset.
structured_hdfs1 = structured_dir.joinpath(dataset_name)

In [47]:
# Create the HDFS v1 output directory; exist_ok=True makes the cell re-runnable.
structured_hdfs1.mkdir(parents=True, exist_ok=True)

In [48]:
# Sanity check that the output directory is in place. `Path.is_dir()` already
# returns False for a path that does not exist, so no separate `exists()` call
# is needed.
structured_hdfs1.is_dir()

True

The `parse()` method explicitly expects the log message to be parsed in a column named `Content`.

In [54]:
# HDFS log format: each <Field> placeholder names one part of a raw log line.
hdfs_logline_format = "<Date> <Time> <NodeID> <Level> <NodeName>: <Content>"


In [55]:
# Drain tuning values for HDFS v1.
# Depth of all leaf nodes. Default: 4
depth = 4
# Similarity threshold. Default: 0.4
st = 0.5

# Regular expression list for optional preprocessing (default: [])
regex = [
    # block id
    r'blk_(|-)[0-9]+',
    # IP
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
    # Numbers
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
]


In [56]:

# Configure the Drain parser for the full HDFS v1 dataset: raw dataset
# directory as input, structured directory as output.
drain_parser = Drain.LogParser(
    log_format=hdfs_logline_format,
    indir=hdfs1_dir,
    outdir=structured_hdfs1,
    depth=depth,
    st=st,
    rex=regex,
)


In [57]:
# Derive the (column names, compiled named-group regex) pair from the parser's
# log-format template.
# NOTE(review): `named_gerexpr` looks like a typo for `named_regexpr`; it is
# left unchanged because the next cell references this name.
named_gerexpr = drain_parser.generate_logformat_regex(drain_parser.log_format)

In [58]:
# Inspect the derived column names and the line-matching regex.
named_gerexpr

(['Date', 'Time', 'NodeID', 'Level', 'NodeName', 'Content'],
 regex.Regex('^(?P<Date>.*?)\\s+(?P<Time>.*?)\\s+(?P<NodeID>.*?)\\s+(?P<Level>.*?)\\s+(?P<NodeName>.*?):\\s+(?P<Content>.*?)$', flags=regex.V0))

When the flag `regex.V0` is used, the `regex` module's behaviour is compatible with the standard `re` module.

In [59]:
# Parse the full HDFS v1 log (11,175,629 lines in the recorded run). This is
# long-running and prints a very large number of progress lines.
drain_parser.parse(hdfs1_log_name)

Parsing file: data\HDFS_v1\HDFS.log
Total lines:  11175629
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.0% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.1% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.2% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3% of log lines.
Processed 0.3

In [60]:
# Work on a copy of the parser's parsed-log frame rather than the parser's own
# attribute.
hdfs1_df = drain_parser.df_log.copy()

In [61]:
# Preview the first three parsed rows.
hdfs1_df.head(3)

Unnamed: 0,LineId,Date,Time,NodeID,Level,NodeName,Content,EventId,EventTemplate,ParameterList
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"[blk_-1608999687919862906, /10.250.19.102:5410..."
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,[/mnt/hadoop/mapred/system/job_200811092030_00...
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,09a53393,Receiving block <*> src: <*> dest: <*>,"[blk_-1608999687919862906, /10.250.10.6:40524,..."


## Spark Log

In [85]:
# Names identifying the Spark 2k sample: its dataset folder and raw log file.
spark2k_dataset_name = "Spark"
spark2k_log_name = "Spark_2k.log"

# Raw dataset directory, located under the shared data root.
spark2k_dir = data_dir.joinpath(spark2k_dataset_name)

In [86]:
# Check that the raw Spark 2k log is present at the path handed to the parser
# (`file_exists` is a helper from the local `raw_log_reader` module).
file_exists(spark2k_dir / spark2k_log_name)

True

In [89]:
# Output directory for the Spark sample, named after the dataset.
structured_spark2k = structured_dir.joinpath(spark2k_dataset_name)

In [90]:
# Create the Spark output directory; exist_ok=True makes the cell re-runnable.
structured_spark2k.mkdir(parents=True, exist_ok=True)

In [91]:
# Sanity check that the output directory is in place. `Path.is_dir()` already
# returns False for a path that does not exist, so no separate `exists()` call
# is needed.
structured_spark2k.is_dir()

True

In [92]:
# Spark log format: each <Field> placeholder names one part of a raw log line.
spark_log_format = "<Date> <Time> <Level> <Component>: <Content>"

In [93]:
# Drain settings for the Spark 2k sample.
spark2k_settings = {
    # Preprocessing patterns handed to the parser as `rex`.
    "regex": [
        # four dot-separated digit groups (IPv4-like)
        r'(\d+\.){3}\d+',
        # standalone size-unit tokens: B, KB, MB, GB, TB
        r'\b[KGTM]?B\b',
        # three or more dot-separated word segments
        r'([\w-]+\.){2,}[\w-]+',
    ],
    # similarity threshold
    "st": 0.5,
    # tree depth
    "depth": 4,
}

In [104]:
# Configure the Drain parser for the Spark 2k sample: raw dataset directory as
# input, structured directory as output, tuning values from `spark2k_settings`.
drain_parser_spark2k = Drain.LogParser(
    log_format=spark_log_format,
    indir=spark2k_dir,
    outdir=structured_spark2k,
    rex=spark2k_settings["regex"],
    st=spark2k_settings["st"],
    depth=spark2k_settings["depth"],
)

In [105]:
# Derive the (column names, compiled named-group regex) pair from the parser's
# log-format template.
spark2k_named_regex = drain_parser_spark2k.generate_logformat_regex(drain_parser_spark2k.log_format)

In [106]:
# Inspect the derived column names and the line-matching regex.
spark2k_named_regex

(['Date', 'Time', 'Level', 'Component', 'Content'],
 regex.Regex('^(?P<Date>.*?)\\s+(?P<Time>.*?)\\s+(?P<Level>.*?)\\s+(?P<Component>.*?):\\s+(?P<Content>.*?)$', flags=regex.V0))

In [107]:
# Run Drain over the Spark 2k log file; progress and elapsed time are printed.
drain_parser_spark2k.parse(spark2k_log_name)

Parsing file: data\Spark\Spark_2k.log
Total lines:  2000
Processed 50.0% of log lines.
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.857543]


In [108]:
# Work on a copy of the parser's parsed-log frame rather than the parser's own
# attribute.
spark2k_df = drain_parser_spark2k.df_log.copy()

In [109]:
# Preview the first three parsed rows.
spark2k_df.head(3)

Unnamed: 0,LineId,Date,Time,Level,Component,Content,EventId,EventTemplate,ParameterList
0,1,17/06/09,20:10:40,INFO,executor.CoarseGrainedExecutorBackend,"Registered signal handlers for [TERM, HUP, INT]",932c104e,"Registered signal handlers for [TERM, HUP, INT]",[]
1,2,17/06/09,20:10:40,INFO,spark.SecurityManager,"Changing view acls to: yarn,curi",7bb9d001,"Changing <*> acls to: yarn,curi",[view]
2,3,17/06/09,20:10:40,INFO,spark.SecurityManager,"Changing modify acls to: yarn,curi",7bb9d001,"Changing <*> acls to: yarn,curi",[modify]


In [110]:
# Summarise the parsed frame: column names, non-null counts, dtypes and memory.
spark2k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   LineId         2000 non-null   int64 
 1   Date           2000 non-null   object
 2   Time           2000 non-null   object
 3   Level          2000 non-null   object
 4   Component      2000 non-null   object
 5   Content        2000 non-null   object
 6   EventId        2000 non-null   object
 7   EventTemplate  2000 non-null   object
 8   ParameterList  2000 non-null   object
dtypes: int64(1), object(8)
memory usage: 140.8+ KB
