In [2]:
import pandas as pd
import re


In [3]:
# Load the raw HDFS log into a one-column frame, one row per line.
# Undecodable bytes are dropped by errors="ignore" rather than raising.
with open("../HDFS.log", "r", errors="ignore") as log_file:
    # Strip surrounding whitespace (including the trailing newline) per line
    # while streaming the file.
    stripped_lines = [raw_line.strip() for raw_line in log_file]

logs = pd.DataFrame({"raw_log": stripped_lines})
logs.head()


Unnamed: 0,raw_log
0,081109 203518 143 INFO dfs.DataNode$DataXceive...
1,081109 203518 35 INFO dfs.FSNamesystem: BLOCK*...
2,081109 203519 143 INFO dfs.DataNode$DataXceive...
3,081109 203519 145 INFO dfs.DataNode$DataXceive...
4,081109 203519 145 INFO dfs.DataNode$PacketResp...


In [4]:
# Expected line layout: "<6 digits> <6 digits> <digits> <LEVEL> <component>: <message>".
# Five groups are captured (date, time, level, component, message); the third
# numeric field is matched but not captured. Only INFO, WARN and ERROR levels
# are accepted, and the component is everything up to the first colon.
log_pattern = re.compile(
    r"^(\d{6})\s+(\d{6})\s+\d+\s+(INFO|WARN|ERROR)\s+([^\:]+):\s+(.*)$"
)


def parse_log(line):
    """Split one log line into its five captured fields.

    Parameters
    ----------
    line : str
        A single log line, matched from its first character.

    Returns
    -------
    tuple
        ``(date, time, level, component, message)`` as strings when the line
        matches ``log_pattern``; a tuple of five ``None`` values otherwise.
    """
    hit = log_pattern.match(line)
    if hit is None:
        # Keep the arity fixed so callers can always unpack five fields.
        return (None,) * 5
    return hit.groups()


In [5]:
# Work on a reproducible random subset of the log lines. The request is capped
# at the number of available lines so that a log shorter than 200,000 lines
# does not make `sample` raise (sampling without replacement cannot exceed
# the population size).
sample_logs = logs.sample(min(200_000, len(logs)), random_state=42)

# Parse each line once into a 5-tuple and build the frame in a single
# constructor call. Wrapping every row in its own pd.Series inside `apply`
# gives the same table but allocates one Series per line, which is very slow
# at this size. The sampled index is kept so rows remain traceable to `logs`.
# Lines that do not match the pattern yield a row of None values.
parsed = pd.DataFrame(
    sample_logs["raw_log"].map(parse_log).tolist(),
    index=sample_logs.index,
    columns=["date", "time", "level", "component", "message"],
)
parsed.head()


Unnamed: 0,date,time,level,component,message
4237366,81110,210118,INFO,dfs.FSNamesystem,BLOCK* NameSystem.delete: blk_-170857579237596...
7820301,81111,53144,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-3552845605773...
7694398,81111,51549,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...
536208,81109,220522,WARN,dfs.DataNode$DataXceiver,10.250.14.38:50010:Got exception while serving...
3596452,81110,132846,INFO,dfs.DataBlockScanner,Verification succeeded for blk_338667424734483...


In [6]:
# Discard rows with any missing field (lines the parser could not match).
# Reassign instead of mutating with inplace=True so the cell's effect is
# visible as an explicit binding.
parsed = parsed.dropna()
parsed.shape


(200000, 5)

In [7]:
# Join the "date" and "time" strings and parse them as yymmddHHMMSS.
# errors="coerce" turns values that do not fit the format into NaT instead of
# raising, so they can be filtered out below.
timestamps = pd.to_datetime(
    parsed["date"] + parsed["time"],
    format="%y%m%d%H%M%S",
    errors="coerce",
)

# Attach the column and drop rows whose timestamp could not be parsed, as one
# non-mutating chain (no inplace=True, no column assignment on a shared frame).
parsed = parsed.assign(timestamp=timestamps).dropna(subset=["timestamp"])
parsed[["timestamp", "level", "component"]].head()


Unnamed: 0,timestamp,level,component
4237366,2008-11-10 21:01:18,INFO,dfs.FSNamesystem
7820301,2008-11-11 05:31:44,INFO,dfs.DataNode$PacketResponder
7694398,2008-11-11 05:15:49,INFO,dfs.FSNamesystem
536208,2008-11-09 22:05:22,WARN,dfs.DataNode$DataXceiver
3596452,2008-11-10 13:28:46,INFO,dfs.DataBlockScanner


In [10]:
# Renumber the rows 0..n-1; the previous index labels are discarded.
parsed = parsed.reset_index(drop=True)

# Persist the parsed sample without writing the index as a column.
parsed.to_csv("../parsed_sample.csv", index=False)

# Keep this as the final expression: a notebook only renders the value of the
# last statement in a cell, so placing it before `to_csv` (which returns None)
# silently discarded the per-level counts.
parsed["level"].value_counts()
