In [2]:
import sys
sys.path.append('../')
import os
import re
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from logparser import Spell, Drain

In [3]:
# Working folders live under the current user's home directory; "~" is
# expanded once here so every later cell sees absolute paths.
input_dir, output_dir = (
    os.path.expanduser(folder)
    for folder in ('~/logbert_OPENSTACK/input', '~/logbert_OPENSTACK/output')
)
# Name of the merged log that the parsing step consumes.
log_file = "openstack_merged.log"
print(input_dir, output_dir, sep="\n")

/root/logbert_OPENSTACK/input
/root/logbert_OPENSTACK/output


In [4]:
# Files derived from the merged log: the two parser outputs share the merged
# log's name as a prefix, the per-instance sequence file has a fixed name.
log_structured_file, log_templates_file = (
    os.path.join(output_dir, log_file + suffix)
    for suffix in ("_structured.csv", "_templates.csv")
)
log_sequence_file = os.path.join(output_dir, "openstack_sequence.csv")

In [5]:
# Inputs for the merge step. The raw logs are concatenated in list order
# into `merged_log_file`.
merged_log_file = "openstack_merged.log"
log_files = [
    "openstack_normal1.log",
    "openstack_abnormal.log",
    "openstack_normal2.log",
]
log_directory = input_dir

In [6]:
def merge_log_files(input_dir,output_dir, log_files, merged_log_file):
    """Concatenate several log files into one file.

    Every name in ``log_files`` is resolved against ``input_dir`` and, when it
    is an existing regular file, appended to ``<output_dir>/<merged_log_file>``
    in the order given. Names that do not resolve to a file are reported on
    stdout and skipped. The destination is opened in ``"w"`` mode, so previous
    content is overwritten.

    Args:
        input_dir: Directory holding the source log files.
        output_dir: Directory that receives the merged file; created if it
            does not exist yet.
        log_files: Iterable of file names (relative to ``input_dir``).
        merged_log_file: File name of the merged log inside ``output_dir``.
    """
    # A fresh checkout may not have the output folder yet; an empty string
    # means "current directory" and must not be passed to makedirs.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    merged_log_path = os.path.join(output_dir, merged_log_file)

    with open(merged_log_path, "w") as merged_log:
        for log_file in log_files:
            log_file_path = os.path.join(input_dir, log_file)
            if not os.path.isfile(log_file_path):
                print(f"Log file '{log_file}' not found.")
                continue

            # Stream line by line so large logs are never held in memory.
            last_line = "\n"
            with open(log_file_path, "r") as log:
                for last_line in log:
                    merged_log.write(last_line)

            # Without this, a source that lacks a trailing newline would have
            # its final record glued to the first record of the next file.
            if not last_line.endswith("\n"):
                merged_log.write("\n")

    print(f"Merged log files into: '{merged_log_file}'")

In [7]:
        

def mapping(output_dir, templates_file=None):
    """Assign an integer id to every log template, most frequent first.

    Reads the templates CSV (columns ``EventId`` and ``Occurrences`` are
    used), ranks the rows by ``Occurrences`` in descending order and writes
    ``{EventId: rank}`` (ranks start at 1) to
    ``<output_dir>/openstack_log_templates.json``.

    Args:
        output_dir: Directory that receives ``openstack_log_templates.json``.
        templates_file: Path of the templates CSV. When omitted, the
            module-level ``log_templates_file`` is used, which is what the
            function always read before this parameter existed.
    """
    if templates_file is None:
        templates_file = log_templates_file

    log_temp = pd.read_csv(templates_file)
    # A stable sort keeps templates with equal counts in file order, so the
    # generated ids are reproducible from one run to the next.
    ranked = log_temp.sort_values(by=["Occurrences"], ascending=False, kind="mergesort")
    log_temp_dict = {event: rank for rank, event in enumerate(ranked["EventId"], start=1)}

    with open(os.path.join(output_dir, "openstack_log_templates.json"), "w") as f:
        json.dump(log_temp_dict, f)

In [8]:
        
        
def parser(input_dir, output_dir, log_file, log_format, type='drain'):
    """Parse a raw log file with the Spell or Drain log parser.

    The log is read from ``output_dir`` (where the merged log is written) and
    the parser's results are written back to ``output_dir``.

    Args:
        input_dir: Accepted for signature compatibility; not used, because
            both ``indir`` and ``outdir`` of the parser point at
            ``output_dir``.
        output_dir: Directory containing ``log_file`` and receiving the
            parser output.
        log_file: Name of the log file to parse.
        log_format: Header layout string handed to the log parser.
        type: ``'drain'`` (default) or ``'spell'``.

    Raises:
        ValueError: If ``type`` is neither ``'spell'`` nor ``'drain'``.
    """
    # Fail loudly on a typo: previously an unknown value parsed nothing and
    # the error only surfaced later as a missing structured CSV.
    if type not in ('spell', 'drain'):
        raise ValueError(f"Unsupported parser type {type!r}; expected 'spell' or 'drain'.")

    # Variable parts masked before template extraction (same for both parsers).
    regex = [
        r"image [\da-f-]+",
        r"(/[-\w]+)+"
    ]

    if type == 'spell':
        tau = 0.5
        log_parser = Spell.LogParser(indir=output_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=False)
    else:
        st = 0.5  # Similarity threshold
        depth = 5  # Depth of all leaf nodes
        log_parser = Drain.LogParser(log_format, indir=output_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=False)

    log_parser.parse(log_file)

In [9]:
        
        
def openstack_sampling(log_file, window='session'):
    """Group parsed log events into one event-id sequence per instance.

    Loads the structured CSV at ``log_file``, replaces each ``EventId`` with
    its integer id from ``<output_dir>/openstack_log_templates.json`` (``-1``
    when the id is not in the mapping), and appends that integer to the
    sequence of every instance id found in the row's ``Content`` via the
    pattern ``instance: <id>``. The result, with columns ``instance_id`` and
    ``EventSequence``, is written to the module-level ``log_sequence_file``.

    Args:
        log_file: Path of the structured CSV produced by the parsing step.
        window: Only ``'session'`` is supported.

    Raises:
        AssertionError: If ``window`` is not ``'session'``.
    """
    assert window == 'session', "Only window=session is supported for OPENSTACK dataset."
    print("Loading", log_file)
    df = pd.read_csv(log_file, engine='c',
                     na_filter=False, memory_map=True, dtype={'Date': object, "Time": object})
    with open(os.path.join(output_dir, "openstack_log_templates.json"), "r") as f:
        event_num = json.load(f)
    df["EventId"] = df["EventId"].apply(lambda x: event_num.get(x, -1))

    # Compile once; the pattern is applied to every row.
    instance_pattern = re.compile(r'instance: ([\w-]+)')
    data_dict = defaultdict(list)
    # Walk only the two columns that are needed instead of building a Series
    # per row with iterrows().
    for content, event_id in tqdm(zip(df["Content"], df["EventId"]), total=len(df)):
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the order of rows in the output does not depend on set/hash order.
        for instance_id in dict.fromkeys(instance_pattern.findall(content)):
            data_dict[instance_id].append(event_id)

    data_df = pd.DataFrame(list(data_dict.items()), columns=['instance_id', 'EventSequence'])
    data_df.to_csv(log_sequence_file, index=None)
    print("openstack sampling done")

In [10]:
    
if __name__ == "__main__":
    # Pipeline: merge raw logs -> parse into templates -> number the
    # templates -> build one event sequence per instance.
    merge_log_files(input_dir,output_dir, log_files, merged_log_file)
    # OPENSTACK log
    # Raw string: "\[" is a regex escape for the parser, not a Python escape;
    # in a normal string literal it is an invalid escape sequence.
    log_format = r'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'  # OPENSTACK log format
    parser(input_dir, output_dir, log_file, log_format, 'drain')
    mapping(output_dir)
    openstack_sampling(log_structured_file)

Merged log files into: 'openstack_merged.log'
Parsing file: /root/logbert_OPENSTACK/output/openstack_merged.log
Total size after encoding is 207636 207820
Parsing done. [Time taken: 0:00:39.739674]
Loading /root/logbert_OPENSTACK/output/openstack_merged.log_structured.csv


207636it [00:07, 26007.78it/s]


openstack sampling done
