In [None]:
import sys
sys.path.append('../')
import os
import re
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from logparser import Spell, Drain

In [None]:
# Parser configuration: the directory handed to the log parser as its input,
# the directory that receives every generated artefact, and the name of the
# log file to parse.
input_dir = os.path.expanduser('~/.dataset/openstack/')
output_dir = '../output/openstack/'
log_file = "openstack_merged.log"

In [None]:
# Paths of the artefacts used by the later cells.
# - log_structured_file: passed to openstack_sampling() as its input table
#   (presumably written by the log parser -- confirm against logparser).
# - log_templates_file: read by mapping() to rank the templates.
# - log_sequence_file: written by openstack_sampling().
log_structured_file = f"{output_dir}{log_file}_structured.csv"
log_templates_file = f"{output_dir}{log_file}_templates.csv"
log_sequence_file = f"{output_dir}openstack_sequence.csv"

In [None]:
# Inputs for merge_log_files(): the directory holding the source logs, the
# individual logs to concatenate (in this order) and the name of the merged
# output file.
# NOTE: log_directory and merged_log_file repeat the values of input_dir and
# log_file; keep them in sync when changing either.
log_directory = os.path.expanduser('~/.dataset/openstack/')
log_files = [
    "openstack_normal1.log",
    "openstack_abnormal.log",
    "openstack_normal2.log",
]
merged_log_file = "openstack_merged.log"

In [None]:
def merge_log_files(log_directory, log_files, merged_log_file):
    """Concatenate several log files into one file inside ``log_directory``.

    Sources are appended in the order given by ``log_files``. A source that
    does not exist is reported on stdout and skipped. Each source is streamed
    line by line rather than loaded whole, and a newline is appended after a
    source whose last line is unterminated, so that line cannot fuse with the
    first line of the next source.

    Args:
        log_directory: Directory holding the source logs; the merged file is
            written here as well.
        log_files: Iterable of source file names, relative to
            ``log_directory``.
        merged_log_file: File name of the merged output; an existing file of
            that name is overwritten.
    """
    merged_log_path = os.path.join(log_directory, merged_log_file)

    with open(merged_log_path, "w") as merged_log:
        for log_file in log_files:
            log_file_path = os.path.join(log_directory, log_file)
            if not os.path.isfile(log_file_path):
                print(f"Log file '{log_file}' not found.")
                continue
            # Reading the file we are currently writing would never reach EOF.
            if os.path.samefile(log_file_path, merged_log_path):
                print(f"Log file '{log_file}' is the merge target; skipped.")
                continue

            # Start from "\n" so an empty source does not add a blank line.
            last_line = "\n"
            with open(log_file_path, "r") as log:
                for last_line in log:
                    merged_log.write(last_line)
            if not last_line.endswith("\n"):
                merged_log.write("\n")
        print(f"Merged log files into: '{merged_log_file}'")


In [None]:
def mapping():
    """Number the log templates by how often they occur and save the result.

    Reads the template table at ``log_templates_file``, orders its rows by
    ``Occurrences`` (largest first) and numbers the ``EventId`` values from 1
    in that order. The resulting ``{EventId: number}`` dictionary is written
    as JSON to ``output_dir + "openstack_log_templates.json"``.
    """
    templates = pd.read_csv(log_templates_file)
    ranked = templates.sort_values(by=["Occurrences"], ascending=False)

    event_to_rank = {}
    for rank, event_id in enumerate(list(ranked["EventId"]), start=1):
        event_to_rank[event_id] = rank

    mapping_path = output_dir + "openstack_log_templates.json"
    with open(mapping_path, "w") as f:
        json.dump(event_to_rank, f)

In [None]:
def parser(input_dir, output_dir, log_file, log_format, type='drain'):
    """Run the Spell or Drain log parser on ``log_file``.

    Args:
        input_dir: Directory passed to the parser as ``indir``.
        output_dir: Directory passed to the parser as ``outdir``.
        log_file: Name of the log file handed to ``parse``.
        log_format: Header layout string passed to the parser as
            ``log_format``.
        type: Which parser to use, ``'spell'`` or ``'drain'`` (default).

    Raises:
        ValueError: If ``type`` is neither ``'spell'`` nor ``'drain'``.
            Previously such a value made the function return without parsing
            anything.
    """
    if type not in ('spell', 'drain'):
        raise ValueError(
            f"Unknown parser type {type!r}; expected 'spell' or 'drain'."
        )

    # Patterns passed to the parser as `rex` (presumably masked before
    # template extraction -- confirm against the logparser docs).
    regex = [
        r"image [\da-f-]+",  # "image " followed by a run of hex digits/dashes
        r"(/[-\w]+)+"        # slash-separated path-like tokens
    ]

    if type == 'spell':
        tau = 0.5  # Spell's `tau` setting (meaning not shown here; see logparser)
        log_parser = Spell.LogParser(indir=input_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=False)
    else:
        st = 0.5  # Similarity threshold
        depth = 5  # Depth of all leaf nodes
        log_parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=False)

    log_parser.parse(log_file)

In [None]:
def openstack_sampling(log_file, window='session'):
    """Group parsed log events into one event sequence per instance id.

    Reads the structured log CSV ``log_file``, replaces every ``EventId`` by
    its number from ``output_dir + "openstack_log_templates.json"`` (``-1``
    when the id is not in that file), and appends that number to the sequence
    of every instance id found in the row's ``Content`` via the pattern
    ``instance: <id>``. Rows without an instance id contribute nothing. The
    result, one row per instance id with columns ``instance_id`` and
    ``EventSequence``, is written to ``log_sequence_file``.

    Instance ids appear in the output in the order they are first seen in the
    log, so the output is identical from run to run.

    Args:
        log_file: Path of the structured log CSV; must provide the columns
            ``EventId`` and ``Content``.
        window: Only ``'session'`` is supported.

    Raises:
        AssertionError: If ``window`` is not ``'session'``.
    """
    assert window == 'session', "Only window=session is supported for OPENSTACK dataset."
    print("Loading", log_file)
    df = pd.read_csv(
        log_file,
        engine='c',
        na_filter=False,
        memory_map=True,
        dtype={'Date': object, "Time": object},
    )
    with open(output_dir + "openstack_log_templates.json", "r") as f:
        event_num = json.load(f)

    # Plain Python lists are much cheaper to walk than DataFrame.iterrows().
    event_ids = df["EventId"].apply(lambda x: event_num.get(x, -1)).tolist()
    contents = df["Content"].tolist()

    instance_pattern = re.compile(r'instance: ([\w-]+)')
    data_dict = defaultdict(list)
    for event_id, content in tqdm(zip(event_ids, contents), total=len(df)):
        # dict.fromkeys de-duplicates the ids of one row while keeping their
        # first-seen order; a set would make the output row order depend on
        # string hashing, which differs between interpreter runs.
        for instance_id in dict.fromkeys(instance_pattern.findall(content)):
            data_dict[instance_id].append(event_id)

    data_df = pd.DataFrame(list(data_dict.items()), columns=['instance_id', 'EventSequence'])
    data_df.to_csv(log_sequence_file, index=False)
    print("openstack sampling done")

In [None]:
if __name__ == "__main__":
    # Step 1: concatenate the individual logs into one file.
    merge_log_files(log_directory, log_files, merged_log_file)
    # Step 2: parse the merged log with Drain.
    # Raw string: "\[" and "\]" are not valid Python escape sequences, so a
    # plain literal triggers an invalid-escape warning. The string value, with
    # its backslashes, is unchanged.
    log_format = r'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'  # OPENSTACK log format
    parser(input_dir, output_dir, log_file, log_format, 'drain')
    # Step 3: number the templates by frequency.
    mapping()
    # Step 4: build one event sequence per instance id.
    openstack_sampling(log_structured_file)