## Dataset Selection

In [78]:
from data_utils import load_dataset, dataset
from pathlib import Path
import pandas as pd

# Every sub-directory of `logs/` is treated as one candidate project.
logs_dir = Path('logs')
logs_subdir = [x.name for x in logs_dir.iterdir() if x.is_dir()]

# Pair each project name with the number of distinct values in its 'template' column.
result = [ (subdir, len(load_dataset(subdir)['template'].unique())) for subdir in logs_subdir ]

# Sort ascending by template count. `Path.iterdir()` yields entries in
# arbitrary order, so the project name is used as a tie-breaker to keep the
# selection reproducible when two projects have the same count.
result.sort(key=lambda x: (x[1], x[0]))
# Keep the four projects with the fewest templates.
eval_list = [ r[0] for r in result[:4]]
# One frame per selected project, restricted to the two columns used below.
# NOTE: `eval_dataset` is a *list* of frames at this point.
eval_dataset = [ dataset(proj)[['template', 'logs']] for proj in eval_list ]

In [79]:
# Tag each per-project frame with its project name and stack the frames into
# one DataFrame. `.assign` returns a new frame, so the column-subset frames
# are never written to in place (in-place column assignment on a frame
# obtained via `df[[...]]` can raise pandas' SettingWithCopyWarning).
# NOTE: this rebinds `eval_dataset` from a list of frames to a single
# DataFrame, so the list must be rebuilt before this cell is run again.
eval_dataset = pd.concat(
    [frame.assign(dataset=name) for name, frame in zip(eval_list, eval_dataset)]
)
# Size of each row's 'logs' collection, and how many distinct entries it holds.
eval_dataset['len_logs'] = eval_dataset['logs'].apply(len)
eval_dataset['len_uniq'] = eval_dataset['logs'].apply(lambda logs: len(set(logs)))

In [83]:
import json

def result_of(project:str):
    """Load the stored parsing result of one project.

    Args:
        project: Project name; the file read is
            ``results/result_<project>.json`` relative to the working directory.

    Returns:
        The deserialized JSON content of that file.
    """
    result_path = f'results/result_{project}.json'
    with open(result_path, 'r') as handle:
        payload = json.load(handle)
    return payload

# Raw parsing results, one JSON object per selected project (same order as eval_list).
parsed_results = [ result_of(project) for project in eval_list ]

# For every project, one [template, variable names, matched logs] row per template.
# Built with comprehensions so no loop variable leaks into the notebook
# namespace; in particular the name `result`, which already holds the
# (project, template count) list, is no longer overwritten here.
parsed = [
    [
        [template, list(entry['variables'].keys()), entry['matches']]
        for template, entry in project_result.items()
    ]
    for project_result in parsed_results
]

# One frame per project, tagged with the project name, then stacked.
parsed_dfs = pd.concat(
    [
        pd.DataFrame(rows, columns=['template', 'variables', 'matches']).assign(dataset=name)
        for name, rows in zip(eval_list, parsed)
    ]
)
# Number of entries in each template's 'matches' list.
parsed_dfs['len_matches'] = parsed_dfs['matches'].apply(len)
parsed_dfs


Unnamed: 0,template,variables,matches,dataset,len_matches
0,jk2_init() Found child {child_pid} in scoreboa...,"[child_pid, slot_num]",[jk2_init() Found child 2006 in scoreboard slo...,Apache,835
1,jk2_init() Can't find child {child_pid} in sco...,[child_pid],[jk2_init() Can't find child 5054 in scoreboar...,Apache,12
2,[client {ip_address}] Directory index forbidde...,"[ip_address, directory]",[[client 65.68.235.27] Directory index forbidd...,Apache,32
3,mod_jk {worker_id} workerEnv in error state {e...,"[worker_id, error_state]","[mod_jk child workerEnv in error state 10, mod...",Apache,5
4,workerEnv.init() ok {path},[path],[workerEnv.init() ok /etc/httpd/conf/workers2....,Apache,1
5,mod_jk child init {child_id} {return_code},"[child_id, return_code]",[mod_jk child init 1 -2],Apache,1
0,"{host}:{host_port} close, {sent_bytes} bytes (...","[host, host_port, sent_bytes, received_bytes, ...","[play.google.com:443 close, 2130 bytes (2.08 K...",Proxifier,116
1,"{host}:{port} close, {sent_bytes} bytes sent, ...","[host, port, sent_bytes, received_bytes, lifet...","[blog.csdn.net:80 close, 670 bytes sent, 421 b...",Proxifier,216
2,"{host}:{host_port} close, {sent_bytes} bytes (...","[host, host_port, sent_bytes, sent_kb, receive...","[clients4.google.com:443 close, 11736 bytes (1...",Proxifier,298
3,{host}:{host_port} open through proxy {proxy}:...,"[host, host_port, proxy, proxy_port, protocol]",[csi.gstatic.com:443 open through proxy proxy....,Proxifier,215


In [84]:
# Persist both tables as CSV files under results/, without the index column.
for frame, csv_path in (
    (eval_dataset, 'results/eval_dataset.csv'),
    (parsed_dfs, 'results/parsed_dfs.csv'),
):
    frame.to_csv(csv_path, index=False)

In [87]:
# Per-project totals for the reference data: summed log counts, summed
# unique-log counts and the number of template rows.
dataset_summary = eval_dataset.groupby('dataset').agg(
    {'len_logs': 'sum', 'len_uniq': 'sum', 'template': 'count'}
)
print(dataset_summary.reset_index())

# Same view for the parsed output: summed match counts and number of template rows.
parsed_summary = parsed_dfs.groupby('dataset').agg(
    {'len_matches': 'sum', 'template': 'count'}
)
print(parsed_summary.reset_index())

     dataset  len_logs  len_uniq  template
0     Apache      2000       886         6
1       HDFS      2000      2000        14
2    OpenSSH      2000       729        26
3  Proxifier      2000      1056         8
     dataset  len_matches  template
0     Apache          886         6
1       HDFS         2204        17
2    OpenSSH          730        22
3  Proxifier         1305        10


In [88]:
from LoGPT import replace_variable
# Derive the 'stared' column by passing every parsed template through
# replace_variable (in the recorded output the {name} placeholders appear as <*>).
parsed_dfs['stared'] = parsed_dfs['template'].apply(replace_variable)
parsed_dfs['stared']

0     jk2_init() Found child <*> in scoreboard slot <*>
1         jk2_init() Can't find child <*> in scoreboard
2     [client <*>] Directory index forbidden by rule...
3               mod_jk <*> workerEnv in error state <*>
4                               workerEnv.init() ok <*>
5                             mod_jk child init <*> <*>
0     <*>:<*> close, <*> bytes (<*> KB) sent, <*> by...
1     <*>:<*> close, <*> bytes sent, <*> bytes recei...
2     <*>:<*> close, <*> bytes (<*>) sent, <*> bytes...
3                <*>:<*> open through proxy <*>:<*> <*>
4     <*>:<*> error : Could not connect to proxy <*>...
5     <*>:<*> error : Could not connect through prox...
6     <*>:<*> error : Could not connect to proxy <*>...
7     <*>:<*> close, <*> bytes sent, <*> bytes recei...
8     <*>:<*> close, <*> bytes sent, <*> bytes (<*> ...
9                                   <*>:<*> error : <*>
0                     Received <*> of size <*> from <*>
1                Receiving block <*> src: <*> de

In [92]:
# Index into eval_list of the project to inspect.
i = 0
selected_project = eval_list[i]
# Reference templates first, then the parsed templates in '<*>' form, for the same project.
print(eval_dataset.loc[eval_dataset['dataset'] == selected_project, 'template'])
print(parsed_dfs.loc[parsed_dfs['dataset'] == selected_project, 'stared'])

0    [client <*>] Directory index forbidden by rule...
1        jk2_init() Can't find child <*> in scoreboard
2    jk2_init() Found child <*> in scoreboard slot <*>
3                            mod_jk child init <*> <*>
4            mod_jk child workerEnv in error state <*>
5                              workerEnv.init() ok <*>
Name: template, dtype: object
0    jk2_init() Found child <*> in scoreboard slot <*>
1        jk2_init() Can't find child <*> in scoreboard
2    [client <*>] Directory index forbidden by rule...
3              mod_jk <*> workerEnv in error state <*>
4                              workerEnv.init() ok <*>
5                            mod_jk child init <*> <*>
Name: stared, dtype: object


## Notable Results

In [28]:
# Notable results
from LoGPT import LoGPT

def beautiful_print(texts):
    """Build a banner printer whose width matches the longest line of ``texts``.

    Args:
        texts: A string, possibly spanning several lines. It is split on
            newline characters and the longest resulting line sets the
            banner width.

    Returns:
        A function ``execution(text)`` that prints ``text`` centred in a
        field of that width, padded on both sides with ``-``. If ``text`` is
        longer than the width it is printed unpadded. The function returns
        ``None``.
    """
    # The width depends only on `texts`, so compute it once here instead of
    # on every call of the returned printer.
    max_len = max(len(line) for line in texts.split('\n'))

    def execution(text):
        # Format directly; there is no need to exec() a source string for this.
        print(f'{text:-^{max_len}}')
    return execution

def logpt_example(log, temparature=0.0):
    """Run one log line through LoGPT and print the three stages under banners.

    Args:
        log: The log message (a string). It is wrapped in single quotes
            before being handed to ``LoGPT.llm_run``.
        temparature: Forwarded unchanged to the ``LoGPT`` constructor under
            the same keyword (presumably the LLM sampling temperature;
            confirm in LoGPT).

    Prints, in order, the input log, the raw ``llm_run`` output and the
    ``'template'`` entry of ``output_parse``'s result, each preceded by a
    titled banner as wide as the longest printed line. Returns ``None``.
    """
    parser = LoGPT(temparature=temparature)
    raw_answer = parser.llm_run(f"'{log}'")
    template = parser.output_parse(raw_answer)['template']

    # Size the banners to the longest line among everything printed below.
    banner = beautiful_print('\n'.join([raw_answer, template, log]))

    sections = (
        ('INPUT LOG', log),
        ('LLM OUTPUT', raw_answer),
        ('PARSED TEMPLATE', template),
    )
    for title, body in sections:
        banner(title)
        print(body)

In [26]:
examples = [
    # HDFS
    "BLOCK* ask 10.251.126.5:50010 to delete  blk_-9016567407076718172 blk_-8695715290502978219 blk_-7168328752988473716 blk_-4355192005224403537 blk_-3757501769775889193 blk_-154600013573668394 blk_167132135416677587 blk_2654596473569751784 blk_5202581916713319258",
    "BLOCK* ask 10.250.17.177:50010 to delete  blk_-8570780307468499817 blk_-9122557405432088649 blk_-4393063808227796056 blk_8767569714374844347 blk_7079754042611867581 blk_7608961006114219538 blk_-5017273584996436939 blk_-6537833125980536955 blk_7610838808763810123 blk_3300803097775546532 blk_-5120750586032922592 blk_1577274266662884430 blk_765879159867598347 blk_-9076085976403711202 blk_-3198963348573340497 blk_-4645750029177277209 blk_-5136142986912961316 blk_5677959846373741243 blk_2107477892986152528 blk_-4235116161537008844 blk_6082535783543982566 blk_-4809870147222033236 blk_8818706925296961012 blk_-5203577173046267127 blk_189089569009261656 blk_446299976487589160 blk_-3916247521166632303 blk_-3324962406687427922 blk_-1807424528783081572 blk_-6858401049333055963 blk_6036564204960295926 blk_-8140723044408248078 blk_-3800132731140204959 blk_1716344083117307767 blk_-5194808114606613364 blk_-5473871016976323232 blk_2920934363167004552 blk_8736689095894369097 blk_-7642734632751940776 blk_3408482260833769309 blk_118013751374560901 blk_7963891081239759520 blk_3813114133944383323 blk_3042818489384932576 blk_-4570173726231458270 blk_-1564644006975920581 blk_338095650783321996 blk_3150135312641203550 blk_4285859645577726288 blk_3438772130782939627 blk_2634772258588877972 blk_-6795664812575964130 blk_3923069610304693233 blk_-1782996202120067721 blk_2004418049430157212 blk_1932147224007687756 blk_-582901062969027153 blk_5072240701440032119 blk_-7919006477393039068 blk_-7318022361288598312 blk_-6974693594143537436 blk_-5435767047126325206 blk_-5805500288959332434 blk_-7109885589081848850 blk_2161580591957523893 blk_7240227881194993860 blk_-8298405680648445349 blk_-4253026248821272215 blk_8377661448601579317 blk_8029153852899017155 blk_-8754388319080705916 blk_-7844092300527332901 blk_710178463364063355 blk_-5136849989188547884 blk_8393887138377503163 blk_-6950176077776664217 blk_-6488701068659548195 blk_2537458728254532453 blk_364441107933628577 blk_6207861897580168557 
blk_8814943807366894581 blk_-4150682644311695471 blk_9174833667156726933 blk_649427218152856001 blk_-7403541028238011236 blk_-334982586592048773 blk_61908781908925992 blk_6385574357371832424 blk_-66376131060945541 blk_1372596948297458670 blk_-3389135155401857220 blk_-6035411221441929663 blk_-5127580069634421247 blk_-5685246533892022418 blk_4977937528993040451 blk_5680538862600094527 blk_-8378747462487962732 blk_425101290285860876 blk_6306622708327890839 blk_-1067866602168873257",
    # Proxifier
    "upload.3367.com:80 close, 2331 bytes (2.27 KB) sent, 440133 bytes (429 KB) received, lifetime 01:01",
    "play.google.com:443 close, 2130 bytes (2.08 KB) sent, 1009 bytes received, lifetime 03:24",
    # OpenSSH
    "PAM 1 more authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=185.190.58.151",
]
# Run the demo on each hand-picked log line, in list order (default temperature).
for example_log in examples:
    logpt_example(example_log)

-----------------------------------------------------------------------------------------------------------------------------INPUT LOG------------------------------------------------------------------------------------------------------------------------------
BLOCK* ask 10.251.126.5:50010 to delete  blk_-9016567407076718172 blk_-8695715290502978219 blk_-7168328752988473716 blk_-4355192005224403537 blk_-3757501769775889193 blk_-154600013573668394 blk_167132135416677587 blk_2654596473569751784 blk_5202581916713319258
-----------------------------------------------------------------------------------------------------------------------------LLM OUTPUT-----------------------------------------------------------------------------------------------------------------------------
block_ids = 'blk_-9016567407076718172 blk_-8695715290502978219 blk_-7168328752988473716 blk_-4355192005224403537 blk_-3757501769775889193 blk_-154600013573668394 blk_167132135416677587 blk_2654596473569751784 blk_5202

In [31]:
about_space = "message repeated 5 times: [ Failed password for root from 5.36.59.76 port 42393 ssh2]"
# Parse the same log at two temperature settings, lowest first.
for sampling_temp in (0.0, 0.8):
    logpt_example(about_space, temparature=sampling_temp)

--------------------------------------INPUT LOG--------------------------------------
message repeated 5 times: [ Failed password for root from 5.36.59.76 port 42393 ssh2]
-------------------------------------LLM OUTPUT--------------------------------------
count = 5
message = 'Failed password for root from 5.36.59.76 port 42393 ssh2'
template = f'message repeated {count} times: [ {message} ]'
-----------------------------------PARSED TEMPLATE-----------------------------------
message repeated {count} times: [ {message} ]
--------------------------------------INPUT LOG--------------------------------------
message repeated 5 times: [ Failed password for root from 5.36.59.76 port 42393 ssh2]
-------------------------------------LLM OUTPUT--------------------------------------
count = 5
message = 'Failed password for root from 5.36.59.76 port 42393 ssh2'
template = f'message repeated {count} times: [ {message} ]'
-----------------------------------PARSED TEMPLATE------------------------

In [36]:
# Repeat the temperature-0.8 run on the same log. In the recorded outputs this
# run yields a different, more fine-grained template than the earlier 0.8 run.
logpt_example(about_space, temparature=0.8)

-----------------------------------------------------INPUT LOG------------------------------------------------------
message repeated 5 times: [ Failed password for root from 5.36.59.76 port 42393 ssh2]
-----------------------------------------------------LLM OUTPUT-----------------------------------------------------
count = 5
user = 'root'
ip_address = '5.36.59.76'
port = '42393'
protocol = 'ssh2'
template = f'message repeated {count} times: [ Failed password for {user} from {ip_address} port {port} {protocol}]'
--------------------------------------------------PARSED TEMPLATE---------------------------------------------------
message repeated {count} times: [ Failed password for {user} from {ip_address} port {port} {protocol}]
