In [312]:
from datasets import load_dataset, load_from_disk
import wordninja
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"
import re
import glob
import numpy as np
import pandas as pd
import string

from tqdm.auto import tqdm

import datasets
import transformers
from transformers import AutoTokenizer, default_data_collator, get_scheduler
from hf_transformers.src.transformers.models.bert.configuration_bert import BertConfig
from hf_transformers.src.transformers.models.bert.modeling_bert import BertModel

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from utils import preprocess_log_batch_hof

import sys
import math

import collections


from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


import openai

import fasttext

from rouge import Rouge 

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# real key never has to be written into (or committed with) the notebook.
# The original placeholder is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'YOUR OPENAI API KEY')

In [2]:
# WARNING
# Always point the cache directory at a folder on the mounted volume.
# Otherwise the root filesystem (/) of the lab server that manages the Docker
# images fills up, and once / is over capacity nothing works any more, which
# causes trouble for everyone else using the server.
#
# `!export ...` only sets the variable inside a throwaway subshell, so the
# kernel process never sees it (the load_dataset output further down still
# reports /root/.cache/huggingface/datasets). Set it in the kernel's own
# environment instead.
# NOTE(review): `datasets` appears to read this variable when it is first
# imported -- confirm, and run this before `import datasets` (or pass
# `cache_dir=` to `load_dataset`).
os.environ["HF_DATASETS_CACHE"] = "/home/"

In [43]:
# anomaly_indexes = df_hdfs[df_hdfs["Label"]=="Anomaly"].index
# window_size = 3
# anomaly_logs = {
#     "logs":[],
#     "BlockId":[]
# }
# i=0
# for idx in anomaly_indexes:
#     blockId = df_hdfs.iloc[idx, :]["BlockId"]
#     i+=1
#     anomaly_logs["logs"].append(" ".join(list(df_hdfs.iloc[idx-window_size:idx+window_size, :]["log"])))
#     anomaly_logs["BlockId"].append(blockId)
    
# df_anomaly_logs = pd.DataFrame(anomaly_logs)
# df_anomaly_logs = df_anomaly_logs[df_anomaly_logs["logs"] != ""]

# def get_blockId(line):
#     match = re.search(r"blk_[-]*[0-9]+", line)
#     if match is not None:
#         return match.group(0)
#     return ""

In [316]:
# Load the BERT model that was pre-trained with our UniLog re-implementation
# (weights are read from a local directory).
unilog_bert_model = BertModel.from_pretrained("./logdata/unilog_pretrain_mask15_without_numbers_2/")

# Tokenizer trained as part of the UniLog re-implementation
# (also read from a local directory).
log_tokenizer_w_n = AutoTokenizer.from_pretrained("./tokenizers/log_tokenizer_from_old_without_numbers/")

# Encode a log into a vector with the BERT model trained in the UniLog style
def vectorize_using_unilog(text, unilog_model, tokenizer):
    """Embed ``text`` as the mean of the model's last hidden states.

    Args:
        text: the log text to encode.
        unilog_model: callable taking the tokenizer output as keyword
            arguments and returning a mapping with a "last_hidden_state"
            tensor.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors="pt")``.

    Returns:
        NumPy array holding the hidden states averaged over dim 1, with
        size-1 dimensions squeezed away.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = unilog_model(**encoded)["last_hidden_state"]
    pooled = torch.mean(hidden_states, dim=1)
    return pooled.squeeze().numpy()

# Vectorize a log with fastText-style word vectors
def vectorize_log(target_model, dim=100):
    """Build a function that embeds a log line as the mean of its word vectors.

    Args:
        target_model: object indexable by word (``target_model[word]``) that
            yields a numeric vector of length ``dim``.
        dim: length of the word vectors. Defaults to 100, the value that was
            previously hard-coded here.

    Returns:
        ``vectorize_log_helper(input_log)``: splits ``input_log`` on
        whitespace and returns the average word vector as a float array of
        shape ``(dim,)``. A log with no words yields the zero vector instead
        of dividing by zero (which produced an all-NaN vector).
    """
    def vectorize_log_helper(input_log):
        words = input_log.split()
        v = np.zeros(dim, dtype=float)
        if not words:
            # Nothing to average.
            return v

        for w in words:
            v += target_model[w]
        return v / len(words)
    return vectorize_log_helper

# Load the data

In [314]:
# Read the log/summary pairs, drop the CSV's leftover index column, rename
# "input" to "log", and run the batched preprocessing from utils.
# Only the "train" split is used; `.train_test_split(0.2)` stays disabled.
dataset_summary = (
    load_dataset("csv", data_files="./logdata/log_summary_pairs.csv")
    .remove_columns(['Unnamed: 0'])["train"]
    .rename_column("input", "log")
    .map(preprocess_log_batch_hof(), batched=True)
)

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-00dcddb0e73b1160/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 1/1 [00:00<00:00, 513.38it/s]
                                                                   

In [317]:
# Convert the dataset to a DataFrame and attach one UniLog embedding per row,
# computed from the "text" column.
df_summary = dataset_summary.to_pandas()


def _encode_text_with_unilog(text):
    """Embed one text with the notebook's UniLog BERT model and tokenizer."""
    return vectorize_using_unilog(text, unilog_bert_model, log_tokenizer_w_n)


df_summary["unilog_vectors"] = df_summary["text"].map(_encode_text_with_unilog)


In [401]:
# Attach one averaged word-vector embedding per row, computed from the raw
# "log" column.
# NOTE(review): `fasttext_model` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- load the model in an
# earlier cell before running this.
df_summary["fasttext_vectors"] = df_summary["log"].map(vectorize_log(fasttext_model))

In [None]:
# Preview only the first rows: the frame carries per-row embedding columns,
# so rendering all of it floods the notebook output.
df_summary.head()

In [123]:


#ref: https://necromuralist.github.io/Neurotic-Networking/posts/nlp/machine-translation-k-nearest-neighbors/index.html

def cosine_similarity(vector_1: np.ndarray, vector_2: np.ndarray) -> float:
    """Calculates the cosine similarity between two vectors

    Args:
     vector_1: array to compare
     vector_2: array to compare to vector_1

    Returns:
     cosine similarity between the two vectors; 0.0 when either vector has
     zero norm (the ratio is undefined there and previously came out as NaN)
    """
    denominator = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    if denominator == 0:
        # A zero vector has no direction; treat it as "not similar" so that
        # NaN does not leak into downstream sorting.
        return 0.0
    return np.dot(vector_1, vector_2) / denominator

# Find the k nearest neighbours of a vectorized log among the candidate vectors.
# The paper (https://arxiv.org/abs/2305.15778) uses Euclidean distance and also
# takes temporal order into account; for simplicity plain cosine similarity is
# used here.
def nearest_neighbor(v, candidates, k=1):
    """
    Input:
      - v, the vector you are going find the nearest neighbor for
      - candidates: a sequence of vectors (list, 2-D array, or object array of
        1-D arrays) where we will find the neighbors
      - k: top k nearest neighbors to find
    Output:
      - k_idx: the indices of the k candidates most similar to v, ordered by
        ascending cosine similarity (the most similar one is last). Empty when
        k <= 0 or there are no candidates; all indices when k exceeds the
        number of candidates.
    """
    rows = [np.asarray(row, dtype=float) for row in candidates]
    if k <= 0 or not rows:
        # `[-0:]` would otherwise select every index.
        return np.array([], dtype=np.intp)

    matrix = np.stack(rows)
    query = np.asarray(v, dtype=float)

    dots = matrix @ query
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

    # Zero-norm vectors have no defined cosine similarity. argsort puts NaN
    # last, which would rank them as the *most* similar, so give them -inf.
    similarities = np.full(len(rows), -np.inf)
    np.divide(dots, norms, out=similarities, where=norms > 0)
    similarities[np.isnan(similarities)] = -np.inf

    return np.argsort(similarities, kind="stable")[-k:]



In [398]:
# Feed a prompt to GPT and return what it generates
def ask_GPT(input_log, make_prompt, max_tokens=64):
    """Build a prompt for ``input_log`` and return GPT's completion for it.

    Args:
        input_log: the log passed to ``make_prompt``.
        make_prompt: callable turning ``input_log`` into the prompt.
        max_tokens: forwarded to ``GPT`` as ``max_tokens``.

    Returns:
        Whatever ``GPT`` returns for the generated prompt.
    """
    return GPT(make_prompt(input_log), max_tokens=max_tokens)
    
# Prompt generation (the most basic variant: plain few-shot)
def make_prompt_1(input_log, num_samples=3):
    """Build a few-shot summarization prompt from random rows of ``df_summary``.

    Args:
        input_log: the log to be summarized; placed as the final question.
        num_samples: number of (log, summary) example pairs sampled from
            ``df_summary``.

    Returns:
        The instruction line, followed by numbered Q/A examples, followed by
        the final unanswered question holding ``input_log``.
    """
    instruction = """Could you briefly summarize a log message like the following examples? Note: Please do NOT include any numbers and special characters in your answer."""
    shots = df_summary.sample(num_samples)

    pieces = [instruction]
    shot_no = 0  # stays 0 when no examples are sampled
    for shot_no, (log, summary) in enumerate(zip(shots["log"], shots["summary"]), start=1):
        pieces.append(f"\nQ{shot_no}: {log}\nA{shot_no}: {summary}\n")

    pieces.append(f"\nQ{shot_no + 1}:{input_log}\n")
    return "".join(pieces)

# Corresponds to section 4.2.2 of the paper (https://arxiv.org/abs/2305.15778):
# first get a short summary of the log contents
def make_summarization_prompt(input_log):
    """Return the prompt asking for a short summary of ``input_log``.

    The log comes first, followed by the summarization instruction; the
    surrounding newlines and indentation are part of the returned string.
    """
    instruction = (
        "Please summarize the above input. "
        "Please note that the above input is incident diagnostic information. "
        "The summary results should be about 120 words, no more than 140 words, "
        "and should cover important information as much as possible. "
        "Just return the summary without any additional output."
    )
    indent = "\n        "
    return indent + f"{input_log}" + indent + instruction + "\n    "

# Get the model's answer to a prompt (user_query)
def GPT(user_query, model_engine="text-davinci-003", max_tokens=64):
    '''
    Send ``user_query`` to ``openai.Completion.create`` and return the text of
    the first (and only requested) choice.

    Args:
        user_query: the prompt.
        model_engine: passed as ``engine``.
        max_tokens: passed as ``max_tokens``.

    A single completion (n=1) is requested at temperature 1.0.
    '''
    request = {
        "engine": model_engine,
        "prompt": user_query,
        "max_tokens": max_tokens,
        "n": 1,
        "temperature": 1.0,
    }
    completion = openai.Completion.create(**request)
    return completion.choices[0].text

# Corresponds to section 4.2.4 of the paper (https://arxiv.org/abs/2305.15778):
# build the multiple-choice prompt that is sent to GPT.
def make_prompt_ms(model, log_vectors, is_unilog=False, tokenizer=None, verbose=False):
    """Create a prompt builder that offers nearest-neighbour summaries as options.

    Args:
        model: embedding model. With ``is_unilog=True`` it is passed to
            ``vectorize_using_unilog``; otherwise to ``vectorize_log``.
        log_vectors: candidate vectors searched by ``nearest_neighbor``. The
            returned indices are used positionally on ``df_summary``.
        is_unilog: selects the UniLog BERT embedding path.
        tokenizer: tokenizer for the UniLog path; required when
            ``is_unilog`` is True.
        verbose: print the finished prompt when True.

    Returns:
        ``make_prompt_ms_helper(input_log, num_samples=5)`` which returns the
        prompt string.

    Raises:
        ValueError: ``is_unilog`` is True but no tokenizer was given.
            Previously this silently fell through to the word-vector path,
            and only after a GPT call had already been made.
    """
    if is_unilog and tokenizer is None:
        raise ValueError("tokenizer is required when is_unilog=True")

    def make_prompt_ms_helper(input_log, num_samples=5):
        # Paper section 4.2.2: obtain a short summary of the incoming log first.
        input_summary = ask_GPT(input_log, make_summarization_prompt, max_tokens=128)

        prompt = f"""Context: The following description shows the error
            log information of an incident. Please select the
            incident information that is most likely to have the
            same root cause and give your explanation (just
            give one answer). 
            Input: {input_summary}
            Options:
    """

        # Disabled part of the prompt:
        #   If not, please select the first item “Unseen incident”.
        #   A: Unseen incident.

        # Paper section 4.2.3: look up logs close to the current one and offer
        # their summaries as the options.
        # NOTE(review): the query is embedded from `input_log` as passed in,
        # while the "unilog_vectors" candidates were computed from the
        # preprocessed "text" column -- confirm the two are comparable.
        if is_unilog:
            input_vec = vectorize_using_unilog(input_log, model, tokenizer)
        else:
            input_vec = vectorize_log(model)(input_log)

        neighbors = nearest_neighbor(input_vec, log_vectors, k=num_samples)
        samples = df_summary.iloc[neighbors, :]

        # dict.fromkeys drops duplicate summaries while keeping first-seen
        # order; each remaining summary gets the next letter A, B, C, ...
        unique_summaries = dict.fromkeys(samples["summary"])
        for i, summary in enumerate(unique_summaries):
            prompt += f"{string.ascii_uppercase[i]}: {summary}\n"

        if verbose:
            print("-"*40, "Prompt", "-"*40)
            print(prompt)
            print("-"*100)

        return prompt
    return make_prompt_ms_helper

In [395]:
# Compute ROUGE scores.
# GPT's output contains a lot of extra text, so a regular expression pulls out
# just the answer part; if nothing matches, the raw answer is compared as is.
def calculate_rouge(df_result):
    """
    Calculate ROUGE-1 scores between the ground-truth summaries and the
    GPT-generated answers, print the averages and return them.

    Args:
        df_result: frame with an "answers" column (raw GPT output) and a
            "summary" column (reference summary).

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f_score)`` averaged over the
        rows of ``df_result``.

    Raises:
        ValueError: ``df_result`` has no rows, so there is nothing to average.
    """
    n_rows = len(df_result)
    if n_rows == 0:
        raise ValueError("df_result is empty; cannot average ROUGE scores")

    rouge = Rouge()
    recalls = []
    precisions = []
    f_scores = []
    for ans, summary in zip(df_result["answers"], df_result["summary"]):
        # Preferred form: "Answer: X: <text>" up to a ';' or newline.
        m = re.search(r"(?<=Answer: [A-Z]:).*(?=[;\n])", ans)
        if m is None:
            # Fallback form: "X: <text>" up to a newline.
            m = re.search(r"(?<=[A-Z]:).*(?=\n)", ans)

        extracted = m.group(0) if m is not None else ""
        # `.*` can match nothing; a blank extraction carries no content to
        # score, so fall back to the raw answer in that case as well.
        hypothesis = extracted if extracted.strip() else ans

        rouge_1 = rouge.get_scores(hypothesis, summary)[0]["rouge-1"]
        recalls.append(rouge_1["r"])
        precisions.append(rouge_1["p"])
        f_scores.append(rouge_1["f"])

    avg_precision = sum(precisions)/n_rows
    avg_recall = sum(recalls)/n_rows
    avg_f_score = sum(f_scores)/n_rows
    print(f"Average precision {avg_precision}, recall {avg_recall}, f1 {avg_f_score}")
    return avg_precision, avg_recall, avg_f_score

In [325]:
def show_current_log(idx):
    """Print every non-vector column of the ``df_summary`` row at position ``idx``.

    Columns whose name contains "vector" are skipped.
    """
    row = df_summary.iloc[idx, :]
    for column in df_summary.columns:
        if "vector" in column:
            continue
        print(f"{column}: ", row[column])

In [408]:
# Collect the per-row UniLog embeddings into a NumPy array; it is used below
# as the candidate set for the nearest-neighbour search.
curr_log_vectors = np.array(df_summary["unilog_vectors"])
# Quick check: shape of a single embedding.
curr_log_vectors[0].shape

(128,)

In [409]:
# Selects the embedding backend for the single-sample demo cell below:
# True takes the fasttext_model branch, False the UniLog BERT branch.
IS_FASTTEXT = False

In [410]:
# Pick one random log, show it, and ask GPT for its answer using the
# nearest-neighbour multiple-choice prompt.
idx = np.random.randint(len(df_summary))
show_current_log(idx)
input_log = df_summary.iloc[idx, :]["log"]
sample_ans = df_summary.iloc[idx, :]["summary"]
if IS_FASTTEXT:
    # The candidates must live in the same embedding space as the query:
    # use the fastText vectors here, not the UniLog ones held in
    # `curr_log_vectors` (those have a different dimensionality).
    fasttext_log_vectors = np.array(df_summary["fasttext_vectors"])
    build_prompt = make_prompt_ms(fasttext_model, fasttext_log_vectors, verbose=True)
else:
    build_prompt = make_prompt_ms(unilog_bert_model, curr_log_vectors, is_unilog=True, tokenizer=log_tokenizer_w_n, verbose=True)
ans = ask_GPT(input_log, build_prompt)
print(ans)

log:  RAS KERNEL INFO ciod : generated 128 core files for program /g/g90/glosli/src/ddcMD/ddcMD1.1.18a/bin/ddcMDbglV
summary:  generated core files for program;
text:  ras kernel info cio d generated core files for program g g g los li s rc dd cm d dd cm d a bin dd cm d bg lv
---------------------------------------- Prompt ----------------------------------------
Context: The following description shows the error
            log information of an incident. Please select the
            incident information that is most likely to have the
            same root cause and give your explanation (just
            give one answer). 
            Input: 
    The incident diagnostic information states that 128 core files have been generated for the program /g/g90/glosli/src/ddcMD/ddcMD1.1.18a/bin/ddcMDbglV. This implies that the program experienced a crash or hardware issue of some kind, but the specifics are unknown. The core files will assist in streamlining the process of debugging and troub

In [411]:
# The CSV's first column holds the row positions of the test samples; use it
# to pick the matching rows out of df_summary.
# presumably these are positional indices into df_summary -- verify against
# how the CSV was written.
test_index_frame = pd.read_csv("./logdata/test_summary_0713.csv", index_col=0)
# .copy() makes the subset independent of df_summary, so adding the "answers"
# column later does not write to (or warn about) a slice of df_summary.
test_df_summary = df_summary.iloc[test_index_frame.index].copy()

In [None]:
# Ask GPT about every test log and score the answers with ROUGE.
# The prompt builder's arguments are the same for every sample, so create it
# once instead of once per iteration.
build_prompt = make_prompt_ms(unilog_bert_model, curr_log_vectors, is_unilog=True, tokenizer=log_tokenizer_w_n)

# `answers` is filled incrementally so that the answers collected so far are
# still available if the loop is interrupted part-way.
answers = []

for sample_input in tqdm(test_df_summary["log"]):
    answers.append(ask_GPT(sample_input, build_prompt))

test_df_summary["answers"] = answers
calculate_rouge(test_df_summary)

 72%|███████▏  | 715/1000 [1:14:45<28:19,  5.96s/it]

In [370]:

# `precisions`, `recalls` and `f_scores` only exist as locals inside
# calculate_rouge, so printing them here fails on a fresh kernel.
# calculate_rouge prints the same "Average precision ..., recall ..., f1 ..."
# line and returns the three averages.
avg_precision, avg_recall, avg_f_score = calculate_rouge(test_df_summary)

Average precision 0.520789428236848, recall 0.6399072331807628, f1 0.541242909419604


In [367]:
# Reload the test-set frame straight from the CSV.
# NOTE(review): this overwrites the `test_df_summary` built above, including
# its freshly generated "answers" column; the next cell scores this frame, so
# the CSV presumably already contains "answers" and "summary" -- confirm.
test_df_summary = pd.read_csv("./logdata/test_summary_0713.csv", index_col=0)


In [378]:
# Print and return the average ROUGE-1 precision / recall / F1 for the
# current test_df_summary.
calculate_rouge(test_df_summary)

Average precision 0.520789428236848, recall 0.6399072331807628, f1 0.541242909419604


(0.520789428236848, 0.6399072331807628, 0.541242909419604)