In [42]:
import os

# Show where the kernel is running so the relative data path below can be checked.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

# Read the complete log file into memory as a single string.
with open('../data/test_log1.out', 'r') as log_file:
    context = log_file.read()
print(len(context))

# Keep only the window of lines 2000..2999 (at most 1000 lines) for the analysis.
contextLines = context.splitlines()[2000:3000]
print(len(contextLines))
print(contextLines[0])

Current Working Directory: p:\Dev\hackaTum\loganalysis\tutorial
3642936
1000
Nov 09 13:11:41 CMX50070-101776 health_service[555]: removing /var/log/coredump/core.CMXmarsServer.1000.b98ef10ab3f8471aa548add9b9e5d223.86893.1699426915000000.lz4


In [35]:
import numpy as np
from scipy.spatial.distance import cosine
import torch

In [37]:
def split_context(context, overlap=50, max_length=512):
    """
    Pack log lines into newline-terminated chunks of bounded size.

    param context: The text to split. Either a list of lines or a single
    string; a string is split into lines with ``str.splitlines()``.

    param overlap (default=50): Accepted for backward compatibility only.
    Chunks are built from whole lines and do not overlap, so this value is
    not used.

    param max_length (default=512): Size limit in characters. Every line is
    stored followed by a newline, and a line is only added while
    ``len(chunk) + len(line) <= max_length - 2``, so a chunk is never longer
    than ``max_length - 1`` characters.

    Lines longer than ``max_length - 2`` characters cannot fit into any chunk
    and are dropped. Lines are never cut in the middle.

    The function returns a list of chunks (strings) in the original line
    order. An empty input yields an empty list. Two diagnostic lines are
    printed: the number of lines that fit and the longest input line.
    """
    # Work on the argument itself instead of a notebook-level global.
    lines = context.splitlines() if isinstance(context, str) else list(context)

    # Two characters are reserved per chunk, as in the original "-2" rule,
    # because each stored line carries a trailing '\n'.
    line_budget = max_length - 2

    longest_line = max(lines, key=len, default="")
    short_lines = [line for line in lines if len(line) <= line_budget]
    print(f"lines with at most {line_budget} chars: ", len(short_lines))
    print("maxLineLength: ", longest_line)

    chunks = []
    current_parts = []
    current_length = 0
    for line in short_lines:
        # Close the running chunk when the next line would exceed the budget.
        # A line always fits into an empty chunk, so every iteration makes
        # progress and no empty chunk is ever emitted.
        if current_parts and current_length + len(line) > line_budget:
            chunks.append("".join(current_parts))
            current_parts = []
            current_length = 0
        current_parts.append(line + '\n')
        current_length += len(line) + 1

    if current_parts:
        chunks.append("".join(current_parts))
    return chunks

'\n    while start < len(context):\n        end = min(start + max_length, len(context))\n        chunks.append(context[start:end])\n        if end == len(context):\n            break\n        start = end - overlap\n    return chunks\n'

In [38]:
def filter_error_chunks(chunks):
    """
    Keep only the chunks that mention a problem-related keyword.

    param chunks: An iterable of strings (text chunks).

    A chunk is kept when it contains at least one of the keywords
    "error", "failure", "warning", "not found", "missing" or "problem".
    The comparison is case-insensitive, so "ERROR" and "Warning" match too.

    The function returns a list with the matching chunks, unchanged and in
    their original order.
    """
    keywords = ("error", "failure", "warning", "not found", "missing", "problem")

    matching_chunks = []
    for chunk in chunks:
        # Compare against a lower-cased copy; the chunk itself is returned as is.
        lowered = chunk.lower()
        if any(keyword in lowered for keyword in keywords):
            matching_chunks.append(chunk)
    return matching_chunks

In [43]:
# Chunk the selected line window (contextLines), not the raw file text:
# the statistics printed below refer to that window.
splitContext = split_context(contextLines)
# Guard the first-chunk lookup so an empty result does not raise IndexError.
print("lenSplit: ", len(splitContext[0]) if splitContext else 0)
print(len(splitContext))

# Keep only the chunks that mention an error-related keyword and show them.
filteredContext = filter_error_chunks(splitContext)
print("filteredContext length: " , len(filteredContext))
for chunk in filteredContext:
    print(chunk)
    print(10*'-')

linesUnder513 1000
maxLineLength:  Nov 09 13:11:48 CMX50070-101776 rs_callysto.xlapi_nb_container_settings[3036]: [info     ] Write .env file for XlapiContainerSettings(ip='*', port=7777, base_url='/notebooks', token='instrument', xlapi_instrument_address='localhost') to /run/user/0/mrt.callysto/.env.prod context=_setup | start pathname=/usr/lib/callysto/venv/lib/python3.10/site-packages/rs_callysto/xlapi_nb_container_settings.py lineno=46
lenSplit:  492
301
filteredContext length:  13
Nov 09 13:11:42 CMX50070-101776 systemd[1859]: mrt.base.service: Executable /usr/local/LoggingService/bin/stopcurrentlogger.sh missing, skipping: Permission denied
Nov 09 13:11:42 CMX50070-101776 systemd[1]: Started MRT Base Software.
Nov 09 13:11:42 CMX50070-101776 systemd[1]: Started Battery Measurement Service.
Nov 09 13:11:42 CMX50070-101776 systemd[1]: Starting MRT Deployment Calculation Service...

----------
Nov 09 13:11:42 CMX50070-101776 Xserver[1884]:         Before reporting problems, check ht

In [54]:
def answer_question(model, tokenizer, context, question):
    """
    The function answers a question from a single piece of context.

    param model: The question-answering model. It is called with the encoded
    inputs as keyword arguments and its result must provide "start_logits"
    and "end_logits".

    param tokenizer: The tokenizer that corresponds to the model. It is used
    to encode the (question, context) pair and to map token ids back to tokens.

    param context: The text in which the answer is searched. The pair is
    truncated / padded to 512 tokens.

    param question: The question to answer.

    The function returns the answer as a string: the tokens between the
    position with the highest start score and the position with the highest
    end score (inclusive), joined with single spaces. If the end position
    lies before the start position, the two positions are swapped so the
    span is never empty. The tokens are returned as produced by the
    tokenizer; word pieces are not merged.
    """
    # Encode the context and question
    encoded = tokenizer.encode_plus(question, context, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

    # Inference only: no autograd graph is needed, which saves memory and time
    with torch.no_grad():
        result = model(**encoded)
    start_scores = result["start_logits"]
    end_scores = result["end_logits"]

    # Positions of the highest start and end scores as plain ints.
    # argmax runs over the flattened scores, which matches the token position
    # for the single encoded example used here.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # If the end position comes before the start position, swap them
    if answer_end < answer_start:
        answer_start, answer_end = answer_end, answer_start

    # Get the tokens for the answer
    all_tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    answer = ' '.join(all_tokens[answer_start : answer_end + 1])

    return answer


def vectorize_text(model, tokenizer, input_string):
    """
    Vectorize a given input string.

    param model: The encoder model. It is called with the encoded inputs as
    keyword arguments and its result must expose ``last_hidden_state``.

    param tokenizer: The tokenizer that corresponds to the model.

    param input_string: The text to vectorize. Inputs longer than 512 tokens
    are truncated to 512 tokens.

    The function returns a numpy array holding the mean of
    ``last_hidden_state`` over dimension 1 (presumably the token axis, giving
    one vector per input -- confirm against the model used).
    """
    # Encode the input string. truncation=True is required for max_length to
    # take effect; without it over-long inputs are passed on unshortened.
    inputs = tokenizer.encode_plus(
        input_string,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state

    # Average the embeddings
    vector = torch.mean(embeddings, dim=1)

    # Convert tensor to numpy array (moved to CPU first so this also works
    # when the model lives on another device)
    vector = vector.detach().cpu().numpy()

    return vector


def calculate_similarity(question_vector, answer_vector):
    """Return the cosine similarity of a question vector and an answer vector.

    param question_vector: Vector representation of the question. Only its
    first element (``question_vector[0]``) is used.

    param answer_vector: Vector representation of the answer. Only its
    first element (``answer_vector[0]``) is used.

    The function returns ``1 - cosine distance`` of the two selected vectors.
    A value close to 1 means the vectors point in nearly the same direction,
    0 means they are orthogonal and -1 means they point in opposite directions.
    """
    question_row = question_vector[0]
    answer_row = answer_vector[0]

    # scipy's `cosine` is a distance; turn it back into a similarity.
    cosine_distance = cosine(question_row, answer_row)
    return 1 - cosine_distance


def find_best_answer(model, tokenizer, context, question, model_vec, num_answers=3, overlap=50, max_length=512):
    """Find the best answers to the question given a long context

    param model: The question-answering model used to extract an answer
    from each chunk.

    param tokenizer: The tokenizer used both for question answering and for
    vectorizing the question and the answers.

    param context: The long text to search. It is passed to split_context,
    which turns it into chunks.

    param question: The question to answer.

    param model_vec: The model used to vectorize the question and each
    candidate answer.

    param num_answers (default=3): The number of answers to return.

    param overlap (default=50): Forwarded unchanged to split_context.

    param max_length (default=512): Forwarded unchanged to split_context as
    the chunk size limit.

    Only chunks accepted by filter_error_chunks are sent to the model; one
    candidate answer is produced per such chunk and scored by its cosine
    similarity to the question.

    The function returns a list of ``num_answers`` tuples
    ``(answer, similarity)`` sorted in ascending order of similarity, so the
    last element is the answer with the highest similarity. If fewer
    candidates than ``num_answers`` exist, the remaining slots keep the
    placeholder ``(None, -1)``.
    """
    # Vectorize the question
    question_vector = vectorize_text(model_vec, tokenizer, question)

    # Initialize the best answers and their similarities to the question
    best_answers = [(None, -1) for _ in range(num_answers)]

    # Split the context into chunks
    chunks = split_context(context, overlap, max_length)

    filteredChunks = filter_error_chunks(chunks)
    # default=0 keeps the diagnostics from raising when a list is empty
    # (for example when no chunk contains an error keyword).
    print("chunks info")
    print("lenght: ", len(chunks))
    print("max chunk length: " , max(map(len, chunks), default=0))
    print(100*'-')
    print("filtered chunks info")
    print("lenght: ", len(filteredChunks))
    print("max chunk length: " , max(map(len, filteredChunks), default=0))

    # Generate an answer for each chunk and update the best answers if necessary
    for chunk in filteredChunks:
        print("current chunk: ")
        print(chunk)
        print(100*'-')
        answer = answer_question(model, tokenizer, chunk, question)
        if answer is None:
            continue
        answer_vector = vectorize_text(model_vec, tokenizer, answer)
        if answer_vector is None:
            continue
        similarity = calculate_similarity(question_vector, answer_vector)
        # best_answers is kept sorted ascending, so index 0 is the weakest
        # entry. The emptiness check covers num_answers=0.
        if best_answers and similarity > best_answers[0][1]:
            # Replace the lowest
            best_answers[0] = (answer, similarity)
            # Sort the list so the lowest similarity is first
            best_answers = sorted(best_answers, key=lambda pair: pair[1])
    # Return the answers along with their similarities
    return best_answers

In [45]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModel

# One checkpoint is loaded three ways: with the question-answering head, as
# the bare encoder, and as the matching tokenizer.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)  # used for answering questions
model_vec = AutoModel.from_pretrained(model_name)  # used for vectorization; the 'qa_outputs' weights of the checkpoint are not used here (see the warning in the cell output)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # used for tokenization with both models

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [55]:
# Alternative phrasing kept for experiments:
#question = "What error message is displayed?"
question = "What is causing the error?"

# Search the selected log lines for answers. The returned list is sorted by
# ascending similarity, so its last entry is the best match.
best_answer = find_best_answer(
    model,
    tokenizer,
    contextLines,
    question,
    model_vec,
    num_answers=3,
    overlap=50,
    max_length=tokenizer.model_max_length,
)
print(f"The best answer is: {best_answer}")

linesUnder513 1000
maxLineLength:  Nov 09 13:11:48 CMX50070-101776 rs_callysto.xlapi_nb_container_settings[3036]: [info     ] Write .env file for XlapiContainerSettings(ip='*', port=7777, base_url='/notebooks', token='instrument', xlapi_instrument_address='localhost') to /run/user/0/mrt.callysto/.env.prod context=_setup | start pathname=/usr/lib/callysto/venv/lib/python3.10/site-packages/rs_callysto/xlapi_nb_container_settings.py lineno=46
chunks info
lenght:  301
max chunk length:  511
----------------------------------------------------------------------------------------------------
filtered chunks info
lenght:  13
max chunk length:  508
current chunk: 
Nov 09 13:11:42 CMX50070-101776 systemd[1859]: mrt.base.service: Executable /usr/local/LoggingService/bin/stopcurrentlogger.sh missing, skipping: Permission denied
Nov 09 13:11:42 CMX50070-101776 systemd[1]: Started MRT Base Software.
Nov 09 13:11:42 CMX50070-101776 systemd[1]: Started Battery Measurement Service.
Nov 09 13:11:42 CMX