In [52]:
from sklearn.cluster import KMeans
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModel
from scipy.spatial.distance import cosine

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name) #used for answering
model_vec = AutoModel.from_pretrained(model_name) #used for vectorization
tokenizer = AutoTokenizer.from_pretrained(model_name) #used for tokenization

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
file_path = '../data/test_log1.out'

with open(file_path, 'r') as file:
    file_contents = file.read()
    file_lines = '\n'.join(file_contents.splitlines()[:1000])

In [107]:
file_lines

'Nov 09 13:11:35 localhost kernel: Linux version 5.15.73 (oe-user@oe-host) (x86_64-poky-linux-gcc (GCC) 11.3.0, GNU ld (GNU Binutils) 2.38.20220708) #1 SMP PREEMPT Sun May 21 21:05:48 UTC 2023\nNov 09 13:11:35 localhost kernel: Command line: BOOT_IMAGE=/boot/vmlinuz root=/dev/sda2 video=eDP-1:d ro\nNov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x001: \'x87 floating point registers\'\nNov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x002: \'SSE registers\'\nNov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x004: \'AVX registers\'\nNov 09 13:11:35 localhost kernel: x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256\nNov 09 13:11:35 localhost kernel: x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, using \'compacted\' format.\nNov 09 13:11:35 localhost kernel: signal: max sigframe size: 1776\nNov 09 13:11:35 localhost kernel: BIOS-provided physical RAM map:\nNov 09 13:11:35 localhost kernel: BIOS-e820: [mem 0

In [76]:
def vectorize_text(model, tokenizer, input_string):
    # Encode the input string
    inputs = tokenizer.encode_plus(
        input_string,
        add_special_tokens=True,
        return_tensors="pt"
    )
    # Get the output from the model
    outputs = model(**inputs)
    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state
    # Average the embeddings
    vector = torch.mean(embeddings, dim=1)
    # Convert tensor to numpy array
    vector = vector.detach().numpy()
    return vector

def split_context(context, chunk_size):
    lines = context.splitlines()
    result = []
    for i in range(0, len(lines), chunk_size):
        chunk = lines[i:i + chunk_size]
        merged_text = '\n'.join(chunk)
        result.append(merged_text)
    return result

In [82]:
n_clusters = 5
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)


In [108]:
chunks = split_context(file_lines, 5)
for c in chunks:
    print(c)

Nov 09 13:11:35 localhost kernel: Linux version 5.15.73 (oe-user@oe-host) (x86_64-poky-linux-gcc (GCC) 11.3.0, GNU ld (GNU Binutils) 2.38.20220708) #1 SMP PREEMPT Sun May 21 21:05:48 UTC 2023
Nov 09 13:11:35 localhost kernel: Command line: BOOT_IMAGE=/boot/vmlinuz root=/dev/sda2 video=eDP-1:d ro
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
Nov 09 13:11:35 localhost kernel: x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, using 'compacted' format.
Nov 09 13:11:35 localhost kernel: signal: max sigframe size: 1776
Nov 09 13:11:35 localhost kernel: BIOS-provided physical RAM map:
Nov 09 13:11:35 localhost kernel: BIOS-e820: [mem 0x0000000000000000-

In [109]:
vectors = []
for chunk in chunks:
    vectors.extend(vectorize_text(model_vec, tokenizer, chunk))
    
vectors

[array([-1.1488774 , -0.6995511 , -0.60770726, ...,  1.4520946 ,
         0.09433654, -0.55343467], dtype=float32),
 array([-1.0694892 , -0.76223445, -0.6326316 , ...,  1.1964831 ,
         0.2531742 , -0.5356714 ], dtype=float32),
 array([-1.091119  , -0.80397326, -0.56228447, ...,  1.4139357 ,
         0.03244619, -0.56494284], dtype=float32),
 array([-1.1398526 , -0.75537086, -0.5347593 , ...,  1.4734216 ,
        -0.0201075 , -0.5352153 ], dtype=float32),
 array([-1.0747218 , -0.75101614, -0.46255717, ...,  1.536411  ,
         0.03686492, -0.5707837 ], dtype=float32),
 array([-1.0719237 , -0.79348934, -0.55027884, ...,  1.3632696 ,
         0.09928919, -0.5137384 ], dtype=float32),
 array([-1.0627084 , -0.7691336 , -0.5746762 , ...,  1.4027051 ,
         0.15665646, -0.5792319 ], dtype=float32),
 array([-1.0854797 , -0.7084779 , -0.71775156, ...,  1.3792801 ,
         0.27684945, -0.50059104], dtype=float32),
 array([-1.0440744 , -0.6916629 , -0.69430894, ...,  1.3327777 ,
       

In [110]:
y_kmeans = kmeans.fit_predict(vectors)

  super()._check_params_vs_input(X, default_n_init=10)


In [111]:
from scipy.spatial import distance
my_list=[]
for i in range(n_clusters):
    my_dict={}
    
    for j in range(len(y_kmeans)):
        
        if y_kmeans[j]==i:
            my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],vectors[j])
    min_distance = min(my_dict.values())
    my_list.append(min(my_dict, key=my_dict.get))
 
                            
for i in sorted(my_list):
    print(chunks[i])

Nov 09 13:11:35 localhost kernel: Linux version 5.15.73 (oe-user@oe-host) (x86_64-poky-linux-gcc (GCC) 11.3.0, GNU ld (GNU Binutils) 2.38.20220708) #1 SMP PREEMPT Sun May 21 21:05:48 UTC 2023
Nov 09 13:11:35 localhost kernel: Command line: BOOT_IMAGE=/boot/vmlinuz root=/dev/sda2 video=eDP-1:d ro
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
Nov 09 13:11:35 localhost kernel: audit: initializing netlink subsys (disabled)
Nov 09 13:11:35 localhost kernel: audit: type=2000 audit(1699535493.145:1): state=initialized audit_enabled=0 res=1
Nov 09 13:11:35 localhost kernel: thermal_sys: Registered thermal governor 'step_wise'
Nov 09 13:11:35 localhost kernel: thermal_sys: Registered thermal governor 'user_space'
Nov 09 13:11:35 localhost kernel: cpuidle: u

In [118]:
summary = ""
for i in sorted(my_list):
    summary += chunks[i] + "\n"
print(summary)

Nov 09 13:11:35 localhost kernel: Linux version 5.15.73 (oe-user@oe-host) (x86_64-poky-linux-gcc (GCC) 11.3.0, GNU ld (GNU Binutils) 2.38.20220708) #1 SMP PREEMPT Sun May 21 21:05:48 UTC 2023
Nov 09 13:11:35 localhost kernel: Command line: BOOT_IMAGE=/boot/vmlinuz root=/dev/sda2 video=eDP-1:d ro
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
Nov 09 13:11:35 localhost kernel: x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
Nov 09 13:11:35 localhost kernel: audit: initializing netlink subsys (disabled)
Nov 09 13:11:35 localhost kernel: audit: type=2000 audit(1699535493.145:1): state=initialized audit_enabled=0 res=1
Nov 09 13:11:35 localhost kernel: thermal_sys: Registered thermal governor 'step_wise'
Nov 09 13:11:35 localhost kernel: thermal_sys: Registered thermal governor 'user_space'
Nov 09 13:11:35 localhost kernel: cpuidle: u

In [116]:
def answer_question(model, tokenizer, context, question):
    """
    The function answers questions given context and question.
    
    model: This is the model that you're using to generate answers to the questions. 
    It could be any model that's capable of question answering, such as a transformer model.

    param  tokenizer: This is the tokenizer that corresponds to your model. 
    It's used to convert your text data into a format that the model can understand.

    param context: This is the text that the model will look at to find an answer to the question.

    param question: This is the question that you're asking the model. 
    The model will generate an answer to this question based on the context.

    The function returns an answer to the question based on the context. 
    The answer is generated by finding the tokens with the highest start and end scores, 
    and joining them together. If the end score is higher than the start score, 
    they are swapped to ensure the answer makes sense.
    """
    # Encode the context and question
    encoded = tokenizer.encode_plus(question, context, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

    # Get the start and end scores for all tokens
    result = model(**encoded)
    start_scores = result["start_logits"]
    end_scores = result["end_logits"]

    # Find the tokens with the highest start and end scores
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # If the end score is higher than the start score, swap them
    if answer_end < answer_start:
        answer_start, answer_end = answer_end, answer_start

    # Get the tokens for the answer
    all_tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    answer = ' '.join(all_tokens[answer_start : answer_end+1])

    return answer

def calculate_similarity(question_vector, answer_vector):
    """Calculate the cosine similarity between the question and answer vectors.
    
    param question_vector: This is the vector representation of the question. 
    It's obtained by transforming the question text into numerical data that 
    can be processed by the machine learning model.

    param answer_vector: This is the vector representation of the answer. 
    It's obtained by transforming the answer text into numerical data that 
    can be processed by the machine learning model.

    The function calculates and returns the cosine similarity between the 
    question and answer vectors. Cosine similarity is a measure of similarity 
    between two non-zero vectors of an inner product space that measures the 
    cosine of the angle between them. The closer the cosine similarity to 1, 
    the more similar the question and answer are.
    """

    similarity = 1 - cosine(question_vector[0], answer_vector[0])

    return similarity


def find_best_answer(model, tokenizer, context, question, model_vec, num_answers=3, overlap=50, max_length=512):
    """Find the best answers to the question given a long context
    param model: This is the model that you're using to generate answers to the questions. 
    It could be any model that's capable of question answering, such as a transformer model.

    param tokenizer: This is the tokenizer that corresponds to your model.
    It's used to convert your text data into a format that the model can understand.

    param context: This is the text that the model will look at to find an answer to the question. 
    In this case, it's a long text that's split into chunks.

    param question: This is the question that you're asking the model. 
    The model will generate an answer to this question based on the context.

    param model_vec: This is a model used to vectorize the text, 
    i.e., convert the text into numerical data that can be processed by the machine learning model.

    param num_answers (default=3): This is the number of best answers the function will return.

    param overlap (default=50): This is the number of overlapping words between 
    two consecutive chunks when the context is split into chunks.

    param max_length (default=512): This is the maximum length of each chunk. 
    The context is split into chunks of this length.

    The function returns a list of tuples, where each tuple contains an answer 
    and its similarity score. The list is sorted in ascending order of similarity, 
    so the first element of the list is the answer with the lowest similarity, 
    and the last element is the answer with the highest similarity.
        
    """
    # Vectorize the question
    question_vector = vectorize_text(model_vec, tokenizer, question)
    
    # Initialize the best answers and their similarities to the question
    best_answers = [(None, -1) for _ in range(num_answers)]
    
    # Split the context into chunks
    chunks = split_context(context, 3)
    
    for chunk in chunks:
        answer = answer_question(model, tokenizer, chunk, question)
        if answer is not None:
            answer_vector = vectorize_text(model_vec, tokenizer, answer)
            if answer_vector is not None:
                similarity = calculate_similarity(question_vector, answer_vector)
                # Check if the similarity is higher than the current lowest in best_answers
                if similarity > best_answers[0][1]:
                    # Replace the lowest
                    best_answers[0] = (answer, similarity)
                    # Sort the list so the lowest similarity is first
                    best_answers = sorted(best_answers, key=lambda x: x[1])
    # Return the answers along with their similarities
    return best_answers

In [122]:
question = "Which thermal governor got registered?"

best_answer = find_best_answer(model, tokenizer, 
                               summary, question, 
                               model_vec, 
                               num_answers=3, 
                               overlap=100, 
                               max_length=tokenizer.model_max_length)
print(f"The best answers are: {best_answer}")

RuntimeError: The size of tensor a (592) must match the size of tensor b (512) at non-singleton dimension 1