### 1. Checklist

###### Checklist for logs: 

|job ID|log| reason acc. to Elasticsearch | reason acc. to script| |
|-|----------|----------|----------|-----|
| baf77c21-bdd5-0aa2-8731-000000000385|9ef7ce17ae108085e8a26e1f2046e1cf73e8c976.json|Infrastructure - Matlab License |Infrastructure - Matlab License |y|
| 86623622-6e74-9368-74a8-000000000028|882dafcc748b4f0695ce486241a05ad2.json|?| Test(s) failed|y|
|16911dd9-d58b-df18-7329-000000000ba3|e850514f59cb3334754f2641fc63f20f46af56e7.json|FuSa Violation found|Main Category: FuSa Violation Found (Check 'bazel_wrapper_log.txt' on the logs folder), Subcategory: , Pattern: Failed actions detected in bazel_wrapper_log.txt file|y|
|No ID provided||?|redirected output - Fusa Small Checks - Check-Bazel-rules-in-FuSa-critical-components, Subcategory: Fusa Small Checks - Check-Bazel-rules-in-FuSa-critical-components|?|
|No ID provided||?| redirected output - default, Subcategory: default|?|
|c65a4cd1-edf7-790d-cd51-000000000042|0752ac127d21a70af1765ae29afc53d978224f25.json|python|Test(s) failed, Subcategory: |?|
|No ID provided||python|Main Category: python, Subcategory: ,|y|
|b6d2a4bc-700f-7546-f838-000000000042|8e89be688f5b331654a6f44975a7c6c23dd3b71c.json|Compiler (gnu)  | Bazel - error executing command, Subcategory: error executing command|X|
||00e0b3d030f7fd0eef0c4b016e3d5e7e2b51a69f.json|Bazel (failed on target analysis)|Not flagged / No match found|X|
|No ID provided|b819bae8dfb1923b8f9dc3c1b4bb2eed8c3fa2a3.json|Compiler (clang)|Bazel - error executing command, Subcategory: error executing command|X|
|No ID provided|cd5c0bbd5821d9d31f41442f78bcae86221e65a0.json|Bazel(missing input file)|Bazel - missing input file, Subcategory: missing input file, Pattern: ERROR:(.*)input file\(s\) do not exist |?|
|No ID provided||Bazel(missing input file)|Bazel - missing input file, Subcategory: missing input file, Pattern: ERROR:(.*)missing input file '.*' |?|


### 2. Code

In [13]:
import json
import re
import os
import textwrap
import extract_build_failures.error_patterns as error_patterns

INFRA_PATTERNS = error_patterns.INFRA_PATTERNS
BUILD_PATTERNS = error_patterns.BUILD_PATTERNS

def restructure_patterns(patterns):
    """Normalise a pattern mapping to ``{pattern_type: {subtype: [regex, ...]}}``.

    Args:
        patterns: Mapping from pattern type to either a flat list/set of regex
            strings, or a dict mapping a subtype to a list/set of regex strings.

    Returns:
        A new dict in which every pattern type maps to a dict of subtype ->
        list of regex strings. Flat collections are filed under the empty
        subtype ``""``.

    Raises:
        ValueError: If a value is neither a list, set nor dict, or if a
            subtype's value is neither a list nor a set.
    """
    normalised = {}
    for pattern_type, subpatterns in patterns.items():
        if isinstance(subpatterns, dict):
            by_subtype = {}
            normalised[pattern_type] = by_subtype
            for subtype, regex_list in subpatterns.items():
                # Reject anything that is not a plain collection of patterns.
                if not isinstance(regex_list, (list, set)):
                    raise ValueError(f"Unexpected type for regex list: {type(regex_list)}")
                by_subtype[subtype] = list(regex_list)
        elif isinstance(subpatterns, (set, list)):
            # A flat collection has no subtype; use "" as the placeholder key.
            normalised[pattern_type] = {"": list(subpatterns)}
        else:
            raise ValueError(f"Unexpected type for subpatterns: {type(subpatterns)}")
    return normalised

def compile_patterns(patterns_dict):
    """Compile every regex string of a two-level pattern mapping.

    Args:
        patterns_dict: Mapping of main category -> {sub category -> iterable
            of regex strings}.

    Returns:
        Dict keyed by ``(main_category, sub_category)`` tuples whose values are
        lists of compiled regular expressions, in the order they were given.
    """
    compiled = {}
    for main_category, subpatterns in patterns_dict.items():
        for sub_category, patterns in subpatterns.items():
            # The key is created even when the subcategory has no patterns.
            bucket = compiled.setdefault((main_category, sub_category), [])
            for pattern in patterns:
                bucket.append(re.compile(pattern))
    return compiled

def check_log_entry(log_entry, compiled_patterns):
    """Collect every regex hit found in a piece of log text.

    Args:
        log_entry: The text to search.
        compiled_patterns: Dict keyed by ``(main_category, sub_category)``
            whose values are lists of compiled regular expressions.

    Returns:
        A list of ``(category, sub_category, pattern)`` tuples, one per match
        occurrence (a pattern matching three times yields three tuples).
        ``category`` is ``"main - sub"`` when the sub category is non-empty and
        just the main category otherwise.
    """
    hits = []
    for (main_category, sub_category), regex_list in compiled_patterns.items():
        if sub_category:
            category = f"{main_category} - {sub_category}"
        else:
            category = main_category
        for regex in regex_list:
            # Only the number of occurrences matters, not the match objects.
            hits.extend(
                (category, sub_category, regex.pattern)
                for _ in regex.finditer(log_entry)
            )
    return hits

def process_single_log_file(file_path, compiled_patterns):
    """Scan one JSON log file and gather pattern matches per task.

    The file is read as UTF-8 JSON and iterated entry by entry; each entry is
    accessed with ``.get`` for the keys ``'id'``, ``'name'`` and
    ``'stdout_lines'``.

    Args:
        file_path: Path of the JSON log file to read.
        compiled_patterns: Compiled patterns as accepted by ``check_log_entry``.

    Returns:
        A ``(summary, task_matches)`` tuple. ``summary`` is
        ``{'tasks_summary': {task_key: entry_count}}`` and ``task_matches``
        maps each task key that had at least one hit to its list of matches.
        Task keys look like ``"<name> (ID: <id>)"``.
    """
    task_counts = {}
    matches_by_task = {}

    with open(file_path, 'r', encoding='utf-8') as handle:
        for entry in json.load(handle):
            # Missing fields fall back to fixed placeholder texts.
            task_id = entry.get('id', 'No ID provided')
            task_name = entry.get('name', 'No task name provided')
            task_key = f"{task_name} (ID: {task_id})"
            task_counts[task_key] = task_counts.get(task_key, 0) + 1

            # Search the entry's stdout lines as one newline-joined text.
            joined_stdout = "\n".join(entry.get('stdout_lines', []))
            found = check_log_entry(joined_stdout, compiled_patterns)
            if found:
                matches_by_task.setdefault(task_key, []).extend(found)

    return {'tasks_summary': task_counts}, matches_by_task

def process_log_files(directory_path, compiled_patterns):
    """Run ``process_single_log_file`` on every ``.json`` file in a directory.

    Files are visited in the order ``os.listdir`` returns them. For each file
    without any match a notice is printed.

    Args:
        directory_path: Directory to scan (not recursive).
        compiled_patterns: Compiled patterns passed through to
            ``process_single_log_file``.

    Returns:
        A ``(final_summary, file_matches)`` tuple. ``final_summary`` is
        ``{'tasks_summary': {task_key: total_count}}`` summed over all files;
        ``file_matches`` maps each file path that had matches to that file's
        per-task matches.
    """
    task_totals = {}
    file_matches = {}

    for filename in os.listdir(directory_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory_path, filename)
        summary, task_matches = process_single_log_file(file_path, compiled_patterns)

        # Fold this file's per-task counts into the running totals.
        for task_key, count in summary['tasks_summary'].items():
            task_totals[task_key] = task_totals.get(task_key, 0) + count

        if not task_matches:
            print(f"No matches found in file: {file_path}")  # Debugging output
            continue
        file_matches[file_path] = task_matches

    return {'tasks_summary': task_totals}, file_matches

def format_summary_to_screen_width(summary, terminal_width=150):
    """Render a summary dict as text wrapped to a given width.

    Args:
        summary: Mapping of section name to either a plain value or a dict of
            sub-entries.
        terminal_width: Wrap width for plain values and length of the dashed
            separator line; sub-entry values are wrapped to this width minus 4.

    Returns:
        A string with one ``key: value`` line per plain entry, or a ``key:``
        header followed by two-space-indented ``sub_key: value`` lines for a
        dict entry. Every top-level entry is followed by a separator line of
        dashes.
    """
    pieces = []
    for key, value in summary.items():
        if not isinstance(value, dict):
            wrapped_value = textwrap.fill(str(value), terminal_width)
            pieces.append(f"{key}: {wrapped_value}\n")
        else:
            pieces.append(f"{key}:\n")
            for sub_key, sub_value in value.items():
                wrapped_sub_value = textwrap.fill(str(sub_value), terminal_width - 4)
                pieces.append(f"  {sub_key}: {wrapped_sub_value}\n")
        pieces.append("-" * terminal_width + "\n")
    return "".join(pieces)

def format_file_matches(file_matches, terminal_width=150):
    """Render per-file, per-task matches as an indented text report.

    Args:
        file_matches: Mapping of file path -> {task key -> list of
            ``(error_cluster, error_type, pattern)`` tuples}.
        terminal_width: Length of the dashed separator printed after each file.

    Returns:
        The report as a single string. Within one task only the first match of
        each error cluster is listed; later matches of the same cluster are
        skipped even if their pattern differs.
    """
    chunks = []
    for file_path, tasks in file_matches.items():
        chunks.append(f"File: {file_path}\n")
        for task_key, matches in tasks.items():
            chunks.append(f"  Task: {task_key}\n")
            seen_clusters = set()  # reset per task, not per file
            for error_cluster, error_type, pattern in matches:
                if error_cluster in seen_clusters:
                    continue
                seen_clusters.add(error_cluster)
                chunks.append(
                    f"    Error Cluster: {error_cluster}, Error Type: {error_type}, Pattern: {pattern}\n"
                )
        chunks.append("-" * terminal_width + "\n")
    return "".join(chunks)

def main(directory_path='preprocessed_logs'):
    """Scan a directory of preprocessed JSON logs and print the matches per file.

    Both module-level pattern sets (``INFRA_PATTERNS`` and ``BUILD_PATTERNS``)
    are normalised with ``restructure_patterns``, compiled with
    ``compile_patterns`` and then applied to every ``.json`` file found in
    ``directory_path``.

    Args:
        directory_path: Directory whose ``.json`` files are scanned. Defaults
            to ``'preprocessed_logs'``.
    """
    # Restructure, then compile, each pattern set.
    compiled_infra_patterns = compile_patterns(restructure_patterns(INFRA_PATTERNS))
    compiled_build_patterns = compile_patterns(restructure_patterns(BUILD_PATTERNS))

    # Merge key by key. A plain {**infra, **build} would let the build list
    # replace the infra list whenever both sets define the same
    # (category, subcategory) key, silently dropping the infra regexes.
    all_compiled_patterns = {}
    for compiled in (compiled_infra_patterns, compiled_build_patterns):
        for category_key, regex_list in compiled.items():
            all_compiled_patterns.setdefault(category_key, []).extend(regex_list)

    # Only the per-file matches are reported; the task-count summary is unused.
    _summary, file_matches = process_log_files(directory_path, all_compiled_patterns)

    # Format and print the file matches
    print(format_file_matches(file_matches))

main()

No matches found in file: preprocessed_logs\00628c96780941e8f1ffd78804d7da1e7bcf32417fbf3b476b1ea877f4ab44e4__job-output_cropped.json
No matches found in file: preprocessed_logs\0062e5a17f0c5832bf20d4f810fd448701afd48426f87de5a448d6bc198abae2__job-output_cropped.json
No matches found in file: preprocessed_logs\00708c33771b342dab6bc70612ebbda2d80d22abae3d95ff96ac10156500f72b__job-output_cropped.json
No matches found in file: preprocessed_logs\00e0b3d030f7fd0eef0c4b016e3d5e7e2b51a69f_cropped.json
File: preprocessed_logs\005fc583d4745e2470010c57b6422bcbc5c5fb0e271a7f11d72248e702e1090a__job-output_cropped.json
  Task: Run docker command (ID: 2aa698b8-b2fd-a900-8da6-000000000038)
    Error Cluster: Test(s) failed, Error Type: , Pattern: \/\/.*\s+FAILED in [0-9.]+s
------------------------------------------------------------------------------------------------------------------------------------------------------
File: preprocessed_logs\00609ed82bfdb509cf935a6101a636b891e25ef1a08268849b23660