In [None]:
### first version


import os
import re

def search_pattern_in_files(folder_path, pattern, start_pattern, end_pattern):
    """
    Recursively search every file under a folder for a regex and capture the section around each hit.

    For every match of ``pattern`` the closest preceding match of ``start_pattern`` and the closest
    following match of ``end_pattern`` are located. A match that lacks either marker is skipped.

    :param folder_path: Root folder; it is traversed with ``os.walk``.
    :param pattern: Regex describing the text of interest.
    :param start_pattern: Regex marking where the enclosing section begins.
    :param end_pattern: Regex marking where the enclosing section ends.
    :return: A list of dicts with the keys ``file_path``, ``start``, ``end`` (character offsets of
             the match), ``matched_section``, ``before_text`` (from the start marker up to the
             match) and ``after_text`` (from the end of the match up to, but excluding, the end
             marker).
    """
    # Compile once instead of re-parsing the regexes for every file and every match.
    target_re = re.compile(pattern)
    start_re = re.compile(start_pattern)
    end_re = re.compile(end_pattern)

    results = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # errors='replace' keeps a stray undecodable byte from aborting the whole scan.
            with open(file_path, 'r', errors='replace') as f:
                content = f.read()

            for match in target_re.finditer(content):
                start, end = match.start(), match.end()

                # Last start marker that lies completely before the match. Using match offsets
                # (not findall + rfind) also works when the marker regex contains groups.
                section_start = None
                for marker in start_re.finditer(content, 0, start):
                    section_start = marker.start()

                # First end marker after the match; its offset is relative to the slice.
                end_marker = end_re.search(content[end:])

                if section_start is None or end_marker is None:
                    continue

                section_end = end + end_marker.start()
                results.append({
                    "file_path": file_path,
                    "start": start,
                    "end": end,
                    "matched_section": content[start:end],
                    "before_text": content[section_start:start],
                    # Stops at the end marker; the old code added an absolute index to `end`
                    # and therefore overshot the marker.
                    "after_text": content[end:section_end],
                })
    return results

def preprocess_log_files(folder_path, pattern, start_pattern, end_pattern, output_folder="preprocessed_logs"):
    """
    Write the text preceding every pattern match to its own file and print a short summary.

    Each result of ``search_pattern_in_files`` produces a file named
    ``cropped_<source basename>_<start>-<end>`` that contains the result's ``before_text``.

    :param folder_path: Folder that is searched recursively for matches.
    :param pattern: Regex describing the text of interest.
    :param start_pattern: Regex marking the beginning of the enclosing section.
    :param end_pattern: Regex marking the end of the enclosing section.
    :param output_folder: Folder that receives the cropped files; created when missing.
    """
    results = search_pattern_in_files(folder_path, pattern, start_pattern, end_pattern)

    # open(..., 'w') does not create parent folders, so make sure the target exists first.
    os.makedirs(output_folder, exist_ok=True)

    # Two source files with the same basename and offsets map to the same output name, so
    # track distinct paths to report the number of files that really exist afterwards.
    created_paths = set()
    for match in results:
        file_path = match["file_path"]
        start = match["start"]
        end = match["end"]

        new_file_path = os.path.join(output_folder, f"cropped_{os.path.basename(file_path)}_{start}-{end}")
        with open(new_file_path, 'w') as f:
            f.write(match["before_text"])
        created_paths.add(new_file_path)

    print(f"Total matches found: {len(results)}")
    print(f"Total files created: {len(created_paths)}")

# Usage: crop every log under "logs" around each `"failed": true` occurrence.
source_folder_path = "logs"
# Text of interest: a JSON "failed" field set to true (any whitespace after the colon).
pattern = r'"failed":\s*true'
# The same regex serves as start and end marker, so a section is delimited by two
# consecutive occurrences of the `"branch": "master", "index":` sequence.
start_pattern = r'"branch":\s*"master",\s*"index":'
end_pattern = r'"branch":\s*"master",\s*"index":'

# Runs at import/cell-execution time and writes the cropped files.
preprocess_log_files(source_folder_path, pattern, start_pattern, end_pattern)

In [None]:
##Simple script to gather info ###

import os
import json

def get_all_keys(data, prefix=''):
    """
    Walk a decoded JSON value and gather the dotted path of every dictionary key in it.

    Lists are transparent: their elements are visited with the prefix of the list itself,
    so list indices never appear in a path. Scalars contribute nothing.

    :param data: The decoded JSON value (dict, list or scalar).
    :param prefix: Dotted path of the enclosing key; empty at the top level.
    :return: A set with the dotted paths of all keys found.
    """
    if isinstance(data, list):
        found = set()
        for element in data:
            found |= get_all_keys(element, prefix)
        return found

    if not isinstance(data, dict):
        return set()

    found = set()
    for name, value in data.items():
        path = name if not prefix else f"{prefix}.{name}"
        found.add(path)
        found |= get_all_keys(value, path)
    return found


def get_keys_from_first_json_file_and_save(folder_path):
    """
    Finds the first JSON file in the given folder, retrieves all keys, and saves them to a file.

    File names are visited in sorted order so that "first" is the same file on every run, and
    the keys are written sorted, one per line, to ``json_keys.txt`` in the current working
    directory.

    :param folder_path: The path to the folder containing JSON files.
    :return: The sorted list of dotted keys, or an empty list when the folder holds no
             ``.json`` file.
    """
    # os.listdir returns names in arbitrary order; sort to make the choice deterministic.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)

        # Set iteration order varies between runs; sorting gives a stable, diff-able file.
        keys_list = sorted(get_all_keys(data))

        # Save the keys to a file
        with open('json_keys.txt', 'w') as keys_file:
            keys_file.writelines(f"{key}\n" for key in keys_list)
        print("Keys saved to json_keys.txt")
        return keys_list  # Stop after processing the first JSON file

    print("No JSON files found in the folder.")
    return []

# Usage example:
# Replace 'source_folder_path' with the actual path to your folder containing JSON logs
source_folder_path = 'logs'
# Runs at cell-execution time; the helper writes json_keys.txt as a side effect.
all_keys = get_keys_from_first_json_file_and_save(source_folder_path)
# Shows whatever the helper returned.
print(all_keys)

In [None]:
## One working version, DO NOT TOUCH

import json
import os
import shutil

def clear_directory(directory_path):
    """
    Empty a directory while keeping the directory itself.

    Regular files and symbolic links are unlinked, sub-directories are removed recursively.
    A failure on one entry is reported on stdout and does not stop the remaining deletions.

    :param directory_path: The path to the directory to clear.
    """
    for entry_name in os.listdir(directory_path):
        target = os.path.join(directory_path, entry_name)
        try:
            # Links are tested together with files so a link to a directory is only unlinked,
            # never followed.
            removable_as_file = os.path.isfile(target) or os.path.islink(target)
            if removable_as_file:
                os.unlink(target)
                continue
            if os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as error:
            print(f'Failed to delete {target}. Reason: {error}')

def extract_error_info_from_file(json_file_path):
    """
    Collect one record per failed (task, host) pair found in a JSON log file.

    The file must decode to a list. Every dict element that has a "plays" key is scanned:
    plays -> "tasks" -> "hosts" (a mapping of node name to host result). A host result whose
    "failed" value is truthy yields a record with the task name, the node and the host's
    "stderr" / "stdout_lines". Decode, I/O and validation problems are printed and whatever
    was collected so far (normally nothing) is returned.

    :param json_file_path: The file path to the JSON data file.
    :return: A list of dicts with the keys 'task_name', 'node', 'stderr' and 'stdout_lines'.
    """
    failures = []

    try:
        with open(json_file_path, 'r') as handle:
            payload = json.load(handle)

        # Only a top-level list is supported.
        if not isinstance(payload, list):
            raise ValueError("JSON data must be a list.")

        for record in payload:
            # Skip elements that are not dicts or carry no plays.
            if not (isinstance(record, dict) and 'plays' in record):
                continue

            for play in record['plays']:
                if 'tasks' not in play:
                    continue

                for task in play['tasks']:
                    if 'hosts' not in task:
                        continue

                    for node, host_info in task['hosts'].items():
                        # Hosts on which the task succeeded are of no interest.
                        if not host_info.get('failed'):
                            continue

                        failures.append({
                            'task_name': task.get('name', 'Unnamed task'),
                            'node': node,
                            'stderr': host_info.get('stderr', ''),
                            'stdout_lines': host_info.get('stdout_lines', [])
                        })
    except json.JSONDecodeError as e:
        print(f"An error occurred while decoding JSON from {json_file_path}: {e}")
    except FileNotFoundError:
        print(f"The file {json_file_path} was not found.")
    except IOError as e:
        print(f"An I/O error occurred while handling {json_file_path}: {e}")
    except ValueError as e:
        print(f"An error occurred while processing {json_file_path}: {e}")

    return failures

def extract_and_save_error_info(directory_path, output_directory):
    """
    Run the error extraction over every ``.json`` file of a directory and store the results.

    The output directory is created when missing and emptied when it already exists. A source
    file ``<name>.json`` that yields at least one error record produces
    ``<name>_cropped.json`` (indented JSON) in the output directory; files without error
    records produce nothing. A failure on one file is printed and the loop continues.

    :param directory_path: The path to the directory containing JSON files.
    :param output_directory: The path to the directory where the error information files will be saved.
    """
    # Start from an empty output directory in either case.
    if os.path.exists(output_directory):
        clear_directory(output_directory)
    else:
        os.makedirs(output_directory)

    json_names = (name for name in os.listdir(directory_path) if name.endswith('.json'))
    for filename in json_names:
        source_path = os.path.join(directory_path, filename)
        try:
            failures = extract_error_info_from_file(source_path)
            if not failures:
                # Nothing failed in this log, so no cropped file is written.
                continue

            stem, _extension = os.path.splitext(filename)
            destination = os.path.join(output_directory, stem + '_cropped.json')
            with open(destination, 'w') as sink:
                json.dump(failures, sink, indent=4)
        except Exception as e:
            print(f"An error occurred while processing {filename}: {e}")



# Example usage: read the raw logs from 'logs' and write the cropped error files next to them.
logs_directory_path = 'logs'
# NOTE: an already existing output directory is emptied by the call below.
output_directory_path = 'preprocessed_logs'

# Runs at cell-execution time.
extract_and_save_error_info(logs_directory_path, output_directory_path)

In [None]:
# Working pattern matching 

# Function to compile the patterns from error_patterns.py
import json
import error_patterns
import re
import os
# Local aliases for the pattern tables of the project's error_patterns module.
# Presumably each maps a main category either to a list of regex strings or to a
# {sub_category: [regex strings]} dict (the two shapes compile_patterns handles) - TODO confirm.
INFRA_PATTERNS = error_patterns.INFRA_PATTERNS
BUILD_PATTERNS = error_patterns.BUILD_PATTERNS

def compile_patterns(patterns_dict):
    """
    Turn a table of regex strings into a table of compiled regexes keyed by category.

    A main category may map either to a dict ``{sub_category: [regex, ...]}`` or directly to
    an iterable of regex strings; in the latter case the sub-category is the empty string.
    Categories without any pattern do not appear in the result.

    :param patterns_dict: Mapping of main category to patterns (see above).
    :return: A dict mapping ``(main_category, sub_category)`` to a list of compiled patterns.
    """
    registry = {}
    for main_category, patterns in patterns_dict.items():
        # Normalise both shapes to (sub_category, patterns) pairs.
        if isinstance(patterns, dict):
            grouped = patterns.items()
        else:
            grouped = [("", patterns)]

        for sub_category, raw_patterns in grouped:
            key = (main_category, sub_category)
            for raw in raw_patterns:
                registry.setdefault(key, []).append(re.compile(raw))
    return registry

# Compile both pattern tables once, at cell-execution time, so the regexes can be reused
# for every log entry checked afterwards.
compiled_infra_patterns = compile_patterns(INFRA_PATTERNS)
compiled_build_patterns = compile_patterns(BUILD_PATTERNS)

# print(compiled_infra_patterns)
# print(compiled_build_patterns)

#### Working Code, Type + subtype, INFRA_PATTERN, file: ef7ce17ae108085e8a26e1f2046e1cf73e8c976_cropped.json

# Function to check a log entry against compiled patterns
def check_log_entry(log_entry, compiled_patterns):
    """
    Report every compiled pattern that occurs somewhere in a log text.

    :param log_entry: The text to search.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to a list of
                              compiled regexes.
    :return: A list of ``(label, sub_category, pattern_source)`` tuples, one per matching
             regex, where label is ``"main - sub"`` or just ``"main"`` when the sub-category
             is empty.
    """
    return [
        (
            f"{main_category} - {sub_category}" if sub_category else main_category,
            sub_category,
            candidate.pattern,
        )
        for (main_category, sub_category), candidates in compiled_patterns.items()
        for candidate in candidates
        if candidate.search(log_entry)
    ]

# Function to process a log file
def process_log_file(file_path, compiled_patterns):
    """
    Classify every entry of a cropped log file and print the outcome.

    The file is read as UTF-8 JSON and iterated entry by entry. The entry's "stdout_lines"
    (when present) are joined with newlines and checked against the compiled patterns; each
    hit is printed on its own line, and an entry without hits prints "No matches found".

    :param file_path: Path of the JSON file to classify.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to compiled regexes.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        log_entries = json.load(handle)

    for entry in log_entries:
        # An entry without stdout is checked as empty text.
        if 'stdout_lines' in entry:
            stdout_text = "\n".join(entry['stdout_lines'])
        else:
            stdout_text = ""

        hits = check_log_entry(stdout_text, compiled_patterns)
        if not hits:
            print("No matches found")
            continue

        for main_category, sub_category, pattern in hits:
            print(f"Main Category: {main_category}, Subcategory: {sub_category}, Pattern: {pattern}")

# USAGE: classify one cropped log against the infra and the build patterns together.
path_example_logfile = 'preprocessed_logs/882dafcc748b4f0695ce486241a05ad2_cropped.json'

# Assuming compiled_infra_patterns and compiled_build_patterns are already defined
# Combine all compiled patterns into a single dictionary before processing the log file.
# If both tables contain the same (main_category, sub_category) key, the build entry
# replaces the infra entry because it is unpacked last.
all_compiled_patterns = {**compiled_infra_patterns, **compiled_build_patterns}
process_log_file(path_example_logfile, all_compiled_patterns)


#######################################################################
#### For INFRA_PATTERN, file: 9ef7ce17ae108085e8a26e1f2046e1cf73e8c976_cropped.json

def process_log_file(file_path, compiled_patterns):
    """
    Classify every entry of a cropped log file and print the outcome.

    The entry's "stdout_lines" (when present) are joined with newlines and checked against
    the compiled patterns; each hit is printed on its own line, and an entry without hits
    prints "No matches found".

    :param file_path: Path of the JSON file to classify.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to compiled regexes.
    """
    # Decode explicitly as UTF-8, like the first definition of this function in this cell,
    # instead of depending on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        log_entries = json.load(file)

    # Iterate over the log entries
    for entry in log_entries:
        # Concatenate the stdout_lines if present into a single string
        stdout_text = "\n".join(entry['stdout_lines']) if 'stdout_lines' in entry else ""
        # Check the concatenated stdout_lines against the compiled regex patterns
        matches = check_log_entry(stdout_text, compiled_patterns)
        if not matches:
            print("No matches found")
            continue
        for main_category, sub_category, pattern in matches:
            print(f"Main Category: {main_category}, Subcategory: {sub_category}, Pattern: {pattern}")


# Function to check a log entry against compiled patterns
def check_log_entry(log_entry, compiled_patterns):
    """
    Find all compiled patterns that occur in a log text.

    :param log_entry: The text to search.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to a list of
                              compiled regexes.
    :return: A list of ``(label, sub_category, pattern_source)`` tuples in table order; label
             is ``"main - sub"``, or only ``"main"`` for an empty sub-category.
    """
    hits = []
    for category_key, regex_list in compiled_patterns.items():
        main_category, sub_category = category_key
        # The label depends only on the key, so build it once per category.
        if sub_category:
            label = f"{main_category} - {sub_category}"
        else:
            label = main_category

        for regex in regex_list:
            if regex.search(log_entry) is None:
                continue
            hits.append((label, sub_category, regex.pattern))
    return hits

# USAGE: classify one cropped log against the infra patterns only.
path_example_logfile = 'preprocessed_logs/9ef7ce17ae108085e8a26e1f2046e1cf73e8c976_cropped.json'

# Runs at cell-execution time and prints one line per hit (or per entry without hits).
process_log_file(path_example_logfile, compiled_infra_patterns)

#######################################################################
#### same for build patterns

def process_log_file(file_path, compiled_patterns):
    """
    Print the pattern categories that match each entry of a cropped log file.

    For every entry the "stdout_lines" (when present) are joined with newlines and checked
    against the compiled patterns. Each hit is printed on its own line; an entry without
    hits prints "No matches found".

    :param file_path: Path of the JSON file to classify.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to compiled regexes.
    """
    # Read as UTF-8 to agree with the first definition of this function in this cell; the
    # locale default would make decoding depend on the machine running the notebook.
    with open(file_path, 'r', encoding='utf-8') as file:
        log_entries = json.load(file)

    for entry in log_entries:
        # An entry without stdout is checked as empty text.
        stdout_text = "\n".join(entry.get('stdout_lines', [])) if 'stdout_lines' in entry else ""
        matches = check_log_entry(stdout_text, compiled_patterns)
        if matches:
            for main_category, sub_category, pattern in matches:
                print(f"Main Category: {main_category}, Subcategory: {sub_category}, Pattern: {pattern}")
        else:
            print("No matches found")


# Function to check a log entry against compiled patterns
def check_log_entry(log_entry, compiled_patterns):
    """
    Match a log text against every compiled pattern and describe the hits.

    :param log_entry: The text to search.
    :param compiled_patterns: Dict mapping ``(main_category, sub_category)`` to a list of
                              compiled regexes.
    :return: A list of ``(label, sub_category, pattern_source)`` tuples, one per regex found
             in the text; label is ``"main - sub"`` or ``"main"`` when there is no
             sub-category.
    """
    def describe(main_category, sub_category, regex):
        # Build the tuple reported for one matching regex.
        label = main_category if not sub_category else f"{main_category} - {sub_category}"
        return (label, sub_category, regex.pattern)

    found = []
    for (main_category, sub_category), regex_list in compiled_patterns.items():
        found.extend(
            describe(main_category, sub_category, regex)
            for regex in regex_list
            if regex.search(log_entry)
        )
    return found

# USAGE: classify the same cropped log against the build patterns only.
path_example_logfile = 'preprocessed_logs/9ef7ce17ae108085e8a26e1f2046e1cf73e8c976_cropped.json'

# Runs at cell-execution time and prints one line per hit (or per entry without hits).
process_log_file(path_example_logfile, compiled_build_patterns)