In [None]:
import csv
import re
from collections import defaultdict, OrderedDict

In [None]:
# Directory that holds the structured log and every CSV derived from it.
base_dir = '/Users/pragatibhingare/projects/logbert/output/openstack/'

# Structured log to be labelled, and the destination of the labelled copy.
input_csv_file = f'{base_dir}openstack_merged.log_structured.csv'
output_csv_file = f'{base_dir}anomaly_label.csv'

# Instance IDs treated as anomalous: main() passes this list to
# LogDataProcessor.append_labels_to_csv.
anomaly_instance_ids = [
    '544fd51c-4edc-4780-baae-ba1d80a0acfc',
    'ae651dff-c7ad-43d6-ac96-bbcd820ccca8',
    'a445709b-6ad0-40ec-8860-bec60b6ca0c2',
    '1643649d-2f42-4303-bfcd-7798baec19f9',
]

In [None]:
class LogDataProcessor:
    """CSV-to-CSV steps for labelling log lines as NORMAL or ANOMALY.

    Every method reads one CSV file, writes another and prints a short
    confirmation message. The class keeps no state between calls.
    """

    # Files are opened with an explicit encoding so results do not depend on
    # the platform's locale default.
    _ENCODING = 'utf-8'

    # Label values that generate_unique_labels_csv keeps.
    _KNOWN_LABELS = ('NORMAL', 'ANOMALY')

    # Captures a run of hex digits/hyphens following "[instance: ", "image "
    # or "instance ". Exactly one of the three groups is non-empty per match.
    # NOTE(review): the character class is loose, so the first letters of an
    # ordinary word (e.g. "instance be...") also match -- confirm whether a
    # stricter UUID pattern is wanted before tightening it.
    _ID_PATTERN = re.compile(
        r'\[instance: ([a-fA-F0-9-]+)\]|image ([a-fA-F0-9-]+)|instance ([a-fA-F0-9-]+)'
    )

    def append_labels_to_csv(self, input_file, output_file, anomaly_instance_ids):
        """Copy ``input_file`` to ``output_file`` with a ``Label`` column.

        A row is labelled ``ANOMALY`` when its ``Content`` value contains any
        of ``anomaly_instance_ids`` as a substring, otherwise ``NORMAL``.
        If the input already has a ``Label`` column it is overwritten rather
        than duplicated.

        Raises:
            ValueError: if ``input_file`` has no header row.
            KeyError: if the input has no ``Content`` column.
        """
        with open(input_file, 'r', newline='', encoding=self._ENCODING) as input_csv, \
                open(output_file, 'w', newline='', encoding=self._ENCODING) as output_csv:
            reader = csv.DictReader(input_csv)
            if reader.fieldnames is None:
                raise ValueError(f"{input_file!r} has no header row")

            fieldnames = list(reader.fieldnames)
            # Appending unconditionally would emit the column twice when the
            # input has already been labelled.
            if 'Label' not in fieldnames:
                fieldnames.append('Label')

            writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
            writer.writeheader()
            for row in reader:
                content = row['Content']
                is_anomaly = any(
                    instance_id in content for instance_id in anomaly_instance_ids
                )
                row['Label'] = 'ANOMALY' if is_anomaly else 'NORMAL'
                writer.writerow(row)
        print("Labels appended successfully.")

    def filter_csv(self, input_csv_path, output_csv_path, selected_columns):
        """Write only ``selected_columns`` of the input, in the given order.

        Raises:
            StopIteration: if the input file is empty (no header row).
            ValueError: if a selected column is not present in the header.
            IndexError: if a data row is shorter than a selected column index.
        """
        with open(input_csv_path, 'r', newline='', encoding=self._ENCODING) as input_csv, \
                open(output_csv_path, 'w', newline='', encoding=self._ENCODING) as output_csv:
            reader = csv.reader(input_csv)
            writer = csv.writer(output_csv)

            header = next(reader)
            selected_indices = [header.index(column) for column in selected_columns]
            writer.writerow(selected_columns)

            for row in reader:
                writer.writerow([row[index] for index in selected_indices])

        print("Filtered CSV file generated successfully.")

    def extract_unique_labels(self, input_csv_path, output_csv_path):
        """Group every row's ``Label`` by the instance ID in its ``Content``.

        The ID is the text between the last ``'[instance: '`` and the next
        ``']'``. When ``Content`` has no such marker, the key is the content
        up to its first ``']'`` (the whole content if there is none).

        The output has columns ``instance_id`` and ``Labels``; ``Labels`` is
        every label seen for that ID, in row order, joined by ``', '``
        (repeats are kept).

        Raises:
            KeyError: if the input lacks a ``Content`` or ``Label`` column.
        """
        labels_by_instance = defaultdict(list)

        with open(input_csv_path, 'r', newline='', encoding=self._ENCODING) as input_csv:
            for row in csv.DictReader(input_csv):
                instance_id = row['Content'].split('[instance: ')[-1].split(']')[0]
                labels_by_instance[instance_id].append(row['Label'])

        with open(output_csv_path, 'w', newline='', encoding=self._ENCODING) as output_csv:
            writer = csv.writer(output_csv)
            writer.writerow(['instance_id', 'Labels'])  # Write the header
            for instance_id, labels in labels_by_instance.items():
                writer.writerow([instance_id, ', '.join(labels)])

        print('Unique instance IDs and labels extracted successfully.')

    def generate_unique_labels_csv(self, input_csv_path, output_csv_path):
        """Write one row per ID found in ``Content`` with its distinct labels.

        IDs are taken from every match of the class ID pattern in the
        ``Content`` column. Rows whose ``Label`` is neither ``NORMAL`` nor
        ``ANOMALY`` are ignored. IDs are written in first-seen order; the
        distinct labels of an ID are sorted and joined by ``','`` so the
        output is identical from run to run.

        Raises:
            KeyError: if the input lacks a ``Content`` or ``Label`` column.
        """
        labels_by_instance = OrderedDict()

        with open(input_csv_path, 'r', newline='', encoding=self._ENCODING) as input_csv:
            for row in csv.DictReader(input_csv):
                content = row['Content']
                label = row['Label']
                if label not in self._KNOWN_LABELS:
                    continue
                for groups in self._ID_PATTERN.findall(content):
                    # Only the alternative that matched has a non-empty group.
                    instance_id = next(filter(None, groups))
                    labels_by_instance.setdefault(instance_id, set()).add(label)

        with open(output_csv_path, 'w', newline='', encoding=self._ENCODING) as output_csv:
            writer = csv.writer(output_csv)
            writer.writerow(['instance_id', 'Labels'])
            for instance_id, labels in labels_by_instance.items():
                # Set iteration order varies between interpreter runs; sort
                # to make the file reproducible.
                writer.writerow([instance_id, ','.join(sorted(labels))])

        print('Unique labels CSV file generated successfully.')

    def filter_unique_labels_csv(self, input_csv_path, output_csv_path):
        """Copy the input CSV, dropping the row at index 3 when it is long enough.

        If the file has more than four rows (header included), the fourth row
        (the third data row) is removed; otherwise the rows are copied
        unchanged.

        NOTE(review): nothing in this code says why that particular row is
        dropped -- presumably it is a known spurious entry in one specific
        dataset; confirm before reusing on other data.
        """
        with open(input_csv_path, 'r', newline='', encoding=self._ENCODING) as input_csv:
            rows = list(csv.reader(input_csv))

        if len(rows) > 4:
            del rows[3]

        with open(output_csv_path, 'w', newline='', encoding=self._ENCODING) as output_csv:
            csv.writer(output_csv).writerows(rows)

        print('Filtered unique labels CSV file generated successfully.')

In [None]:
def main():
    """Run the labelling steps in order over the files under ``base_dir``.

    Sequence: label the structured log, keep the ADDR/Content/Label columns,
    summarise labels per instance ID, then post-filter that summary.
    """
    # Intermediate and final artefacts, all located under base_dir.
    labelled_csv = base_dir + 'anomaly_label.csv'
    filtered_csv = base_dir + 'anomaly_label_filtered.csv'
    unique_csv = base_dir + 'unique_label.csv'
    unique_filtered_csv = base_dir + 'unique_label_filtered.csv'

    processor = LogDataProcessor()

    processor.append_labels_to_csv(input_csv_file, output_csv_file, anomaly_instance_ids)
    processor.filter_csv(labelled_csv, filtered_csv, ['ADDR', 'Content', 'Label'])

    # Both of the next two steps write unique_label.csv, so the file left on
    # disk is the one produced by generate_unique_labels_csv.
    processor.extract_unique_labels(filtered_csv, unique_csv)
    processor.generate_unique_labels_csv(filtered_csv, unique_csv)

    processor.filter_unique_labels_csv(unique_csv, unique_filtered_csv)


if __name__ == "__main__":
    main()