### sequence_to_eventlog

In [None]:
import pandas as pd
import os
from tqdm import tqdm

def sequences_to_eventlogs(sequence_files, output_path="processed_data/sequences"):
    """Convert tab-separated sequence files into per-token event-log tables.

    Each input line is expected to look like ``<class>\\t<sequence>``, where
    the sequence is a whitespace-separated list of tokens. Every token
    becomes one row of the event log with these columns:

    * ``Class``            - the class label (first tab-separated field).
    * ``Index``            - 0-based position of the token in its sequence.
    * ``Data``             - the token itself.
    * ``Remaining``        - how many times the same token occurs later in
      the sequence.
    * ``Percentage_Alpha`` - ``Remaining`` divided by the number of tokens
      still to visit after this one (0 for the last token).
    * ``Percentage_Beta``  - ``Remaining`` divided by the total number of
      occurrences of the token in the sequence.
    * ``I``                - 1-based line number of the sequence in its file.

    One CSV named ``<file stem>_eventlogs.csv`` is written to *output_path*
    per input file (the directory is created if missing). The returned frame
    additionally carries a ``dataset`` column derived from the file name;
    that column is not part of the per-file CSVs.

    Args:
        sequence_files: Iterable of paths to the sequence files.
        output_path: Directory that receives the per-file CSVs.

    Returns:
        A single DataFrame concatenating the event logs of all files. When
        *sequence_files* is empty, an empty DataFrame with the same columns.
    """
    # Function-scope import so the block does not depend on other cells.
    from collections import Counter

    columns = [
        'Class',
        'Index',
        'Data',
        'Remaining',
        'Percentage_Alpha',
        'Percentage_Beta',
        'I',
    ]
    all_dfs = []

    # Process each sequence file in the list
    for sequence_file in sequence_files:

        print(sequence_file)

        rows = []
        with open(sequence_file, 'r') as file:
            for line_number, line in tqdm(enumerate(file, start=1)):
                # Split the line into class and sequence, separated by a tab
                parts = line.strip().split('\t')
                if len(parts) < 2:
                    # Blank line or a label without a sequence: no events.
                    continue
                class_label = parts[0]
                tokens = parts[1].split()

                # Count whole tokens (not substrings): with plain str.count,
                # token "1" would also be counted inside "11" or "12".
                totals = Counter(tokens)
                seen = Counter()
                last_index = len(tokens) - 1

                for index, token in enumerate(tokens):
                    seen[token] += 1
                    # Occurrences of this token strictly after `index`.
                    remaining_count = totals[token] - seen[token]

                    # Share of the not-yet-visited tokens equal to this one.
                    tokens_left = last_index - index
                    percentage_alpha = (
                        remaining_count / tokens_left if tokens_left else 0
                    )

                    # Share of this token's occurrences still to come;
                    # totals[token] is at least 1, so no zero division.
                    percentage_beta = remaining_count / totals[token]

                    rows.append((
                        class_label,
                        index,
                        token,
                        remaining_count,
                        percentage_alpha,
                        percentage_beta,
                        line_number,
                    ))

        df = pd.DataFrame(rows, columns=columns)

        # Save the individual DataFrame as CSV, named after the input file
        file_name = os.path.splitext(os.path.basename(sequence_file))[0]
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, f"{file_name}_eventlogs.csv")
        df.to_csv(output_file, index=False)

        # Added after saving, so only the combined frame has this column.
        # NOTE(review): the extension is stripped a second time here, as in
        # the original code ("a.b.txt" -> "a"); confirm this is intended.
        df["dataset"] = os.path.splitext(file_name)[0]

        all_dfs.append(df)

    # pd.concat raises ValueError on an empty list, so handle "no files".
    if not all_dfs:
        return pd.DataFrame(columns=columns + ["dataset"])

    # Concatenate all individual DataFrames into one total DataFrame
    return pd.concat(all_dfs, ignore_index=True)


In [None]:
# Example usage:
# Fill in the paths of the sequence files to convert. Each line of a file
# must have the form "<class>\t<whitespace-separated sequence>".
sequence_files_text = []

# Get the total DataFrame
eventlogs_df = sequences_to_eventlogs(sequence_files_text)
print("Total Eventlogs DataFrame")
print(eventlogs_df)

# Save the total DataFrame (all event logs converted from the sequences) as
# one CSV. to_csv needs a file path, not a directory; rename the file as
# needed. index=False matches how the per-file CSVs are written.
eventlogs_df.to_csv("processed_data/sequences/all_eventlogs.csv", index=False)

In [None]:
# Bare expression: in a notebook cell this renders the combined DataFrame.
eventlogs_df