# Create a summary of the dj columns

In [None]:
############## Create Summary column for the data ################
import pandas as pd
import numpy as np

# Load the raw challenge data set (semicolon-separated CSV)
data = pd.read_csv(filepath_or_buffer='sncb_data_challenge.csv', sep=';')


def str_to_bool_list(string):
    """Parse a serialized list such as ``"[True, False]"`` into a list of bool.

    Args:
        string: Text of the form ``"[True, False, ...]"``, or a missing
            value (NaN/None).

    Returns:
        One bool per comma-separated token: True only when the token is
        exactly ``True`` (ignoring surrounding whitespace), False otherwise.
        A missing value or an empty list (``"[]"``) yields ``[]``.
    """
    if pd.isna(string):
        return []
    tokens = (token.strip() for token in string.strip().strip('[]').split(','))
    # Compare with ==: a substring test (`token in 'True'`) would also accept
    # '', 'T' or 'rue'. Empty tokens are skipped so that "[]" parses to [].
    return [token == 'True' for token in tokens if token]

# Replace the serialized boolean-list columns with real lists of bool
col_list_bool = ['dj_ac_state_sequence', 'dj_dc_state_sequence']

for col in col_list_bool:
    data[col] = data[col].map(str_to_bool_list)




In [None]:
def _parse_event_ids(raw):
    """Parse a serialized list such as ``"[12, 7, 3]"`` into a list of int.

    A missing value (NaN/None) or an empty list (``"[]"``) yields ``[]``,
    mirroring how ``str_to_bool_list`` treats the state-sequence columns.
    """
    if pd.isna(raw):
        return []
    # Skip empty tokens so "[]" (or a trailing comma) does not reach int('').
    return [int(token) for token in raw.strip().strip('[]').split(',') if token.strip()]


data['events_sequence'] = data['events_sequence'].apply(_parse_event_ids)

In [None]:
# Summary column of the power supply ("alimentation") that will be added to the model
def _power_source_label(ac_on, dc_on):
    """Return exactly one label for a step, given its AC and DC state flags."""
    if ac_on and dc_on:
        return 'AC/DC'
    if ac_on:
        return 'AC'
    if dc_on:
        return 'DC'
    return 'Battery'


data['summary'] = None
for i, ac_states in data[col_list_bool[0]].items():
    dc_states = data.at[i, col_list_bool[1]]
    # One label per step. The branches are mutually exclusive, so a step with
    # both flags set yields only 'AC/DC' (not 'AC/DC' followed by 'Battery').
    # Indexing dc_states (rather than zip) keeps the IndexError when the DC
    # sequence is shorter than the AC one instead of silently truncating.
    data.at[i, 'summary'] = [
        _power_source_label(ac_on, dc_states[j]) for j, ac_on in enumerate(ac_states)
    ]


### Work on the sequences of the summary alone

In [None]:
# Collapse each row's summary so that consecutive repeats of a label appear once
data['summary_alone'] = None
for i, row_summary in data['summary'].items():
    data.at[i, 'summary_alone'] = [
        label
        for j, label in enumerate(row_summary)
        if j == 0 or label != row_summary[j - 1]
    ]
            

In [None]:
############################# Frequent itemsets (FP-Growth) #############################

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def _parse_cached_itemset(value):
    """Turn a cached itemset string (a list repr) back into a list; leave anything else as is."""
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the raw text rather than discarding the cache.
        return value


def _load_cached_itemsets(path):
    """Return the itemsets previously saved at *path*, or None when no readable cache exists."""
    try:
        cached = pd.read_csv(path, sep=';')
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or empty/malformed CSV (pandas
        # raises ValueError subclasses): treat as "not cached yet".
        return None
    if 'itemsets' in cached.columns:
        # to_csv stores each list as its repr; restore lists so cached and
        # freshly computed results have the same shape.
        cached['itemsets'] = cached['itemsets'].apply(_parse_cached_itemset)
    return cached


def _one_hot_encode_sequences(sequences):
    """Return a boolean DataFrame: one row per sequence, one column per distinct item."""
    unique_items = {item for sequence in sequences for item in sequence}
    return pd.DataFrame([
        {item: (item in sequence) for item in unique_items} for sequence in sequences
    ])


def _mine_frequent_itemsets(sequences, min_support):
    """Run FP-Growth on *sequences*; return itemsets (as lists) sorted by descending support."""
    itemsets = fpgrowth(
        _one_hot_encode_sequences(sequences), min_support=min_support, use_colnames=True
    )
    itemsets = itemsets.sort_values(by='support', ascending=False)
    itemsets['itemsets'] = itemsets['itemsets'].apply(list)
    return itemsets


def find_frequent_itemsets_fp_growth(data, min_support=0.3):
    """
    Finds the frequent itemsets of 'summary_alone' labels for each incident type using FP-Growth.

    Each row's 'summary_alone' sequence is treated as one transaction (the set
    of labels it contains; order and repetition are ignored by the encoding).

    Args:
        data: DataFrame with an 'incident_type' column and a 'summary_alone'
            column holding one list of labels per row.
        min_support: Minimum support passed to ``fpgrowth``.

    Returns:
        dict mapping each incident type to a DataFrame with 'support' and
        'itemsets' columns sorted by descending support, or to None when no
        itemset reaches ``min_support``.

    Side effects:
        Reads a per-incident CSV cache if present and writes one per incident
        otherwise; always writes the itemsets mined over the whole data set to
        'results\\results2\\results_database2.csv'.
        NOTE(review): the paths use backslash separators, so they only point
        inside a results/results2 directory on Windows — confirm this is intended.
    """
    results = {}

    for incident in data['incident_type'].unique():
        cache_path = f'results\\results2\\results2_{incident}.csv'

        # Reuse a previous run's results when they are available.
        cached = _load_cached_itemsets(cache_path)
        if cached is not None:
            results[incident] = cached
            continue

        # Transactions: the summary sequences of the rows of this incident type.
        transactions = data.loc[data['incident_type'] == incident, 'summary_alone']
        most_frequent = _mine_frequent_itemsets(transactions, min_support)

        if most_frequent.empty:
            # Nothing to cache: writing here would either fail or store a
            # previous incident's itemsets under this incident's name.
            results[incident] = None
            continue

        results[incident] = most_frequent
        most_frequent.to_csv(cache_path, sep=';', index=False)

    # Run over the whole data set, regardless of incident type.
    database_frequent_itemsets = _mine_frequent_itemsets(data['summary_alone'], min_support)
    database_frequent_itemsets.to_csv('results\\results2\\results_database2.csv', sep=';', index=False)
    return results

# Mine the frequent itemsets for the loaded data
results = find_frequent_itemsets_fp_growth(data)

# Print what was found for each incident type, or a notice when nothing was found
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    print("No frequent sequences found." if frequent is None else frequent)


### Work on the combination of the events and the summary column

In [None]:
# Pair every summary label with the event recorded at the same position

data['events + summary'] = None
for i, row_summary in data['summary'].items():
    row_events = data.at[i, 'events_sequence']
    data.at[i, 'events + summary'] = [
        (row_events[j], label) for j, label in enumerate(row_summary)
    ]


In [None]:
# Drop consecutive repeats of the same (event, label) pair within each row

for i, pairs in data['events + summary'].items():
    data.at[i, 'events + summary'] = [
        pair for j, pair in enumerate(pairs) if j == 0 or pair != pairs[j - 1]
    ]

In [None]:
# Write the table, including the derived columns, to a semicolon-separated CSV
data.to_csv(path_or_buf='sncb_alimentation.csv', sep=';', index=False)