# Create a summary of the dj columns

In [18]:
############## Create Summary column for the data ################
import pandas as pd
import numpy as np
import ast

# Load the semicolon-separated SNCB table that the following cells enrich with speed summaries
data = pd.read_csv('sncb_alimentation.csv', delimiter=';')


In [19]:
def _parse_kph_sequence(raw):
    """Convert one '[v1, v2, ...]' cell into a list of floats.

    A value that is already a list is returned unchanged, so re-running this
    cell on an already-parsed column does not fail. Blank tokens are skipped,
    so an empty sequence ('[]') yields [] instead of raising ValueError.
    """
    if isinstance(raw, list):
        return raw
    return [float(token) for token in raw.strip('[]').split(',') if token.strip()]


data['train_kph_sequence'] = data['train_kph_sequence'].apply(_parse_kph_sequence)

In [20]:
# Label every sample of train_kph_sequence with a coarse speed band and store the labels in 'speed'
def _speed_band(kph):
    """Return the band label for a single speed value."""
    if kph == 0.0:
        return 'Stopped'
    if kph < 50.0:
        return ' < 50'
    if kph < 100.0:
        return ' < 100'
    return ' >= 100'


data['speed'] = None
for row, kph_values in enumerate(data['train_kph_sequence']):
    # .at stores the whole list in a single cell of the object column
    data.at[row, 'speed'] = [_speed_band(kph) for kph in kph_values]


### Work on the sequences of the summary alone

In [21]:
# Collapse each run of identical consecutive speed bands into one entry ('speed_alone')
data['speed_alone'] = None
for row, bands in enumerate(data['speed']):
    data.at[row, 'speed_alone'] = [
        band for pos, band in enumerate(bands)
        if pos == 0 or band != bands[pos - 1]
    ]

In [8]:
############################# Frequent itemsets (FP-Growth) #############################

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def find_frequent_itemsets_fp_growth(data, min_support=0.8, column='speed_alone', run_tag='4'):
    """
    Mine frequent itemsets with FP-Growth, per incident type and for the whole table.

    Each row's list in ``data[column]`` is one transaction. Transactions are
    one-hot encoded (item present / absent), so the order and the repetition
    of items inside a row are ignored: the output is frequent *itemsets*,
    not ordered sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'incident_type' column and ``column``.
    min_support : float
        Minimum support forwarded to ``fpgrowth``.
    column : str
        Column holding one list of hashable items per row. Rows whose value
        is missing (None/NaN) are left out of the transactions.
    run_tag : str
        Suffix used in the output folder and file names.

    Returns
    -------
    dict
        Maps each incident type to its itemsets DataFrame (sorted by
        decreasing support, itemsets as lists), or to None when no itemset
        reaches ``min_support``.

    Side effects
    ------------
    Creates ``results/results<run_tag>/`` if needed and writes
    ``results<run_tag>_<incident>.csv`` for every incident type that has
    itemsets, plus ``results_database<run_tag>.csv`` for the whole table.
    """
    import os  # local import: builds OS-independent paths instead of hard-coded backslashes

    def _mine(transactions):
        """One-hot encode the transactions and return the itemsets sorted by support."""
        item_sets = [set(sequence) for sequence in transactions]
        # Sorted so the one-hot column order does not depend on set iteration order
        unique_events = sorted(set().union(*item_sets), key=repr)
        if not unique_events:
            # Nothing to encode: hand back an empty result instead of calling fpgrowth
            return pd.DataFrame(columns=['support', 'itemsets'])
        transaction_df = pd.DataFrame([
            {event: (event in items) for event in unique_events} for items in item_sets
        ])
        itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)
        itemsets = itemsets.sort_values(by='support', ascending=False)
        itemsets['itemsets'] = itemsets['itemsets'].apply(list)
        return itemsets

    output_dir = os.path.join('results', f'results{run_tag}')
    os.makedirs(output_dir, exist_ok=True)

    results = {}
    for incident in data['incident_type'].unique():
        # Transactions of the rows that belong to the current incident type
        transactions = data.loc[data['incident_type'] == incident, column].dropna()
        most_frequent = _mine(transactions)
        if most_frequent.empty:
            results[incident] = None
            continue
        results[incident] = most_frequent
        most_frequent.to_csv(
            os.path.join(output_dir, f'results{run_tag}_{incident}.csv'), sep=';', index=False
        )

    # Same mining over every row, regardless of incident type
    database_frequent_itemsets = _mine(data[column].dropna())
    database_frequent_itemsets.to_csv(
        os.path.join(output_dir, f'results_database{run_tag}.csv'), sep=';', index=False
    )
    return results

# Mine the itemsets with the default settings
results = find_frequent_itemsets_fp_growth(data)

# Print one table (or a placeholder message) per incident type
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    print("No frequent sequences found." if frequent is None else frequent)


Incident Type: 4
     support                           itemsets
0   1.000000                          [Stopped]
1   0.910256                            [ < 50]
4   0.910256                   [Stopped,  < 50]
2   0.846154                           [ < 100]
5   0.846154                  [Stopped,  < 100]
10  0.846154           [Stopped,  < 50,  < 100]
7   0.846154                    [ < 50,  < 100]
6   0.807692                 [ >= 100, Stopped]
3   0.807692                          [ >= 100]
8   0.807692                   [ >= 100,  < 50]
9   0.807692                  [ >= 100,  < 100]
11  0.807692          [ >= 100, Stopped,  < 50]
12  0.807692         [ >= 100, Stopped,  < 100]
13  0.807692           [ >= 100,  < 50,  < 100]
14  0.807692  [ >= 100, Stopped,  < 50,  < 100]
Incident Type: 13
     support                           itemsets
0   1.000000                          [Stopped]
1   0.952830                            [ < 50]
4   0.952830                   [Stopped,  < 50]
2   0

### Work on the combination between event and the summary column

In [22]:
# Pair every event code with the speed band found at the same position ('events + speed')

def _parse_event_sequence(raw):
    """Convert one '[e1, e2, ...]' cell into a list of ints.

    A value that is already a list is returned unchanged, so re-running this
    cell on an already-parsed column does not fail. Blank tokens are skipped,
    so an empty sequence ('[]') yields [] instead of raising ValueError.
    """
    if isinstance(raw, list):
        return raw
    return [int(token) for token in raw.strip('[]').split(',') if token.strip()]


data['events + speed'] = None
data['events_sequence'] = data['events_sequence'].apply(_parse_event_sequence)
for row, (events, bands) in enumerate(zip(data['events_sequence'], data['speed'])):
    if len(events) != len(bands):
        # zip() below keeps only the common prefix, so a shorter events list
        # no longer raises IndexError; report the row so it can be inspected
        print(f"Warning: events and speed lists have different lengths for incident {row}.")
    data.at[row, 'events + speed'] = list(zip(events, bands))


In [23]:
# Collapse consecutive identical (event, speed) pairs in the 'events + speed' column
for row, pairs in enumerate(data['events + speed']):
    data.at[row, 'events + speed'] = [
        pair for pos, pair in enumerate(pairs)
        if pos == 0 or pair != pairs[pos - 1]
    ]

### Work on the combination between event, speed and alimentation columns

In [None]:
# Build, for every row, the list of (event, speed band, 'summary' entry) triples.
# Rows whose three lists differ in length are reported and keep None in the new column.
data['events + speed + alimentation'] = None
for row in range(len(data['speed'])):
    events, bands = data['events_sequence'][row], data['speed'][row]
    if not (len(events) == len(bands) and len(events) == len(data['summary'][row])):
        print(f"Error: events and speed lists have different lengths for incident {row}.")
        continue
    data.at[row, 'events + speed + alimentation'] = list(zip(events, bands, data['summary'][row]))


0       [Battery, Battery, Battery, Battery, Battery, ...
1       [DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, D...
2       [DC, DC, DC, DC, DC, Battery, DC, DC, DC, DC, ...
3       [DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, D...
4       [Battery, Battery, Battery, Battery, Battery, ...
                              ...                        
1006    [Battery, Battery, Battery, Battery, Battery, ...
1007    [DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, D...
1008    [DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, DC, D...
1009    [AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, A...
1010    [Battery, Battery, Battery, Battery, Battery, ...
Name: summary, Length: 1011, dtype: object


In [30]:
# Collapse consecutive identical triples in the 'events + speed + alimentation' column

for row, triples in enumerate(data['events + speed + alimentation']):
    if triples is None:
        # The cell that builds this column leaves None on rows whose lists
        # differ in length; len(None) would raise TypeError, so skip them
        continue
    data.at[row, 'events + speed + alimentation'] = [
        triple for pos, triple in enumerate(triples)
        if pos == 0 or triple != triples[pos - 1]
    ]

### Work on the combination between speed and alimentation columns

In [34]:
# Build, for every row, the list of (speed band, 'summary' entry) pairs.
# Rows whose two lists differ in length are reported and keep None in the new column.
data['speed + alimentation'] = None
for row in range(len(data['speed'])):
    bands = data['speed'][row]
    sources = data['summary'][row]
    if len(bands) != len(sources):
        print(f"Error: speed and alimentation lists have different lengths for incident {row}.")
        continue
    data.at[row, 'speed + alimentation'] = list(zip(bands, sources))


In [35]:
# Collapse consecutive identical pairs in the 'speed + alimentation' column

for row, pairs in enumerate(data['speed + alimentation']):
    if pairs is None:
        # The cell that builds this column leaves None on rows whose lists
        # differ in length; len(None) would raise TypeError, so skip them
        continue
    data.at[row, 'speed + alimentation'] = [
        pair for pos, pair in enumerate(pairs)
        if pos == 0 or pair != pairs[pos - 1]
    ]

In [36]:
############################# Frequent itemsets (FP-Growth) #############################

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def find_frequent_itemsets_fp_growth(data, min_support=0.7, column='speed + alimentation', run_tag='6'):
    """
    Mine frequent itemsets with FP-Growth, per incident type and for the whole table.

    Each row's list in ``data[column]`` is one transaction. Transactions are
    one-hot encoded (item present / absent), so the order and the repetition
    of items inside a row are ignored: the output is frequent *itemsets*,
    not ordered sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'incident_type' column and ``column``.
    min_support : float
        Minimum support forwarded to ``fpgrowth``.
    column : str
        Column holding one list of hashable items per row. Rows whose value
        is missing (None/NaN) are left out of the transactions, so their
        absence also shrinks the denominator of the reported support.
    run_tag : str
        Suffix used in the output folder and file names.

    Returns
    -------
    dict
        Maps each incident type to its itemsets DataFrame (sorted by
        decreasing support, itemsets as lists), or to None when no itemset
        reaches ``min_support``.

    Side effects
    ------------
    Creates ``results/results<run_tag>/`` if needed and writes
    ``results<run_tag>_<incident>.csv`` for every incident type that has
    itemsets, plus ``results_database<run_tag>.csv`` for the whole table.
    """
    import os  # local import: builds OS-independent paths instead of hard-coded backslashes

    def _mine(transactions):
        """One-hot encode the transactions and return the itemsets sorted by support."""
        item_sets = [set(sequence) for sequence in transactions]
        # Sorted so the one-hot column order does not depend on set iteration order
        unique_events = sorted(set().union(*item_sets), key=repr)
        if not unique_events:
            # Nothing to encode: hand back an empty result instead of calling fpgrowth
            return pd.DataFrame(columns=['support', 'itemsets'])
        transaction_df = pd.DataFrame([
            {event: (event in items) for event in unique_events} for items in item_sets
        ])
        itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)
        itemsets = itemsets.sort_values(by='support', ascending=False)
        itemsets['itemsets'] = itemsets['itemsets'].apply(list)
        return itemsets

    output_dir = os.path.join('results', f'results{run_tag}')
    os.makedirs(output_dir, exist_ok=True)

    results = {}
    for incident in data['incident_type'].unique():
        # Transactions of the rows that belong to the current incident type
        transactions = data.loc[data['incident_type'] == incident, column].dropna()
        most_frequent = _mine(transactions)
        if most_frequent.empty:
            results[incident] = None
            continue
        results[incident] = most_frequent
        most_frequent.to_csv(
            os.path.join(output_dir, f'results{run_tag}_{incident}.csv'), sep=';', index=False
        )

    # Same mining over every row, regardless of incident type
    database_frequent_itemsets = _mine(data[column].dropna())
    database_frequent_itemsets.to_csv(
        os.path.join(output_dir, f'results_database{run_tag}.csv'), sep=';', index=False
    )
    return results

# Mine the itemsets with the default settings
results = find_frequent_itemsets_fp_growth(data)

# Print one table (or a placeholder message) per incident type
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    print("No frequent sequences found." if frequent is None else frequent)


Incident Type: 4
     support                                           itemsets
0   0.948718                                    [(Stopped, DC)]
1   0.884615                                      [( < 50, DC)]
5   0.871795                       [(Stopped, DC), ( < 50, DC)]
2   0.833333                               [(Stopped, Battery)]
3   0.833333                                     [( < 100, DC)]
9   0.833333                        [( < 100, DC), ( < 50, DC)]
10  0.820513                      [(Stopped, DC), ( < 100, DC)]
11  0.820513         [(Stopped, DC), ( < 100, DC), ( < 50, DC)]
6   0.782051                [(Stopped, Battery), (Stopped, DC)]
12  0.756410                      [( < 100, DC), ( >= 100, DC)]
4   0.756410                                    [( >= 100, DC)]
13  0.756410                       [( >= 100, DC), ( < 50, DC)]
15  0.756410         [( < 100, DC), ( >= 100, DC), ( < 50, DC)]
14  0.743590                     [(Stopped, DC), ( >= 100, DC)]
16  0.743590       [(St

In [32]:
# Write the enriched table to disk (semicolon-separated, without the index column)
data.to_csv('sncb_speed.csv', index=False, sep=';')