In [1]:
#

In [2]:
############################# Data preparation script for the SNCB Data Challenge #############################

import numpy as np
import pandas as pd

MIN_BEFORE_INCIDENT = 15
MIN_AFTER_INCIDENT = 5
   
# Read CSV file
data = pd.read_csv('sncb_data_challenge.csv', sep=';')

#Convert string to list of integers
col_list_int = ['vehicles_sequence', 'events_sequence','seconds_to_incident_sequence']
for col in col_list_int:
    data[col] = data[col].apply(lambda x: list(map(int, x.strip('[]').split(','))))

col_list_float = ['train_kph_sequence']
for col in col_list_float:
    data[col] = data[col].apply(lambda x: list(map(float, x.strip('[]').split(','))))

col_list_str = ['dj_ac_state_sequence', 'dj_dc_state_sequence']
for col in col_list_str:
    data[col] = data[col].apply(lambda x: list(map(str, x.strip('[]').split(','))))

#Convert string to list of floats
# data['train_kph_sequence'] = data['train_kph_sequence'].apply(lambda x: list(map(float, x.strip('[]').split(','))))

# Compute the acceleration
data['acceleration_seq'] = data.apply(
    lambda row: [
        (row['train_kph_sequence'][i + 1] - row['train_kph_sequence'][i]) / 
        (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i])
        if (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i]) != 0 and row['vehicles_sequence'][i+1] == row['vehicles_sequence'][i] else np.nan
        for i in range(len(row['train_kph_sequence']) - 1)
    ], axis=1)

for i in range(len(data['events_sequence'])):
    new_vehicles_sequence = []
    new_events_sequence = []
    new_seconds_to_incident_sequence = []
    new_train_kph_sequence = []
    new_dj_ac_state_sequence = []
    new_dj_dc_state_sequence = []
    new_acceleration_seq = []
    
    start_index = 0
    for j in range(len(data['events_sequence'][i])):
        is_before_incident = data['seconds_to_incident_sequence'][i][j] >= -MIN_BEFORE_INCIDENT * 60
        is_after_incident = data['seconds_to_incident_sequence'][i][j] <= MIN_AFTER_INCIDENT * 60
        time_condition = is_before_incident and is_after_incident
        if not time_condition:
            start_index += 1
            continue

        
        if j == start_index or data['events_sequence'][i][j] != new_events_sequence[-1]:
            new_vehicles_sequence.append(data['vehicles_sequence'][i][j])
            new_events_sequence.append(data['events_sequence'][i][j])
            new_seconds_to_incident_sequence.append(data['seconds_to_incident_sequence'][i][j])
            new_train_kph_sequence.append(data['train_kph_sequence'][i][j])
            new_dj_ac_state_sequence.append(data['dj_ac_state_sequence'][i][j])
            new_dj_dc_state_sequence.append(data['dj_dc_state_sequence'][i][j])
            if j < len(data['acceleration_seq'][i]):
                new_acceleration_seq.append(data['acceleration_seq'][i][j])
    
    data.at[i, 'vehicles_sequence'] = new_vehicles_sequence
    data.at[i, 'events_sequence'] = new_events_sequence
    data.at[i, 'seconds_to_incident_sequence'] = new_seconds_to_incident_sequence
    data.at[i, 'train_kph_sequence'] = new_train_kph_sequence
    data.at[i, 'dj_ac_state_sequence'] = new_dj_ac_state_sequence
    data.at[i, 'dj_dc_state_sequence'] = new_dj_dc_state_sequence
    data.at[i, 'acceleration_seq'] = new_acceleration_seq

for i in range(len(data['events_sequence'])):
    for j in range(len(data['events_sequence'][i]) - 1):
        if data['events_sequence'][i][j] == data['events_sequence'][i][j+1]:
            print("duplicates")
            print(i)
            print(len(data['events_sequence'][i]))
            print(j)
            raise ValueError("Duplicates in events_sequence")
    
# Save the modified DataFrame to a new CSV file
data.to_csv('sncb_prepared.csv', sep=';', index=False)

In [3]:
############################# Fp-Growth (OLD) #############################
#Find the most frequent sequence of events for each type of incident in the dataset using the FP-Growth algorithm
data = pd.read_csv('sncb_prepared.csv', sep=';')

def fb_growth(data, min_support=0.1):
    # Create a dictionary to store the support of each item
    support = {}
    for index, row in data.iterrows():
        for event in row['events_sequence']:
            if event in support:
                support[event] += 1
            else:
                support[event] = 1

    # Filter the items that have a support greater than the minimum support
    frequent_items = {k: v for k, v in support.items() if v / len(data) >= min_support}

    # Create a dictionary to store the support of each itemset
    support = {}
    for index, row in data.iterrows():
        for i in range(len(row['events_sequence'])):
            for j in range(i + 1, len(row['events_sequence'])):
                if row['events_sequence'][i] in frequent_items and row['events_sequence'][j] in frequent_items:
                    if (row['events_sequence'][i], row['events_sequence'][j]) in support:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] += 1
                    else:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] = 1

    # Filter the itemsets that have a support greater than the minimum support
    frequent_itemsets = {k: v for k, v in support.items() if v / len(data) >= min_support}

    return frequent_itemsets

print(fb_growth(data, min_support=0.1))

KeyboardInterrupt: 

In [4]:
############################# Frequent itemsets (FP-Growth) #############################

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def find_frequent_itemsets_fp_growth(data, min_support=0.9):
    """
    Finds the most frequent sequences of events for each incident type using FP-Growth.
    """
    # Get all unique incident types
    incident_types = data['incident_type'].unique()
    results = {}

    # Convert stringified lists to actual lists of integers
    data['events_sequence'] = data['events_sequence'].apply(lambda x: ast.literal_eval(x))

    for incident in incident_types:
        # Check if the csv file already exists
        try:
            most_frequent = pd.read_csv(f'results\\results_{incident}.csv', sep=';')
            if most_frequent is not None:
                results[incident] = most_frequent
                continue
        except:
            pass

        # Filter rows for the current incident type
        filtered_data = data[data['incident_type'] == incident]

        # Prepare transactions: each transaction is a sequence of events
        transactions = filtered_data['events_sequence'].tolist()

        # Create a one-hot encoded DataFrame for the events
        unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
        transaction_df = pd.DataFrame([
            {event: (event in sequence) for event in unique_events} for sequence in transactions
        ])
        # Apply FP-Growth algorithm
        frequent_itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)

        # Sort by support and keep top results
        if not frequent_itemsets.empty:
            most_frequent = frequent_itemsets.sort_values(by='support', ascending=False)
            most_frequent['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: list(x))
            results[incident] = most_frequent
        else:
            results[incident] = None
        
        # store the results in a csv file
        most_frequent.to_csv(f'results\\results_{incident}.csv', sep=';', index=False)
    # Run for all the database
    transactions = data['events_sequence'].tolist()
    unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
    transaction_df = pd.DataFrame([
        {event: (event in sequence) for event in unique_events} for sequence in transactions
    ])
    database_frequent_itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)
    database_frequent_itemsets = database_frequent_itemsets.sort_values(by='support', ascending=False)
    database_frequent_itemsets['itemsets'] = database_frequent_itemsets['itemsets'].apply(lambda x: list(x))
    database_frequent_itemsets.to_csv(f'results\\results_database.csv', sep=';', index=False)
    return results

# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')

# Run the function
results = find_frequent_itemsets_fp_growth(data)

# Display the results
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    if frequent is not None:
        print(frequent)
    else:
        print("No frequent sequences found.")


UnboundLocalError: cannot access local variable 'most_frequent' where it is not associated with a value

In [6]:
############################# Frequent sequences #############################
import pandas as pd

def find_frequent_sequences_fp_growth(data, threshold=0.9):
    """
    Finds the most frequent sequences of events for each incident type using GSP.
    """
    # Get all unique incident types
    incident_types = data['incident_type'].unique()
    results = {}

    # Convert stringified lists to actual lists of integers
    data['events_sequence'] = data['events_sequence'].apply(lambda x: ast.literal_eval(x))

    for incident in incident_types:
        # Check if the csv file already exists
        try:
            most_frequent = pd.read_csv(f'results\\results_{incident}.csv', sep=';')
            results[incident] = most_frequent
            continue
        except:
            pass

        # Filter rows for the current incident type
        filtered_data = data[data['incident_type'] == incident]

        # Prepare transactions: each transaction is a sequence of events
        transactions = filtered_data['events_sequence'].tolist()

        # Create a one-hot encoded DataFrame for the events
        unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
        transaction_df = pd.DataFrame([
            {event: (event in sequence) for event in unique_events} for sequence in transactions
        ])
        
        count_list = np.zeros(len(unique_events))
        for i in range(len(unique_events)):
            count_list[i] = transaction_df[unique_events[i]].sum()

        for i in range(len(unique_events)):
            if count_list[i] < threshold*len(transactions):
                # transaction_df.drop(columns=[unique_events[i]], inplace=True)
                unique_events.remove(unique_events[i])

        frequent_sequences = unique_events
        new_frq_seq = unique_events
        old_frq_seq = unique_events
        max_time_diff = 50 # TODO: Define a proper value
        while len(new_frq_seq) > 0:
            new_frq_seq = []
            time_diff = max_time_diff
            
        
        



In [7]:
import os
import pandas as pd

itemsets_to_not_add = pd.read_csv('results\\results_database.csv', sep=';')

frequent_itmesets = []
# read all the files in the results folder
for filename in os.listdir('results'):
    if filename == 'results_database.csv':
        continue
    incident_frequent_itemset = pd.read_csv(f'results\\{filename}', sep=';')
    for index, row in incident_frequent_itemset.iterrows():
        if row['itemsets'] not in frequent_itmesets and row['itemsets'] not in itemsets_to_not_add['itemsets']:
            frequent_itmesets.append(row['itemsets'])

frequent_itmesets = pd.DataFrame(frequent_itmesets, columns=['itemsets'])
frequent_itmesets.to_csv('results\\results_frequent.csv', sep=';', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'results\\results_database.csv'

In [None]:
import pandas as pd
import time

# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')
frequent_itmesets = pd.read_csv('results\\results_frequent.csv', sep=';')
columns = frequent_itmesets['itemsets'].tolist()
columns.append('incident_type')
final_data = pd.DataFrame(columns=columns)

print(final_data.columns)

# Apply the one hot encoding
for index, row in data.iterrows():
    
    one_hot_encoding = []
    for itemset in frequent_itmesets['itemsets']:
        itemset = list(itemset.strip('[]').split(','))
        events_sequence = list(row['events_sequence'].strip('[]').split(','))
        if all(event in events_sequence for event in itemset):
            one_hot_encoding.append(1)
        else:
            one_hot_encoding.append(0)
    one_hot_encoding.append(row['incident_type'])
    final_data = pd.concat([final_data, pd.DataFrame([one_hot_encoding], columns=columns)], ignore_index=True)

final_data.to_csv('sncb_final.csv', sep=';', index=False)

Index(['[2708]', '[4066]', '[4068]', '[4394]', '[2956]', '[3658]', '[3636]',
       '[3658, 3636]', '[4168]', '[4140]', '[3658, 2956]', '[2956, 3636]',
       '[2956, 3658, 3636]', '[2742]', '[4148]', '[4026]', '[4026, 2708]',
       '[4066, 2708]', '[3658, 2708]', '[3658, 2708, 3636]', '[2708, 4068]',
       '[2708, 3636]', '[3658, 4066]', '[2956, 2708]', '[4016]',
       '[3658, 2708, 4066]', '[4016, 2708]', '[4066, 3636]',
       '[3658, 3636, 4066]', '[4066, 2956]', '[3658, 4068]', '[4066, 4068]',
       '[4066, 2708, 3636]', '[2708, 4066, 4068]', '[3658, 2708, 3636, 4066]',
       '[3658, 4068, 3636]', '[4068, 3636]', '[2956, 4066, 2708]',
       '[4016, 4026]', '[4124]', '[2956, 4124]', '[4066, 4124]',
       '[4066, 2956, 4124]', '[4068, 3658, 2956]', '[4068, 2956, 3636]',
       '[3658, 2956, 3636]', '[4068, 2956]', '[4068, 3658, 2956, 3636]',
       'incident_type'],
      dtype='object')


In [None]:
for index, row in final_data.iterrows():
    print(sum(row))

144
311
23
1025
68
412
24
54
99
9
299
14
1275
311
327
2505
2
144
25
511
19
4
379
584
311
1251
305
128
128
1236
50
65
13
13
1389
9
703
69
89
13
153
99
13
43
103
1114
130
9
42
1708
688
2
447
255
13
2602
13
54
108
13
122
637
1479
359
546
13
176
2398
2514
63
47
9
20
4
447
208
13
68
13
1191
13
1613
276
13
24
58
64
9
1287
13
384
36
22
153
1240
67
116
397
1613
201
13
321
1623
21
22
4
4
22
17
2398
20
1095
13
14
154
488
275
99
221
342
165
107
13
11
669
366
15
480
1293
25
169
1758
68
167
1177
13
175
374
99
190
255
200
800
311
58
1353
406
13
18
260
2
2
58
13
13
14
222
165
117
1279
14
14
18
189
158
116
13
25
663
223
165
388
326
1229
56
11
280
200
1498
13
14
22
99
800
11
18
386
69
140
397
17
312
283
22
1617
111
391
2
801
173
1430
1153
266
118
161
150
14
280
57
170
16
300
297
99
176
16
300
204
334
141
397
153
2255
239
3
2505
236
280
177
13
12
388
2
200
16
37
13
99
18
50
99
13
310
200
14
24
17
200
165
733
307
337
15
300
762
2
104
255
759
461
41
58
22
63
14
23
47
4
99
241
22
281
105
6
56
154
502
1125


In [None]:
# Iterate over the rows of the DataFrame
data = pd.read_csv('sncb_prepared.csv', sep=';')
print(len(data['events_sequence']))
counter = 0
for index, row in data.iterrows():
    # check if the numbers 2708, 4026, 4068, 4066 appear is in the events_sequence
    if '2708' in row['events_sequence'] and '4026' in row['events_sequence'] and '4068' in row['events_sequence'] and '4066' in row['events_sequence']:
        counter += 1

print(counter)

In [None]:
#change '[1 , 1]' in real list [1, 1]
string = '[1 , 1]'
print(list(map(int, string.strip('[]').split(','))))

[1, 1]
