In [None]:
#

In [None]:
# Data preparation script for the SNCB Data Challenge

import numpy as np
import pandas as pd
   
# Read CSV file
data = pd.read_csv('sncb_data_challenge.csv', sep=';')

#Convert string to list of integers
col_list = ['vehicles_sequence', 'events_sequence','seconds_to_incident_sequence']
for col in col_list:
    print(col)
    data[col] = data[col].apply(lambda x: list(map(int, x.strip('[]').split(','))))

#Convert string to list of floats
data['train_kph_sequence'] = data['train_kph_sequence'].apply(lambda x: list(map(float, x.strip('[]').split(','))))

# Print the type for each column
for col in data.columns:
    print(f"{col}: {type(data[col][0])}")

# Compute the acceleration
data['acceleration_seq'] = data.apply(
    lambda row: [
        (row['train_kph_sequence'][i + 1] - row['train_kph_sequence'][i]) / 
        (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i])
        if (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i]) != 0 and row['vehicles_sequence'][i+1] == row['vehicles_sequence'][i] else np.nan
        for i in range(len(row['train_kph_sequence']) - 1)
    ], axis=1)

for i in range(len(data['events_sequence'])):
    new_vehicles_sequence = []
    new_events_sequence = []
    new_train_kph_sequence = []
    new_seconds_to_incident_sequence = []
    new_acceleration_seq = []
    
    for j in range(len(data['events_sequence'][i])):
        if j == 0 or data['events_sequence'][i][j] != data['events_sequence'][i][j-1]:
            new_vehicles_sequence.append(data['vehicles_sequence'][i][j])
            new_events_sequence.append(data['events_sequence'][i][j])
            new_train_kph_sequence.append(data['train_kph_sequence'][i][j])
            new_seconds_to_incident_sequence.append(data['seconds_to_incident_sequence'][i][j])
            if j < len(data['acceleration_seq'][i]):
                new_acceleration_seq.append(data['acceleration_seq'][i][j])
    
    data.at[i, 'vehicles_sequence'] = new_vehicles_sequence
    data.at[i, 'events_sequence'] = new_events_sequence
    data.at[i, 'train_kph_sequence'] = new_train_kph_sequence
    data.at[i, 'seconds_to_incident_sequence'] = new_seconds_to_incident_sequence
    data.at[i, 'acceleration_seq'] = new_acceleration_seq

for i in range(len(data['events_sequence'])):
    for j in range(len(data['events_sequence'][i]) - 1):
        if data['events_sequence'][i][j] == data['events_sequence'][i][j+1]:
            print("duplicates")
            print(i)
            print(len(data['events_sequence'][i]))
            print(j)
            raise ValueError("Duplicates in events_sequence")
    
# Save the modified DataFrame to a new CSV file
data.to_csv('sncb_prepared.csv', sep=';', index=False)

In [None]:
#Find the most frequent sequence of events for each type of incident in the dataset using the FP-Growth algorithm
data = pd.read_csv('sncb_prepared.csv', sep=';')

def fb_growth(data, min_support=0.1):
    # Create a dictionary to store the support of each item
    support = {}
    for index, row in data.iterrows():
        for event in row['events_sequence']:
            if event in support:
                support[event] += 1
            else:
                support[event] = 1

    # Filter the items that have a support greater than the minimum support
    frequent_items = {k: v for k, v in support.items() if v / len(data) >= min_support}

    # Create a dictionary to store the support of each itemset
    support = {}
    for index, row in data.iterrows():
        for i in range(len(row['events_sequence'])):
            for j in range(i + 1, len(row['events_sequence'])):
                if row['events_sequence'][i] in frequent_items and row['events_sequence'][j] in frequent_items:
                    if (row['events_sequence'][i], row['events_sequence'][j]) in support:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] += 1
                    else:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] = 1

    # Filter the itemsets that have a support greater than the minimum support
    frequent_itemsets = {k: v for k, v in support.items() if v / len(data) >= min_support}

    return frequent_itemsets

print(fb_growth(data, min_support=0.1))

In [3]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def find_frequent_sequences_fp_growth(data, min_support=0.9):
    """
    Finds the most frequent sequences of events for each incident type using FP-Growth.
    """
    # Get all unique incident types
    incident_types = data['incident_type'].unique()
    results = {}

    # Convert stringified lists to actual lists of integers
    data['events_sequence'] = data['events_sequence'].apply(lambda x: ast.literal_eval(x))
     
    for incident in incident_types:
        # Check if the csv file already exists
        try:
            most_frequent = pd.read_csv(f'results\\results_{incident}.csv', sep=';')
            results[incident] = most_frequent
            continue
        except:
            pass

        # Filter rows for the current incident type
        filtered_data = data[data['incident_type'] == incident]

        # Prepare transactions: each transaction is a sequence of events
        transactions = filtered_data['events_sequence'].tolist()

        # Create a one-hot encoded DataFrame for the events
        unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
        transaction_df = pd.DataFrame([
            {event: (event in sequence) for event in unique_events} for sequence in transactions
        ])
        # Apply FP-Growth algorithm
        frequent_itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)

        # Sort by support and keep top results
        if not frequent_itemsets.empty:
            most_frequent = frequent_itemsets.sort_values(by='support', ascending=False)
            most_frequent['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: list(x))
            results[incident] = most_frequent
        else:
            results[incident] = None
        
        # store the results in a csv file
        most_frequent.to_csv(f'results\\results_{incident}.csv', sep=';', index=False)
    # Run for all the database
    transactions = data['events_sequence'].tolist()
    unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
    transaction_df = pd.DataFrame([
        {event: (event in sequence) for event in unique_events} for sequence in transactions
    ])
    database_frequent_itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)
    database_frequent_itemsets = database_frequent_itemsets.sort_values(by='support', ascending=False)
    database_frequent_itemsets['itemsets'] = database_frequent_itemsets['itemsets'].apply(lambda x: list(x))
    database_frequent_itemsets.to_csv(f'results\\results_database.csv', sep=';', index=False)
    return results

# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')

# Run the function
results = find_frequent_sequences_fp_growth(data)

# Display the results
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    if frequent is not None:
        print(frequent)
    else:
        print("No frequent sequences found.")


Incident Type: 4
     support                  itemsets
0   1.000000                    [4026]
8   1.000000              [4026, 2708]
1   1.000000                    [2708]
10  0.948718              [4026, 4148]
9   0.948718              [2708, 4148]
11  0.948718        [2708, 4026, 4148]
2   0.948718                    [4148]
5   0.935897                    [2742]
4   0.935897                    [4066]
3   0.935897                    [4068]
22  0.935897              [2708, 2742]
12  0.935897              [2708, 4068]
13  0.935897              [4026, 4068]
14  0.935897        [2708, 4026, 4068]
15  0.935897              [4066, 2708]
16  0.935897              [4026, 4066]
18  0.935897        [4026, 2708, 4066]
24  0.935897        [4026, 2708, 2742]
23  0.935897              [4026, 2742]
27  0.923077        [2708, 4026, 4394]
26  0.923077              [4026, 4394]
25  0.923077              [4394, 2708]
17  0.923077              [4066, 4068]
21  0.923077  [2708, 4026, 4068, 4066]
20  0.92

In [None]:
import os
import pandas as pd

itemsets_to_not_add = pd.read_csv('results\\results_database.csv', sep=';')

frequent_itmesets = []
# read all the files in the results folder
for filename in os.listdir('results'):
    if filename == 'results_database.csv':
        continue
    incident_frequent_itemset = pd.read_csv(f'results\\{filename}', sep=';')
    for index, row in incident_frequent_itemset.iterrows():
        if row['itemsets'] not in frequent_itmesets and row['itemsets'] not in itemsets_to_not_add['itemsets']:
            frequent_itmesets.append(row['itemsets'])

frequent_itmesets = pd.DataFrame(frequent_itmesets, columns=['itemsets'])
frequent_itmesets.to_csv('results\\results_frequent.csv', sep=';', index=False)

In [None]:
import pandas as pd
import time
# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')
frequent_itmesets = pd.read_csv('results\\results_frequent.csv', sep=';')
columns = frequent_itmesets
# print(columns)
columns.append('incident_type')
final_data = pd.DataFrame(columns=columns)
# final_data.columns.append('incident_type')
print(final_data.columns)
raise ValueError("stop")
# Apply the one hot encoding
for index, row in data.iterrows():
    start = time.time()
    one_hot_encoding = []
    for itemset in frequent_itmesets['itemsets']:
        if all(event in row['events_sequence'] for event in itemset):
            one_hot_encoding.append(1)
        else:
            one_hot_encoding.append(0)
    one_hot_encoding.append(row['incident_type'])
    final_data = final_data.add(pd.DataFrame([one_hot_encoding], columns=columns), fill_value=0)
    print(time.time() - start)

final_data.to_csv('sncb_final.csv', sep=';', index=False)

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# Iterate over the rows of the DataFrame
data = pd.read_csv('sncb_prepared.csv', sep=';')
print(len(data['events_sequence']))
counter = 0
for index, row in data.iterrows():
    # check if the numbers 2708, 4026, 4068, 4066 appear is in the events_sequence
    if '2708' in row['events_sequence'] and '4026' in row['events_sequence'] and '4068' in row['events_sequence'] and '4066' in row['events_sequence']:
        counter += 1

print(counter)