In [12]:
import numpy as np
import pandas as pd
   
# Read CSV file
data = pd.read_csv('sncb_data_challenge.csv', sep=';')

#Convert string to list of integers
col_list = ['vehicles_sequence', 'events_sequence','seconds_to_incident_sequence']
for col in col_list:
    print(col)
    data[col] = data[col].apply(lambda x: list(map(int, x.strip('[]').split(','))))

#Convert string to list of floats
data['train_kph_sequence'] = data['train_kph_sequence'].apply(lambda x: list(map(float, x.strip('[]').split(','))))

# Print the type for each column
for col in data.columns:
    print(f"{col}: {type(data[col][0])}")

# Compute the acceleration
data['acceleration_seq'] = data.apply(
    lambda row: [
        (row['train_kph_sequence'][i + 1] - row['train_kph_sequence'][i]) / 
        (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i])
        if (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i]) != 0 and row['vehicles_sequence'][i+1] == row['vehicles_sequence'][i] else np.nan
        for i in range(len(row['train_kph_sequence']) - 1)
    ], axis=1)

for i in range(len(data['events_sequence'])):
    new_vehicles_sequence = []
    new_events_sequence = []
    new_train_kph_sequence = []
    new_seconds_to_incident_sequence = []
    new_acceleration_seq = []
    
    for j in range(len(data['events_sequence'][i])):
        if j == 0 or data['events_sequence'][i][j] != data['events_sequence'][i][j-1]:
            new_vehicles_sequence.append(data['vehicles_sequence'][i][j])
            new_events_sequence.append(data['events_sequence'][i][j])
            new_train_kph_sequence.append(data['train_kph_sequence'][i][j])
            new_seconds_to_incident_sequence.append(data['seconds_to_incident_sequence'][i][j])
            if j < len(data['acceleration_seq'][i]):
                new_acceleration_seq.append(data['acceleration_seq'][i][j])
    
    data.at[i, 'vehicles_sequence'] = new_vehicles_sequence
    data.at[i, 'events_sequence'] = new_events_sequence
    data.at[i, 'train_kph_sequence'] = new_train_kph_sequence
    data.at[i, 'seconds_to_incident_sequence'] = new_seconds_to_incident_sequence
    data.at[i, 'acceleration_seq'] = new_acceleration_seq

for i in range(len(data['events_sequence'])):
    for j in range(len(data['events_sequence'][i]) - 1):
        if data['events_sequence'][i][j] == data['events_sequence'][i][j+1]:
            print("duplicates")
            print(i)
            print(len(data['events_sequence'][i]))
            print(j)
            raise ValueError("Duplicates in events_sequence")
    
# Save the modified DataFrame to a new CSV file
data.to_csv('sncb_prepared.csv', sep=';', index=False)

vehicles_sequence
events_sequence
seconds_to_incident_sequence
Unnamed: 0: <class 'numpy.int64'>
incident_id: <class 'numpy.int64'>
vehicles_sequence: <class 'list'>
events_sequence: <class 'list'>
seconds_to_incident_sequence: <class 'list'>
approx_lat: <class 'numpy.float64'>
approx_lon: <class 'numpy.float64'>
train_kph_sequence: <class 'list'>
dj_ac_state_sequence: <class 'str'>
dj_dc_state_sequence: <class 'str'>
incident_type: <class 'numpy.int64'>


In [None]:
#Find the most frequent sequence of events for each type of incident in the dataset using the FP-Growth algorithm
data = pd.read_csv('sncb_prepared.csv', sep=';')

def fb_growth(data, min_support=0.1):
    # Create a dictionary to store the support of each item
    support = {}
    for index, row in data.iterrows():
        for event in row['events_sequence']:
            if event in support:
                support[event] += 1
            else:
                support[event] = 1

    # Filter the items that have a support greater than the minimum support
    frequent_items = {k: v for k, v in support.items() if v / len(data) >= min_support}

    # Create a dictionary to store the support of each itemset
    support = {}
    for index, row in data.iterrows():
        for i in range(len(row['events_sequence'])):
            for j in range(i + 1, len(row['events_sequence'])):
                if row['events_sequence'][i] in frequent_items and row['events_sequence'][j] in frequent_items:
                    if (row['events_sequence'][i], row['events_sequence'][j]) in support:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] += 1
                    else:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] = 1

    # Filter the itemsets that have a support greater than the minimum support
    frequent_itemsets = {k: v for k, v in support.items() if v / len(data) >= min_support}

    return frequent_itemsets

print(fb_growth(data, min_support=0.1))

{(2744, 4004): 1801, (2744, 2852): 5323, (2744, 4110): 3462, (2744, 2854): 5334, (2744, 4396): 5841, (2744, 4140): 9191, (2744, 4148): 9783, (2744, 2708): 16313, (2744, 4026): 26064, (2744, 4152): 3732, (2744, 4030): 5191, (2744, 4018): 3070, (2744, 4168): 10546, (2744, 4156): 3856, (2744, 4394): 5594, (2744, 152): 678, (2744, 2742): 5532, (2744, 4410): 1979, (2744, 4406): 1990, (2744, 4068): 24205, (2744, 4408): 1974, (2744, 4412): 1974, (2744, 4066): 29852, (2744, 2744): 6553, (2744, 3986): 3369, (2744, 4002): 1649, (2744, 4124): 14859, (2744, 2858): 2070, (2744, 2658): 2263, (2744, 2688): 2245, (2744, 3254): 3282, (2744, 2970): 1681, (2744, 4082): 1934, (2744, 4090): 2817, (2744, 4092): 2488, (2744, 3236): 1721, (2744, 4100): 1949, (2744, 2980): 451, (2744, 4120): 8440, (2744, 4084): 1287, (2744, 4094): 2465, (2744, 1286): 511, (2744, 3492): 225, (2744, 3224): 548, (2744, 4126): 1166, (2744, 2684): 1192, (2744, 4022): 801, (2744, 2654): 251, (2744, 4392): 1167, (2744, 1200): 227, (2

In [16]:
#Find the most frequent sequence of events for each type of incident in the dataset using the apriori algorithm
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')

# Function to process the data for each incident type
def process_incident_type(data, min_support=0.1):
    # Group data by incident type
    incident_types = data['incident_type'].unique()
    results = {}

    for incident in incident_types:
        # Filter rows for the current incident type
        filtered_data = data[data['incident_type'] == incident]

        # Prepare transactions: each row as a sequence of events
        transactions = filtered_data['events_sequence'].tolist()

        # Create a DataFrame where each event becomes a column
        # Represent presence of events in a one-hot encoded format
        event_set = set(event for row in transactions for event in row)  # All unique events
        transaction_df = pd.DataFrame([
            {event: (event in row) for event in event_set} for row in transactions
        ])

        # Apply Apriori to find frequent itemsets
        frequent_itemsets = apriori(transaction_df, min_support=min_support, use_colnames=True)

        # Find the most frequent sequence(s)
        if not frequent_itemsets.empty:
            most_frequent = frequent_itemsets.sort_values(by='support', ascending=False).head(1)
            results[incident] = most_frequent
        else:
            results[incident] = None

    return results

# Run the function
results = process_incident_type(data, min_support=0.1)

# Display the results
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    if frequent is not None:
        print(frequent)
    else:
        print("No frequent sequences found.")


ModuleNotFoundError: No module named 'mlxtend'