In [12]:
import numpy as np
import pandas as pd
   
# Read CSV file
data = pd.read_csv('sncb_data_challenge.csv', sep=';')

#Convert string to list of integers
col_list = ['vehicles_sequence', 'events_sequence','seconds_to_incident_sequence']
for col in col_list:
    print(col)
    data[col] = data[col].apply(lambda x: list(map(int, x.strip('[]').split(','))))

#Convert string to list of floats
data['train_kph_sequence'] = data['train_kph_sequence'].apply(lambda x: list(map(float, x.strip('[]').split(','))))

# Print the type for each column
for col in data.columns:
    print(f"{col}: {type(data[col][0])}")

# Compute the acceleration
data['acceleration_seq'] = data.apply(
    lambda row: [
        (row['train_kph_sequence'][i + 1] - row['train_kph_sequence'][i]) / 
        (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i])
        if (row['seconds_to_incident_sequence'][i + 1] - row['seconds_to_incident_sequence'][i]) != 0 and row['vehicles_sequence'][i+1] == row['vehicles_sequence'][i] else np.nan
        for i in range(len(row['train_kph_sequence']) - 1)
    ], axis=1)

for i in range(len(data['events_sequence'])):
    new_vehicles_sequence = []
    new_events_sequence = []
    new_train_kph_sequence = []
    new_seconds_to_incident_sequence = []
    new_acceleration_seq = []
    
    for j in range(len(data['events_sequence'][i])):
        if j == 0 or data['events_sequence'][i][j] != data['events_sequence'][i][j-1]:
            new_vehicles_sequence.append(data['vehicles_sequence'][i][j])
            new_events_sequence.append(data['events_sequence'][i][j])
            new_train_kph_sequence.append(data['train_kph_sequence'][i][j])
            new_seconds_to_incident_sequence.append(data['seconds_to_incident_sequence'][i][j])
            if j < len(data['acceleration_seq'][i]):
                new_acceleration_seq.append(data['acceleration_seq'][i][j])
    
    data.at[i, 'vehicles_sequence'] = new_vehicles_sequence
    data.at[i, 'events_sequence'] = new_events_sequence
    data.at[i, 'train_kph_sequence'] = new_train_kph_sequence
    data.at[i, 'seconds_to_incident_sequence'] = new_seconds_to_incident_sequence
    data.at[i, 'acceleration_seq'] = new_acceleration_seq

for i in range(len(data['events_sequence'])):
    for j in range(len(data['events_sequence'][i]) - 1):
        if data['events_sequence'][i][j] == data['events_sequence'][i][j+1]:
            print("duplicates")
            print(i)
            print(len(data['events_sequence'][i]))
            print(j)
            raise ValueError("Duplicates in events_sequence")
    
# Save the modified DataFrame to a new CSV file
data.to_csv('sncb_prepared.csv', sep=';', index=False)

In [None]:
#Find the most frequent sequence of events for each type of incident in the dataset using the FP-Growth algorithm
data = pd.read_csv('sncb_prepared.csv', sep=';')

def fb_growth(data, min_support=0.1):
    # Create a dictionary to store the support of each item
    support = {}
    for index, row in data.iterrows():
        for event in row['events_sequence']:
            if event in support:
                support[event] += 1
            else:
                support[event] = 1

    # Filter the items that have a support greater than the minimum support
    frequent_items = {k: v for k, v in support.items() if v / len(data) >= min_support}

    # Create a dictionary to store the support of each itemset
    support = {}
    for index, row in data.iterrows():
        for i in range(len(row['events_sequence'])):
            for j in range(i + 1, len(row['events_sequence'])):
                if row['events_sequence'][i] in frequent_items and row['events_sequence'][j] in frequent_items:
                    if (row['events_sequence'][i], row['events_sequence'][j]) in support:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] += 1
                    else:
                        support[(row['events_sequence'][i], row['events_sequence'][j])] = 1

    # Filter the itemsets that have a support greater than the minimum support
    frequent_itemsets = {k: v for k, v in support.items() if v / len(data) >= min_support}

    return frequent_itemsets

print(fb_growth(data, min_support=0.1))

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
import ast  # For safely evaluating string representations of lists


def find_frequent_sequences_fp_growth(data, min_support=0.9):
    """
    Finds the most frequent sequences of events for each incident type using FP-Growth.
    """
    # Get all unique incident types
    incident_types = data['incident_type'].unique()
    results = {}

    # Convert stringified lists to actual lists of integers
    data['events_sequence'] = data['events_sequence'].apply(lambda x: ast.literal_eval(x))
    
    for incident in incident_types:
        # Filter rows for the current incident type
        filtered_data = data[data['incident_type'] == incident]

        # Prepare transactions: each transaction is a sequence of events
        transactions = filtered_data['events_sequence'].tolist()

        # Create a one-hot encoded DataFrame for the events
        unique_events = set(event for sequence in transactions for event in sequence)  # All unique events
        transaction_df = pd.DataFrame([
            {event: (event in sequence) for event in unique_events} for sequence in transactions
        ])
        print(transaction_df)
        # Apply FP-Growth algorithm
        # raise ValueError('stop')
        frequent_itemsets = fpgrowth(transaction_df, min_support=min_support, use_colnames=True)

        # Sort by support and keep top results
        if not frequent_itemsets.empty:
            most_frequent = frequent_itemsets.sort_values(by='support', ascending=False)
            results[incident] = most_frequent
        else:
            results[incident] = None
        break
        

    return results

# Load the dataset
data = pd.read_csv('sncb_prepared.csv', sep=';')

# Run the function
results = find_frequent_sequences_fp_growth(data)

# Display the results
for incident, frequent in results.items():
    print(f"Incident Type: {incident}")
    if frequent is not None:
        print(frequent)
    else:
        print("No frequent sequences found.")


     4100   4102   4104   4106   2058  4110   4112   4114   4120   4122  ...  \
0    True  False  False  False  False  True  False  False   True  False  ...   
1    True  False  False  False  False  True  False   True   True  False  ...   
2   False  False  False  False  False  True  False   True   True   True  ...   
3    True  False  False  False  False  True  False   True  False  False  ...   
4    True  False  False  False  False  True  False  False   True  False  ...   
..    ...    ...    ...    ...    ...   ...    ...    ...    ...    ...  ...   
73   True  False  False  False  False  True  False   True   True  False  ...   
74   True  False  False  False  False  True  False   True   True  False  ...   
75   True  False  False  False  False  True  False  False   True  False  ...   
76   True  False  False  False  False  True  False  False   True  False  ...   
77  False  False  False  False  False  True  False   True   True  False  ...   

     4070   4072   4076   4078   4080  

In [5]:
print(results)
# filtered_freq = results[4]
# filtered_freq = [(sup, item) for sup, item in zip(filtered_freq['support'], filtered_freq['itemsets']) if len(item) > 1]
# print(filtered_freq)

{4:      support                  itemsets
0   1.000000                    (4026)
8   1.000000              (4026, 2708)
1   1.000000                    (2708)
10  0.948718              (4026, 4148)
9   0.948718              (2708, 4148)
11  0.948718        (2708, 4026, 4148)
2   0.948718                    (4148)
5   0.935897                    (2742)
4   0.935897                    (4066)
3   0.935897                    (4068)
22  0.935897              (2708, 2742)
12  0.935897              (2708, 4068)
13  0.935897              (4026, 4068)
14  0.935897        (2708, 4026, 4068)
15  0.935897              (4066, 2708)
16  0.935897              (4026, 4066)
18  0.935897        (4026, 2708, 4066)
24  0.935897        (4026, 2708, 2742)
23  0.935897              (4026, 2742)
27  0.923077        (2708, 4026, 4394)
26  0.923077              (4026, 4394)
25  0.923077              (4394, 2708)
17  0.923077              (4066, 4068)
21  0.923077  (2708, 4026, 4068, 4066)
20  0.923077        (

In [17]:
# Iterate over the rows of the DataFrame

data = pd.read_csv('sncb_prepared.csv', sep=';')
print(len(data['events_sequence']))
counter = 0
for index, row in data.iterrows():
    # check if the numbers 2708, 4026, 4068, 4066 appear is in the events_sequence
    if '2708' in row['events_sequence'] and '4026' in row['events_sequence'] and '4068' in row['events_sequence'] and '4066' in row['events_sequence']:
        counter += 1

print(counter)

1011
919
