In [2]:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Association-rule mining on the fatalities sheet: one-hot encode a handful of
# categorical columns, mine frequent itemsets with Apriori, derive rules, and
# rank the rules whose consequent mentions a "Road User=..." item.

# --- Tunable parameters (kept together so a reader can adjust them) ---
DATA_PATH = "bitre_fatalities_dec2024.xlsx"
SHEET_NAME = "BITRE_Fatality"
HEADER_ROW = 4            # 0-based row of the sheet used for the column labels
MISSING_LABEL = "Unknown"  # single label for the -9 sentinel and empty cells
MIN_SUPPORT = 0.02        # minimum itemset support passed to apriori
MIN_CONFIDENCE = 0.5      # minimum rule confidence passed to association_rules
TARGET_PREFIX = "Road User="
TOP_N = 10                # number of ranked rules to display

# Load the Excel sheet; map the textual "-9" sentinel and missing cells to MISSING_LABEL
df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME, header=HEADER_ROW)
df = df.replace("-9", MISSING_LABEL).fillna(MISSING_LABEL)

# Select relevant categorical columns
selected_columns = [
    "Crash Type", "Gender", "Age Group", "Time of day",
    "Christmas Period", "Easter Period", "National Road Type",
    "Bus Involvement", "Road User"
]

# The replace("-9", ...) above only matches text cells. If the workbook stores
# the sentinel as a number in any of these columns (not verifiable from here),
# it would otherwise surface as an item such as "Gender=-9". The numeric
# replacement is limited to the selected columns so other columns of `df`
# keep their values and dtypes.
df_arm = df[selected_columns].replace(-9, MISSING_LABEL)

# Convert rows into transactions like ["Crash Type=Single", "Gender=Male", ...].
# Built column-by-column (vectorised) instead of a Python-level lambda per row.
labelled_items = pd.DataFrame(
    {col: f"{col}=" + df_arm[col].astype(str) for col in selected_columns},
    index=df_arm.index,
)
transactions = labelled_items.to_numpy().tolist()

# One-hot encode transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Generate frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Generate rules with confidence and lift
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

# Keep rules whose consequent (RHS) contains a "Road User=..." item.
# startswith() matches only items from the Road User column, whereas a
# substring test would also match any value that happened to contain the text.
# astype(bool) keeps the mask usable even when `rules` is empty.
targets_road_user = rules["consequents"].apply(
    lambda consequent: any(item.startswith(TARGET_PREFIX) for item in consequent)
).astype(bool)
road_user_rules = rules[targets_road_user]

# NOTE(review): consequents with extra items next to the Road User item
# (e.g. "Christmas Period=No") are kept, so the top of the ranking can hold
# several near-identical rules. Consider restricting to single-item
# consequents if one rule per pattern is wanted.

# Rank by lift, then confidence (both descending)
top_rules = road_user_rules.sort_values(by=["lift", "confidence"], ascending=False)

# Display top rules
top_rules.head(TOP_N)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
38043,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No, Eas...",0.038383,0.144249,0.022699,0.591388,4.099781,1.0,0.017163,2.094288,0.786264,0.141931,0.522511,0.374375
38037,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No, Chr...",0.03826,0.145075,0.022699,0.59329,4.089541,1.0,0.017149,2.102053,0.785528,0.141309,0.524275,0.374878
26515,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No, Chr...",0.038383,0.145075,0.02277,0.59322,4.089058,1.0,0.017201,2.10169,0.785599,0.1417,0.524193,0.375086
27136,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Easter Period=No, Chris...",0.038383,0.149277,0.023385,0.609253,4.081351,1.0,0.017655,2.177172,0.785118,0.142353,0.540689,0.382954
38036,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No, Eas...",0.037592,0.148064,0.022699,0.603835,4.078201,1.0,0.017133,2.150459,0.784276,0.139297,0.534983,0.378571
27131,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Christmas Period=No)",0.03826,0.150104,0.023385,0.611213,4.071939,1.0,0.017642,2.186022,0.784429,0.141746,0.542548,0.383503
26919,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No, Eas...",0.038383,0.148064,0.023139,0.60284,4.07148,1.0,0.017456,2.14507,0.784501,0.141688,0.533815,0.379558
13580,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Christmas Period=No)",0.038383,0.150104,0.023455,0.611086,4.071089,1.0,0.017694,2.185305,0.784476,0.142127,0.542398,0.383673
38024,"(Christmas Period=No, Easter Period=No, Nation...","(Road User=Pedestrian, Bus Involvement=No)",0.037469,0.148891,0.022699,0.605819,4.068888,1.0,0.017121,2.159184,0.783593,0.138698,0.536862,0.379138
26510,"(Age Group=75_or_older, National Road Type=Und...","(Road User=Pedestrian, Bus Involvement=No)",0.037592,0.148891,0.02277,0.605706,4.068132,1.0,0.017173,2.158567,0.783646,0.139083,0.53673,0.379317
