In [2]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   --------------- ------------------------ 0.5/1.4 MB 1.1 MB/s eta 0:00:01
   ------------------------------- -------- 1.0/1.4 MB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 1.6 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load crash data with correct headers
crashes_df = pd.read_excel(
    r"D:\UWA\Data_Warehousing\Project 1\bitre_fatal_crashes_dec2024.xlsx",
    sheet_name="BITRE_Fatal_Crash",
    header=4
)

# Clean and rename columns (fix newline and extra whitespace)
crashes_df.columns = [str(col).strip().replace("\n", " ").replace("  ", " ") for col in crashes_df.columns]

# Filter useful columns for mining
cols = [
    "Crash Type",
    "Number Fatalities",
    "Bus Involvement",
    "Heavy Rigid Truck Involvement",
    "Articulated Truck Involvement",
    "National Remoteness Areas",
    "National Road Type",
    "Day of week",
    "Time of Day"
]
df = crashes_df[cols].dropna()

# Encode fatal vs non-fatal
df["Fatality"] = df["Number Fatalities"].astype(int).apply(lambda x: "Fatal" if x > 0 else "Non-Fatal")
df = df.drop(columns=["Number Fatalities"])

# Convert rows to transactions
transactions = df.astype(str).values.tolist()

# Transaction encoder
te = TransactionEncoder()
te_data = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_data, columns=te.columns_)

# Apply Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Focus on fatal crashes
fatal_rules = rules[rules['consequents'].astype(str).str.contains("Fatal")]

# Top rules by lift/confidence
top_rules = fatal_rules.sort_values(by=["lift", "confidence"], ascending=False).head(10)

# Save result
top_rules.to_csv("top_fatal_crash_rules.csv", index=False)

# Display result
print("Top Association Rules with 'Fatal' as Consequent:")
print(top_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Top Association Rules with 'Fatal' as Consequent:
                       antecedents                          consequents  \
2080                    (Yes, Day)           (Fatal, Weekday, Multiple)   
4011                (Yes, No, Day)           (Fatal, Weekday, Multiple)   
4016                    (Yes, Day)       (Fatal, Weekday, No, Multiple)   
4059           (Unknown, Yes, Day)      (Fatal, Undetermined, Multiple)   
5409       (Unknown, Yes, No, Day)      (Fatal, Undetermined, Multiple)   
5423           (Unknown, Yes, Day)  (Fatal, No, Undetermined, Multiple)   
4061      (Yes, Day, Undetermined)           (Fatal, Unknown, Multiple)   
5416  (Yes, No, Day, Undetermined)           (Fatal, Unknown, Multiple)   
5425      (Yes, Day, Undetermined)       (Fatal, Unknown, No, Multiple)   
4532       (Weekday, Unknown, Yes)      (Fatal, Undetermined, Multiple)   

       support  confidence      lift  
2080  0.060799    0.658779  2.377036  
4011  0.060487    0.658180  2.374874  
4016  0

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
