In [23]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


In [24]:

# Load the cleaned fatal-crash records from the workbook next to this notebook
df = pd.read_excel('Coordinates_Imputed_Final.xlsx')

# Attributes that take part in the association rule mining
cols_to_use = [
    'State', 'Crash Type', 'Day of week', 'Time of Day',
    'Gender', 'Age Group', 'Road User', 'National Road Type',
    'National Remoteness Areas', 'Speed Limit'
]

# Speed-limit bands. pd.cut uses right-closed intervals by default, so the bands are
# (0, 50] -> Low, (50, 80] -> Medium, (80, 110] -> High, (110, inf) -> Very High.
# Values that are missing or fall outside the edges (i.e. <= 0) are mapped to NaN.
speed_bin_edges = [0, 50, 80, 110, float('inf')]
speed_bin_labels = ['Low', 'Medium', 'High', 'Very High']

# Build the working subset in one chain:
#   1. take an independent copy of the selected columns,
#   2. append the categorical 'Speed_Category' derived from 'Speed Limit',
#   3. drop the numeric 'Speed Limit' so the same information is not encoded twice.
df_subset = (
    df.loc[:, cols_to_use]
    .assign(
        Speed_Category=lambda frame: pd.cut(
            frame['Speed Limit'],
            bins=speed_bin_edges,
            labels=speed_bin_labels,
        )
    )
    .drop(columns=['Speed Limit'])
)

# Remember which cells are missing BEFORE the string conversion below.
# astype(str) turns NaN into the literal text 'nan', which would otherwise be
# encoded as a genuine item (for example 'Speed_Category_nan' when a speed limit
# falls outside the bins) and could surface in frequent itemsets and rules.
missing_cells = df_subset.isna().to_numpy()

# Convert all data to string type for TransactionEncoder
df_subset = df_subset.astype(str)

# Build one transaction per row: a list of "<column>_<value>" items.
# Iterating over the underlying array once avoids constructing a Series per row
# (the previous df_subset.iloc[i] loop), and missing cells contribute no item.
# The number of transactions is unchanged, so supports of real items are unaffected.
item_columns = list(df_subset.columns)
transactions = [
    [
        f"{col}_{val}"
        for col, val, is_missing in zip(item_columns, row_values, row_missing)
        if not is_missing
    ]
    for row_values, row_missing in zip(df_subset.to_numpy(), missing_cells)
]

# Use TransactionEncoder to convert to one-hot encoding
# (one boolean column per distinct item, one row per transaction)
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
one_hot_df = pd.DataFrame(te_array, columns=te.columns_)


In [25]:

# Mine frequent itemsets with Apriori, keeping only itemsets that occur in at
# least 10% of the transactions (min_support=0.1). use_colnames=True makes the
# itemsets carry the item names instead of column indices.
frequent_itemsets = apriori(one_hot_df, min_support=0.1, use_colnames=True)

# Record how many items each itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Keep only the pairs (exactly two items) to limit the number of candidate rules
is_pair = frequent_itemsets['length'] == 2
filtered_itemsets = frequent_itemsets.loc[is_pair]


In [26]:

# Generate association rules manually with correct metrics
#
# For every frequent itemset, each non-empty proper subset is tried as the
# antecedent and the remaining items form the consequent. Only rules whose
# consequent contains a 'Road User' item are kept. For each kept rule:
#   support    = P(antecedent and consequent)   (taken from Apriori)
#   confidence = support / P(antecedent)
#   lift       = confidence / P(consequent)

# Imported once here instead of inside the nested loops, where the statement
# was re-executed for every antecedent size of every itemset.
from itertools import combinations

# Prefix carried by every item that was derived from the 'Road User' column.
ROAD_USER_PREFIX = 'Road User_'

# Supports already computed, keyed by frozenset of items, so the same
# antecedent/consequent is not rescanned for every rule it appears in.
_support_cache = {}


def _support_of(items):
    """Return the fraction of transactions in one_hot_df containing every item in `items`."""
    key = frozenset(items)
    if key not in _support_cache:
        _support_cache[key] = one_hot_df[list(key)].astype(bool).all(axis=1).mean()
    return _support_cache[key]


rules_list = []
for itemset, itemset_support in zip(filtered_itemsets['itemsets'], filtered_itemsets['support']):
    # range(1, len(itemset)) is empty for single-item itemsets, so they yield
    # no rules without needing a separate length check.
    for antecedent_size in range(1, len(itemset)):
        for antecedent_items in combinations(itemset, antecedent_size):
            antecedent = frozenset(antecedent_items)
            consequent = itemset - antecedent

            # Only interested in rules with "Road User" in consequent.
            # A prefix match is used so a value that merely contains the text
            # elsewhere in its name cannot qualify.
            if not any(str(item).startswith(ROAD_USER_PREFIX) for item in consequent):
                continue

            antecedent_support = _support_of(antecedent)
            consequent_support = _support_of(consequent)

            # Confidence: P(consequent | antecedent) = P(antecedent, consequent) / P(antecedent)
            confidence = itemset_support / antecedent_support if antecedent_support > 0 else 0

            # Lift: confidence / P(consequent)
            lift = confidence / consequent_support if consequent_support > 0 else 0

            # Round only the stored copies. itemset_support itself must stay
            # exact: overwriting it with the rounded value (as before) would make
            # every later rule from the same itemset compute its confidence and
            # lift from an already-rounded support.
            rules_list.append({
                'antecedents': antecedent,
                'consequents': consequent,
                'support': round(itemset_support, 4),
                'confidence': round(confidence, 4),
                'lift': round(lift, 4)
            })


In [30]:

# Convert the mined rules to a readable table, show the strongest ones and export all of them.

# "<column>_" prefixes that were prepended to every value when the transactions
# were built. Longest first, so a column whose name is a prefix of another
# column's name cannot match in its place.
_item_prefixes = sorted((f"{col}_" for col in df_subset.columns), key=len, reverse=True)


def _strip_column_prefix(item):
    """Return the value part of a "<column>_<value>" item.

    Splitting on the first underscore is not enough because a column name may
    itself contain one: 'Speed_Category_Low' would become 'Category_Low'
    instead of 'Low'. The known column prefixes are therefore matched first.
    """
    for prefix in _item_prefixes:
        if item.startswith(prefix):
            return item[len(prefix):]
    # Item does not belong to a known column: fall back to the first-underscore split
    return item.split('_', 1)[1] if '_' in item else item


def format_rule(row):
    """Turn one rule record into display form.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'antecedents' and 'consequents' (iterables of item
        strings) plus 'support', 'confidence' and 'lift'.

    Returns
    -------
    dict
        Same keys; the item sets are replaced by comma-separated strings of
        the bare values (column prefix removed). Values are sorted so the text
        is identical from run to run for multi-item sets.
    """
    antecedents_clean = sorted(_strip_column_prefix(item) for item in row['antecedents'])
    consequents_clean = sorted(_strip_column_prefix(item) for item in row['consequents'])

    return {
        'antecedents': ', '.join(antecedents_clean),
        'consequents': ', '.join(consequents_clean),
        'support': row['support'],
        'confidence': row['confidence'],
        'lift': row['lift']
    }


if rules_list:
    rules_df = pd.DataFrame(rules_list)

    # Sort by lift (primary), confidence (secondary), and support (tertiary), strongest first
    sorted_rules = rules_df.sort_values(['lift', 'confidence', 'support'], ascending=False)

    # Apply the formatting function and convert to DataFrame
    readable_rules = [format_rule(row) for _, row in sorted_rules.iterrows()]
    result_df = pd.DataFrame(readable_rules)

    # Display the top rules (at most 4, fewer if fewer rules were found)
    k = min(4, len(result_df))
    print(f"\nTop {k} Rules with 'Road User' as Consequent (sorted by lift, confidence, support):")
    print(result_df.head(k))

    # Save every rule (not only the displayed ones) to CSV for reporting
    result_df.to_csv('top_road_user_rules.csv', index=False)

else:
    print("No rules found with 'Road User' in the consequent.")
    print("Try adjusting the min_support parameter.")

print("Association rule mining analysis complete.")


Top 4 Rules with 'Road User' as Consequent (sorted by lift, confidence, support):
  antecedents       consequents  support  confidence    lift
0      Female         Passenger   0.1043      0.3726  1.6788
1        Male  Motorcycle rider   0.1292      0.1794  1.3370
2       40-64            Driver   0.1353      0.5085  1.1125
3       26-39            Driver   0.1176      0.5054  1.1057
Association rule mining analysis complete.
