# ***Import necessary packages***

In [1]:
import pandas as pd
import itertools
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# ***Scratch Code***

In [2]:
# Settings
MIN_SUPPORT = 0.4
MIN_CONFIDENCE = 0.6

In [3]:
def get_transactions_for_cluster(df, cluster_id):

    df_c = df[df['cluster'] == cluster_id].copy()

    transactions = []
    for _, row in df_c.iterrows():
        transaction = []

        # --- Add Categorical Attributes (Column=Value) ---
        categorical_cols = ['TIME_OF_DAY', 'WEEKEND_FLAG', 'SEASON', 'AGE_GROUP']
        for col in categorical_cols:
            if pd.notna(row[col]):
                transaction.append(f"{col}={row[col]}")

        # --- Add Binary Attributes (Column Name if 1) ---
        binary_cols = [
            'RUSH_HOUR', 'MALE', 'ADVERSE_WEATHER', 'DARK_CONDITIONS',
            'OLD_VEHICLE', 'PASSENGER_CAR', 'LARGE_TRUCK', 'MOTORCYCLE',
            'URBAN', 'INTERSTATE', 'INTERSECTION', 'WORK_ZONE_CRASH',
            'ROLLOVER_CRASH', 'FIRE'
        ]

        for col in binary_cols:
            if row[col] == 1:
                transaction.append(col)

        transactions.append(transaction)

    return transactions


In [4]:
def create_C1(dataset):

    C1 = []
    for transaction in dataset:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))

def scan_D(dataset, candidates, min_support):

    ss_cnt = {}
    for tid in dataset:
        for can in candidates:
            if can.issubset(tid):
                if can not in ss_cnt: ss_cnt[can] = 1
                else: ss_cnt[can] += 1

    num_items = float(len(dataset))
    ret_list = []
    support_data = {}

    for key in ss_cnt:
        support = ss_cnt[key] / num_items
        if support >= min_support:
            ret_list.insert(0, key)
        support_data[key] = support
    return ret_list, support_data

def apriori_gen(Lk, k):

    ret_list = []
    len_Lk = len(Lk)
    for i in range(len_Lk):
        for j in range(i + 1, len_Lk):
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1 == L2: # Join step
                ret_list.append(Lk[i] | Lk[j])
    return ret_list

In [5]:
def apriori_scratch(transactions, min_support=MIN_SUPPORT):
    # Convert transactions to set for faster lookup
    D = list(map(set, transactions))

    # Generate L1 (Frequent 1-itemsets)
    C1 = create_C1(transactions)
    L1, support_data = scan_D(D, C1, min_support)
    L = [L1]

    k = 2
    while (len(L[k-2]) > 0):
        Ck = apriori_gen(L[k-2], k)
        Lk, supK = scan_D(D, Ck, min_support)
        support_data.update(supK)
        if len(Lk) == 0:
            break
        L.append(Lk)
        k += 1

    return L, support_data

In [6]:
def generate_rules_scratch(L, support_data, min_confidence=MIN_CONFIDENCE):
    big_rule_list = []

    for i in range(1, len(L)): # Start from L2 (2-itemsets)
        for freq_set in L[i]:
            H1 = [frozenset([item]) for item in freq_set]

            # 1. Always calculate confidence for 1-item consequents first
            H1 = calc_conf(freq_set, H1, support_data, big_rule_list, min_confidence)

            # 2. If itemset has more than 2 items AND we have valid consequents, recurse
            if i > 1 and len(H1) > 0:
                rules_from_conseq(freq_set, H1, support_data, big_rule_list, min_confidence)

    return big_rule_list

def calc_conf(freq_set, H, support_data, brl, min_conf=MIN_CONFIDENCE):
    pruned_H = []
    for conseq in H:
        conf = support_data[freq_set] / support_data[freq_set - conseq]
        if conf >= min_conf:
            lift = conf / support_data[conseq]
            # Format: (antecedents, consequents, confidence, lift, support)
            brl.append((freq_set - conseq, conseq, conf, lift, support_data[freq_set]))
            pruned_H.append(conseq)
    return pruned_H

def rules_from_conseq(freq_set, H, support_data, brl, min_conf=MIN_CONFIDENCE):
    m = len(H[0])
    if (len(freq_set) > (m + 1)):
        Hmp1 = apriori_gen(H, m + 1)
        Hmp1 = calc_conf(freq_set, Hmp1, support_data, brl, min_conf)
        if (len(Hmp1) > 1):
            rules_from_conseq(freq_set, Hmp1, support_data, brl, min_conf)

# ***MLXTEND Pre-Built (cross-verification of resultst)***

In [7]:
def verify_mlxtend(transactions, min_support=MIN_SUPPORT, min_confidence=MIN_CONFIDENCE):

    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)


    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)

    if frequent_itemsets.empty:
        return pd.DataFrame()


    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    if rules.empty:
        return pd.DataFrame()


    rules = rules.sort_values(by='lift', ascending=False)


    display_df = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
    display_df['antecedents'] = display_df['antecedents'].apply(lambda x: ", ".join(list(x)))
    display_df['consequents'] = display_df['consequents'].apply(lambda x: ", ".join(list(x)))
    display_df.columns = ['Antecedents', 'Consequents', 'Support', 'Confidence', 'Lift']

    return display_df.round(3)

# ***Testing on Each Clusters to find rules and relations***

In [8]:
df = pd.read_parquet('/content/fatal_accident_clusters_v2.parquet')
unique_clusters = df['cluster'].unique()

print("Found Clusters:", unique_clusters)

Found Clusters: [2 5 0 1 3 4]


In [9]:
def analyze_cluster_rules(cluster_id, data=df, min_sup=MIN_SUPPORT, min_conf=MIN_CONFIDENCE):

    print('='*60)
    print(f"ANALYZING CLUSTER: {cluster_id}")
    print('='*60)

    # Fetch transactions using your existing helper function
    transactions = get_transactions_for_cluster(data, cluster_id)

    print(f"Transactions: {len(transactions)}")

    # --- 1. Scratch Implementation ---

    L, sup_data = apriori_scratch(transactions, min_support=min_sup)
    rules = generate_rules_scratch(L, sup_data, min_confidence=min_conf)

    print("\n--- [SCRATCH] Top 15 Rules ---\n")
    if rules:
        # Sort by Lift (index 3) descending
        rules.sort(key=lambda x: x[3], reverse=True)

        # Create DF
        df_s = pd.DataFrame(rules, columns=['Antecedents', 'Consequents', 'Confidence', 'Lift', 'Support'])

        # Format sets to strings for display
        df_s['Antecedents'] = df_s['Antecedents'].apply(lambda x: ", ".join(list(x)))
        df_s['Consequents'] = df_s['Consequents'].apply(lambda x: ", ".join(list(x)))

        # Display specific column order
        print(df_s[['Antecedents', 'Consequents', 'Support', 'Confidence', 'Lift']].head(15).round(3).to_string(index=False))
    else:
        print("No rules found (Scratch).")

    # --- 2. Mlxtend Verification ---

    print("\n--- [MLXTEND] Top 15 Rules ---\n")

    df_m = verify_mlxtend(transactions, min_support=min_sup, min_confidence=min_conf)

    if not df_m.empty:
        print(df_m.head(15).to_string(index=False))
    else:
        print("No rules found (Mlxtend).")

In [10]:
analyze_cluster_rules(0)

ANALYZING CLUSTER: 0
Transactions: 26348

--- [SCRATCH] Top 15 Rules ---

                Antecedents                Consequents  Support  Confidence  Lift
                      URBAN                       MALE    0.504       0.761 1.024
                       MALE                      URBAN    0.504       0.679 1.024
 WEEKEND_FLAG=Weekday, MALE                      URBAN    0.401       0.678 1.023
                      URBAN WEEKEND_FLAG=Weekday, MALE    0.401       0.605 1.023
              PASSENGER_CAR                      URBAN    0.484       0.674 1.017
                      URBAN              PASSENGER_CAR    0.484       0.730 1.017
                      URBAN            AGE_GROUP=Adult    0.431       0.650 1.015
            AGE_GROUP=Adult                      URBAN    0.431       0.673 1.015
                       MALE            AGE_GROUP=Adult    0.480       0.646 1.009
            AGE_GROUP=Adult                       MALE    0.480       0.750 1.009
       WEEKEND_FLAG=Week

In [11]:
analyze_cluster_rules(1)

ANALYZING CLUSTER: 1
Transactions: 5653

--- [SCRATCH] Top 15 Rules ---

                          Antecedents                           Consequents  Support  Confidence  Lift
AGE_GROUP=Adult, WEEKEND_FLAG=Weekday                             RUSH_HOUR    0.468       0.739 1.066
                            RUSH_HOUR AGE_GROUP=Adult, WEEKEND_FLAG=Weekday    0.468       0.674 1.066
                 WEEKEND_FLAG=Weekday                             RUSH_HOUR    0.693       0.735 1.060
                            RUSH_HOUR                  WEEKEND_FLAG=Weekday    0.693       1.000 1.060
               OLD_VEHICLE, RUSH_HOUR                  WEEKEND_FLAG=Weekday    0.410       1.000 1.060
           AGE_GROUP=Adult, RUSH_HOUR                  WEEKEND_FLAG=Weekday    0.468       1.000 1.060
                      RUSH_HOUR, MALE                  WEEKEND_FLAG=Weekday    0.552       1.000 1.060
           WEEKEND_FLAG=Weekday, MALE                             RUSH_HOUR    0.552       0.715 1.031


In [12]:
analyze_cluster_rules(2)

ANALYZING CLUSTER: 2
Transactions: 9296

--- [SCRATCH] Top 15 Rules ---

                             Antecedents                            Consequents  Support  Confidence  Lift
  AGE_GROUP=Adult, DARK_CONDITIONS, MALE                      TIME_OF_DAY=Night    0.401       0.754 1.191
                       TIME_OF_DAY=Night AGE_GROUP=Adult, DARK_CONDITIONS, MALE    0.401       0.634 1.191
      AGE_GROUP=Adult, TIME_OF_DAY=Night                  DARK_CONDITIONS, MALE    0.401       0.900 1.180
                   DARK_CONDITIONS, MALE                      TIME_OF_DAY=Night    0.569       0.746 1.179
                       TIME_OF_DAY=Night                  DARK_CONDITIONS, MALE    0.569       0.899 1.179
                 MALE, TIME_OF_DAY=Night       AGE_GROUP=Adult, DARK_CONDITIONS    0.401       0.703 1.176
        AGE_GROUP=Adult, DARK_CONDITIONS                MALE, TIME_OF_DAY=Night    0.401       0.671 1.176
        AGE_GROUP=Adult, DARK_CONDITIONS                      TIME_OF_D

In [13]:
analyze_cluster_rules(3)

ANALYZING CLUSTER: 3
Transactions: 4804

--- [SCRATCH] Top 15 Rules ---

                     Antecedents                Consequents  Support  Confidence  Lift
TIME_OF_DAY=Evening, URBAN, MALE            DARK_CONDITIONS    0.415       0.871 1.208
      TIME_OF_DAY=Evening, URBAN            DARK_CONDITIONS    0.437       0.868 1.205
                 DARK_CONDITIONS TIME_OF_DAY=Evening, URBAN    0.437       0.606 1.205
       TIME_OF_DAY=Evening, MALE            DARK_CONDITIONS    0.472       0.861 1.195
                 DARK_CONDITIONS  TIME_OF_DAY=Evening, MALE    0.472       0.655 1.195
           DARK_CONDITIONS, MALE TIME_OF_DAY=Evening, URBAN    0.415       0.601 1.193
      TIME_OF_DAY=Evening, URBAN      DARK_CONDITIONS, MALE    0.415       0.825 1.193
                 DARK_CONDITIONS        TIME_OF_DAY=Evening    0.497       0.690 1.192
             TIME_OF_DAY=Evening            DARK_CONDITIONS    0.497       0.859 1.192
           DARK_CONDITIONS, MALE        TIME_OF_DAY=Eveni

In [14]:
analyze_cluster_rules(4)

ANALYZING CLUSTER: 4
Transactions: 6768

--- [SCRATCH] Top 15 Rules ---

                            Antecedents                           Consequents  Support  Confidence  Lift
                 URBAN, DARK_CONDITIONS                     TIME_OF_DAY=Night    0.513       0.706 1.207
                      TIME_OF_DAY=Night                URBAN, DARK_CONDITIONS    0.513       0.877 1.207
       PASSENGER_CAR, TIME_OF_DAY=Night                URBAN, DARK_CONDITIONS    0.496       0.876 1.206
                 URBAN, DARK_CONDITIONS      PASSENGER_CAR, TIME_OF_DAY=Night    0.496       0.683 1.206
  PASSENGER_CAR, URBAN, DARK_CONDITIONS                     TIME_OF_DAY=Night    0.496       0.702 1.201
                      TIME_OF_DAY=Night PASSENGER_CAR, URBAN, DARK_CONDITIONS    0.496       0.848 1.201
                        DARK_CONDITIONS                     TIME_OF_DAY=Night    0.584       0.682 1.165
                      TIME_OF_DAY=Night                       DARK_CONDITIONS    0.584 

In [15]:
analyze_cluster_rules(5)

ANALYZING CLUSTER: 5
Transactions: 2856

--- [SCRATCH] Top 15 Rules ---

                                Antecedents                                 Consequents  Support  Confidence  Lift
      DARK_CONDITIONS, WEEKEND_FLAG=Weekday                         TIME_OF_DAY=Evening    0.414       0.601 1.199
                        TIME_OF_DAY=Evening       DARK_CONDITIONS, WEEKEND_FLAG=Weekday    0.414       0.826 1.199
                        TIME_OF_DAY=Evening                             DARK_CONDITIONS    0.435       0.867 1.193
  TIME_OF_DAY=Evening, WEEKEND_FLAG=Weekday                             DARK_CONDITIONS    0.414       0.867 1.193
                      AGE_GROUP=Adult, MALE           LARGE_TRUCK, WEEKEND_FLAG=Weekday    0.461       0.635 1.169
          LARGE_TRUCK, WEEKEND_FLAG=Weekday                       AGE_GROUP=Adult, MALE    0.461       0.849 1.169
                      AGE_GROUP=Adult, MALE                                 LARGE_TRUCK    0.502       0.691 1.164
       