In [9]:
import pandas as pd
from ast import literal_eval
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

You should already be familiar with the concept of association rules and the Apriori algorithm. Association rule mining is a method for discovering patterns within large data sets. It focuses on identifying relationships between variables and leveraging those connections to make predictions or informed decisions. The primary objective is to uncover rules that reveal the associations between various items in the data.



### Task
Load the data from the `data.txt` file; it contains the grocery shopping lists of nearly 2000 customers.
Store it in a boolean one-hot encoded DataFrame: True for items bought in a given transaction, False otherwise.

In [10]:
# Parse data.txt into a list of transactions, one tuple of items per
# non-blank line (each line is presumably a Python list literal -- the
# file itself is not visible from here).
raw_entries = []
with open("data.txt") as f:
    for line in f:
        # Lines read from a file keep their trailing newline, so a blank
        # line is "\n" and never "". Strip before testing; otherwise blank
        # lines reach literal_eval and raise.
        stripped = line.strip()
        if not stripped:
            continue
        raw_entries.append(tuple(literal_eval(stripped)))

In [11]:
# One-hot encode the baskets: join each transaction's items with '|',
# let str.get_dummies split on that separator into one indicator column
# per item, then cast the 0/1 indicators to booleans.
entries_series = pd.Series(raw_entries)
pipe_joined = entries_series.map('|'.join)
hot_encoded = pipe_joined.str.get_dummies(sep='|').astype(bool)

To extract rules you can use, e.g., the Apriori algorithm implemented in mlxtend. Other algorithms perform the same task using different approaches; for example, FP-Growth internally uses a tree-based structure, which makes it faster in most real-life examples.

### Task
Find association rules using the selected algorithm.

In [12]:
# Mine frequent itemsets with Apriori: keep itemsets whose support is at
# least 0.01 and label them with column (item) names.
apriorised = apriori(
    hot_encoded,
    min_support=0.01,
    use_colnames=True,
)

In [13]:
# Derive rules from the frequent itemsets. The confidence threshold of 0
# is the lowest possible, so no rule is filtered out at this stage.
rules = association_rules(
    apriorised,
    metric="confidence",
    min_threshold=0,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(apple),(banana),0.151879,0.165449,0.062630,0.412371,2.492439,1.0,0.037502,1.420201,0.706015,0.245902,0.295874,0.395460
1,(banana),(apple),0.165449,0.151879,0.062630,0.378549,2.492439,1.0,0.037502,1.364743,0.717495,0.245902,0.267261,0.395460
2,(apple),(beef),0.151879,0.127871,0.013570,0.089347,0.698731,1.0,-0.005851,0.957697,-0.337037,0.050980,-0.044172,0.097735
3,(beef),(apple),0.127871,0.151879,0.013570,0.106122,0.698731,1.0,-0.005851,0.948811,-0.330828,0.050980,-0.053950,0.097735
4,(apple),(bread),0.151879,0.253653,0.041754,0.274914,1.083818,1.0,0.003229,1.029322,0.091185,0.114778,0.028486,0.219762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1747,"(yogurt, sausage)","(grill, mustard)",0.057933,0.057411,0.011482,0.198198,3.452252,1.0,0.008156,1.175588,0.754017,0.110553,0.149362,0.199099
1748,(grill),"(mustard, yogurt, sausage)",0.113257,0.012526,0.011482,0.101382,8.093702,1.0,0.010064,1.098881,0.988389,0.100457,0.089984,0.509025
1749,(mustard),"(grill, yogurt, sausage)",0.080898,0.024008,0.011482,0.141935,5.911921,1.0,0.009540,1.137434,0.903980,0.122905,0.120828,0.310098
1750,(yogurt),"(grill, mustard, sausage)",0.237474,0.057411,0.011482,0.048352,0.842198,1.0,-0.002151,0.990480,-0.197253,0.040516,-0.009611,0.124176


### Task
Can you find dissociation rules (rules with "negative knowledge"), e.g. if beer and sausage, then mustard and NOT red wine?
The simplest approach is to treat the lack of a given item in a basket as a separate item: instead of having a basket with A and B, represent it as A, B, NOT C, NOT D, and NOT E. Then proceed with the classical rule mining algorithm.

In [14]:
# Add a complementary 'not_<item>' column for every item: it is True
# exactly where the original item column is False.
inverted_encoded = (~hot_encoded).add_prefix('not_')
expanded_encoded = pd.concat([hot_encoded, inverted_encoded], axis=1)

In [15]:
def has_positive_item(row):
    """Return True if ``row`` holds at least one item without the 'not_' prefix.

    ``row`` is any iterable of item-name strings; an empty iterable
    yields False.
    """
    for item in row:
        if not item.startswith('not_'):
            return True
    return False

In [16]:
# Mine the expanded (item + 'not_<item>') encoding. The thresholds are
# much higher here (support 0.3, confidence 0.5) than for the plain
# encoding above.
expanded_apriorised = apriori(
    expanded_encoded,
    use_colnames=True,
    min_support=0.3,
)
all_rules = association_rules(
    expanded_apriorised,
    min_threshold=0.5,
    metric="confidence",
)

In [18]:
# Keep only rules of the form "positive items -> negated items": no
# 'not_' item in the antecedent, nothing but 'not_' items in the consequent.
positive_antecedent = all_rules['antecedents'].apply(
    lambda items: not any(item.startswith('not_') for item in items)
)
negative_consequent = all_rules['consequents'].apply(
    lambda items: all(item.startswith('not_') for item in items)
)
filtered_rules = all_rules[positive_antecedent & negative_consequent]

In [19]:
# Display the positive-antecedent / negated-consequent rules selected above.
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(chicken),(not_apple),0.420668,0.848121,0.358038,0.851117,1.003532,1.0,0.001260,1.020120,0.006075,0.393123,0.019723,0.636635
1,(chicken),(not_banana),0.420668,0.834551,0.353340,0.839950,1.006470,1.0,0.002271,1.033735,0.011096,0.391782,0.032634,0.631670
2,(chicken),(not_beef),0.420668,0.872129,0.411795,0.978908,1.122435,1.0,0.044918,6.062569,0.188285,0.467417,0.835053,0.725540
3,(chicken),(not_bread),0.420668,0.746347,0.316284,0.751861,1.007389,1.0,0.002320,1.022223,0.012660,0.371779,0.021740,0.587819
4,(chicken),(not_butter),0.420668,0.992693,0.416493,0.990074,0.997362,1.0,-0.001102,0.736169,-0.004545,0.417801,-0.358384,0.704816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1714950,(chicken),"(not_pork, not_mustard, not_butter, not_wagyu,...",0.420668,0.527662,0.307411,0.730769,1.384920,1.0,0.085441,1.754399,0.479753,0.479642,0.430004,0.656680
1715116,(chicken),"(not_pork, not_mustard, not_butter, not_wagyu,...",0.420668,0.585073,0.339770,0.807692,1.380498,1.0,0.093649,2.157620,0.475761,0.510188,0.536526,0.694212
1715268,(chicken),"(not_pork, not_mustard, not_wagyu, not_ketchup...",0.420668,0.524008,0.304280,0.723325,1.380369,1.0,0.083846,1.720400,0.475645,0.475143,0.418740,0.652001
1715411,(chicken),"(not_pork, not_mustard, not_butter, not_wagyu,...",0.420668,0.599687,0.311065,0.739454,1.233067,1.0,0.058796,1.536440,0.326262,0.438558,0.349145,0.629083


The problem is that we now have a lot of data, even though we use a toy example. In real shops there are thousands of articles, so each basket will be huge.

We need to find a better way of finding reasonable negative rules.

Let's say we have a rule "if A then not B", where A and B are sets of items. For this rule to be 'interesting', B itself should be a frequent set, and obviously so should A. Moreover, if A and B together are frequent, then the rule "if A then not B" doesn't seem valid. Based on these assumptions, you can reduce the number of potential negative rules to check.



In [21]:
def is_interesting_negative_rule(rule, frequent_sets):
    """Decide whether ``rule`` is an 'interesting' negative rule "if A then not B".

    A rule qualifies when all of the following hold:

    * the antecedent A is non-empty and has no ``not_``-prefixed item;
    * the consequent is non-empty and has only ``not_``-prefixed items;
    * A is in ``frequent_sets``;
    * B (the consequent with the ``not_`` prefix stripped) is in
      ``frequent_sets``;
    * A ∪ B is *not* in ``frequent_sets``.

    Parameters
    ----------
    rule : mapping-like
        Anything indexable by ``'antecedents'`` and ``'consequents'`` (for
        example a row of the rules DataFrame), each yielding a frozenset of
        item-name strings.
    frequent_sets : container of frozenset
        Frequent itemsets made of positive items only; membership is
        tested with ``in``.

    Returns
    -------
    bool
    """
    prefix = 'not_'
    antecedent = rule['antecedents']
    consequent = rule['consequents']

    # Guard clauses: both sides must be non-empty, A purely positive and
    # the consequent purely negated.
    if not antecedent or not consequent:
        return False
    if any(item.startswith(prefix) for item in antecedent):
        return False
    if not all(item.startswith(prefix) for item in consequent):
        return False

    # Strip only the leading marker. str.replace would also delete 'not_'
    # occurring inside an item name (e.g. 'hot_not_cold') and so look up
    # the wrong itemset.
    positive_consequent = frozenset(item[len(prefix):] for item in consequent)

    return (
        antecedent in frequent_sets
        and positive_consequent in frequent_sets
        and antecedent.union(positive_consequent) not in frequent_sets
    )

In [22]:
# Test every rule from the expanded encoding against the frequent
# itemsets mined from the plain encoding, then keep the rules that pass.
frequent_positive_sets = set(apriorised['itemsets'])
interesting_mask = all_rules.apply(
    is_interesting_negative_rule,
    axis=1,
    frequent_sets=frequent_positive_sets,
)
interesting_negative_rules = all_rules[interesting_mask]

In [23]:
# Display the negative rules that passed is_interesting_negative_rule.
interesting_negative_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2,(chicken),(not_beef),0.420668,0.872129,0.411795,0.978908,1.122435,1.0,0.044918,6.062569,0.188285,0.467417,0.835053,0.725540
341,(chicken),"(not_beef, not_apple)",0.420668,0.733820,0.350209,0.832506,1.134482,1.0,0.041514,1.589190,0.204616,0.435432,0.370749,0.654873
347,(chicken),"(not_eggs, not_apple)",0.420668,0.767745,0.322025,0.765509,0.997087,1.0,-0.000941,0.990462,-0.005018,0.371687,-0.009630,0.592476
350,(chicken),"(not_grill, not_apple)",0.420668,0.748434,0.316284,0.751861,1.004579,1.0,0.001442,1.013810,0.007867,0.370869,0.013622,0.587228
363,(chicken),"(not_pork, not_apple)",0.420668,0.671712,0.345511,0.821340,1.222756,1.0,0.062944,1.837501,0.314458,0.462614,0.455783,0.667857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6605,(chicken),"(not_chocolate, not_grill, not_mustard)",0.420668,0.706681,0.303236,0.720844,1.020042,1.0,0.005958,1.050735,0.033915,0.367954,0.048286,0.574971
6808,(chicken),"(not_mustard, not_grill, not_pork)",0.420668,0.682672,0.355950,0.846154,1.239473,1.0,0.068771,2.062630,0.333497,0.476257,0.515182,0.683780
6846,(chicken),"(not_sausage, not_grill, not_pork)",0.420668,0.588205,0.305846,0.727047,1.236045,1.0,0.058407,1.508669,0.329634,0.435041,0.337164,0.623506
6970,(chicken),"(not_mustard, not_sausage, not_pork)",0.420668,0.574635,0.304280,0.723325,1.258756,1.0,0.062549,1.537419,0.354831,0.440332,0.349559,0.626422


### Task
The association rules are characterized by high support - frequency in the dataset. Can you use this algorithm as a base and try to extract different types of rules:
 - low frequency but strong relation rules, e.g. buying a Porsche and a Rolex is not frequent in the dataset, but people who bought a Porsche usually also bought a Rolex
 - negative rules e.g. if someone bought low-fat milk it's unlikely there will be whole milk in the basket
 - disjunction e.g. eggs and (kielecki xor winiary ;) )
 - imagine 50% of baskets have milk and 50% of baskets have tea. If there is no relation between them then in ~25% of baskets we will have both. If milk appears together with tea in e.g. 40% of baskets it means there is a pattern. Can you find such rules and use statistical tests to check if the relation is strong?

 Send the report within 144 hours starting from the end of this class to gmiebs@cs.put.poznan.pl; start this email's subject with [IR]


In [24]:
# 1. LF rules
# Lower the support floor to 0.005 so rare itemsets survive, then keep
# only rules that clear the lift threshold of 5.
low_support_apriorised = apriori(
    hot_encoded,
    use_colnames=True,
    min_support=0.005,
)
strong_relation_rules = association_rules(
    low_support_apriorised,
    min_threshold=5,
    metric="lift",
)

# Highest confidence first, ties broken by lift.
strongest_first = strong_relation_rules.sort_values(
    by=['confidence', 'lift'], ascending=False
)
print(strongest_first.head())

                            antecedents        consequents  \
1101  (apple, ketchup, sausage, orange)    (grill, banana)   
1106    (grill, banana, apple, ketchup)  (sausage, orange)   
729              (grill, apple, orange)  (banana, sausage)   
1095    (grill, apple, ketchup, orange)  (banana, sausage)   
855              (mustard, bread, milk)   (grill, sausage)   

      antecedent support  consequent support   support  confidence       lift  \
1101            0.005219            0.019833  0.005219         1.0  50.421053   
1106            0.005219            0.034969  0.005219         1.0  28.597015   
729             0.008351            0.043319  0.008351         1.0  23.084337   
1095            0.005219            0.043319  0.005219         1.0  23.084337   
855             0.007829            0.105428  0.007829         1.0   9.485149   

      representativity  leverage  conviction  zhangs_metric   jaccard  \
1101               1.0  0.005116         inf       0.985310  0.2631

In [25]:
# 2. Negative rules
# Build every rule (confidence threshold 0), then keep those whose lift is
# below 0.5, i.e. the two sides co-occur less than half as often as
# independence would predict.
all_rules_for_negation = association_rules(
    apriorised,
    min_threshold=0,
    metric="confidence",
)
has_low_lift = all_rules_for_negation['lift'] < 0.5
negative_correlation_rules = all_rules_for_negation[has_low_lift]

lowest_lift_first = negative_correlation_rules.sort_values(by='lift', ascending=True)
print(lowest_lift_first.head())

    antecedents consequents  antecedent support  consequent support   support  \
134   (chicken)      (pork)            0.420668            0.205115  0.013048   
135      (pork)   (chicken)            0.205115            0.420668  0.013048   

     confidence     lift  representativity  leverage  conviction  \
134    0.031017  0.15122               1.0 -0.073237    0.820330   
135    0.063613  0.15122               1.0 -0.073237    0.618689   

     zhangs_metric   jaccard  certainty  kulczynski  
134      -0.906442  0.021295  -0.219022    0.047315  
135      -0.875950  0.021295  -0.616322    0.047315  


In [29]:
# 3. Disjunction rules
# Select rules whose consequent is exactly one side of "chicken xor wagyu":
# either {chicken, not_wagyu} or {wagyu, not_chicken}.
chicken_without_wagyu = frozenset({'chicken', 'not_wagyu'})
wagyu_without_chicken = frozenset({'wagyu', 'not_chicken'})

rule_consequents = all_rules['consequents']
is_xor_consequent = (
    (rule_consequents == chicken_without_wagyu) |
    (rule_consequents == wagyu_without_chicken)
)
xor_rules = all_rules[is_xor_consequent]

print(xor_rules)

                                               antecedents  \
607                                             (not_pork)   
5575                                 (not_pork, not_apple)   
5796                                (not_pork, not_banana)   
6165                                  (not_pork, not_beef)   
6213                                 (not_pork, not_bread)   
...                                                    ...   
1714818  (not_pork, not_mustard, not_butter, not_ketchu...   
1714966  (not_pork, not_mustard, not_butter, not_ketchu...   
1715136  (not_pork, not_mustard, not_ketchup, not_beef,...   
1715283  (not_pork, not_mustard, not_butter, not_ketchu...   
2991190  (not_pork, not_mustard, not_butter, not_ketchu...   

                  consequents  antecedent support  consequent support  \
607      (not_wagyu, chicken)            0.794885            0.416493   
5575     (not_wagyu, chicken)            0.671712            0.416493   
5796     (not_wagyu, chicken)       

In [30]:
from scipy.stats import chi2
import numpy as np

# 4. Statistically significant
# Chi-square test of independence (1 degree of freedom) on the 2x2 table
# "antecedent present/absent" x "consequent present/absent", computed
# directly from the rule supports, with Yates' continuity correction.
N = len(hot_encoded)

# |observed - expected| number of baskets containing both sides, where the
# expected count assumes antecedent and consequent are independent.
obs_expected_diff = N * np.abs(rules['support'] -
                               (rules['antecedent support'] * rules['consequent support']))

# Yates' correction shrinks the deviation by 0.5 but must not push it
# below zero: unclamped, a deviation under 0.5 goes negative and squares
# into a spuriously positive statistic.
numerator = np.maximum(obs_expected_diff - 0.5, 0) ** 2

# Product of the marginal variances s * (1 - s) of both sides.
# NOTE(review): this is zero if either side has support exactly 1.0,
# which would make the statistic inf/NaN -- not guarded here.
term_A = rules['antecedent support'] * (1 - rules['antecedent support'])
term_B = rules['consequent support'] * (1 - rules['consequent support'])
denominator = N * term_A * term_B

# These two columns are added to `rules` in place; re-running the cell
# simply overwrites them.
rules['chi2'] = numerator / denominator
rules['p_value'] = chi2.sf(rules['chi2'], df=1)

# NOTE(review): p-values are not adjusted for the number of rules tested.
significant_rules = rules[(rules['p_value'] < 0.05) & (rules['lift'] > 1.5)]

print("\nStatistically Significant Rules (p < 0.05):")
print(significant_rules.sort_values(by='lift', ascending=False).head())



Statistically Significant Rules (p < 0.05):
             antecedents         consequents  antecedent support  \
1323     (grill, banana)  (sausage, ketchup)            0.019833   
1326  (sausage, ketchup)     (grill, banana)            0.052192   
1648  (mustard, sausage)  (grill, chocolate)            0.060543   
1645  (grill, chocolate)  (mustard, sausage)            0.020355   
1490     (grill, cheese)  (ketchup, sausage)            0.023486   

      consequent support   support  confidence       lift  representativity  \
1323            0.052192  0.011482    0.578947  11.092632               1.0   
1326            0.019833  0.011482    0.220000  11.092632               1.0   
1648            0.020355  0.013048    0.215517  10.587975               1.0   
1645            0.060543  0.013048    0.641026  10.587975               1.0   
1490            0.052192  0.012004    0.511111   9.792889               1.0   

      leverage  conviction  zhangs_metric   jaccard  certainty  kulczyn