In [1]:
import numpy as np
import pandas as pd

In [2]:
#Load the raw order log; the relative path is resolved against the working directory
df = pd.read_csv(filepath_or_buffer='Order1.csv')

In [3]:
#Preview the first five rows to confirm the columns were read as expected
df.head(n=5)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
#Count the missing entries in each column (all zeros means nothing has to be imputed or dropped)
df.isna().sum(axis=0)

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [7]:
#Build the baskets: all items sharing a (member, purchase date) pair form one transaction
transactions_series = (
    df.groupby(by=['Member_number', 'Date'])['itemDescription']
      .apply(list)
)
transactions = transactions_series.tolist()

#Peek at the first five baskets and report how many transactions were formed
print(transactions[:5])
print(len(transactions))

[['sausage', 'whole milk', 'semi-finished bread', 'yogurt'], ['whole milk', 'pastry', 'salty snack'], ['canned beer', 'misc. beverages'], ['sausage', 'hygiene articles'], ['soda', 'pickled vegetables']]
14963


In [10]:
#Apriori works on a boolean item matrix, so one-hot encode the baskets:
#one row per transaction, one column per distinct item
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_onehot = pd.DataFrame(data=te_ary, columns=te.columns_)

df_onehot.head(n=5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
#Import the Apriori frequent-itemset miner and the association-rule generator
from mlxtend.frequent_patterns import apriori, association_rules

In [13]:
#Minimum support is a fraction of all transactions. Derive it from an absolute
#count so the cut-off reads as "the itemset occurs in at least this many baskets":
#    min_support = MIN_ITEMSET_COUNT / number of transactions
MIN_ITEMSET_COUNT = 50
min_support = MIN_ITEMSET_COUNT / len(transactions)

#Mine every itemset whose support reaches the threshold;
#use_colnames=True reports item names instead of column indices
frequent_itemsets = apriori(df_onehot, min_support=min_support, use_colnames=True)

#Show the ten itemsets with the highest support
print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))

      support            itemsets
102  0.157923        (whole milk)
65   0.122101  (other vegetables)
78   0.110005        (rolls/buns)
87   0.097106              (soda)
103  0.085879            (yogurt)
79   0.069572   (root vegetables)
95   0.067767    (tropical fruit)
7    0.060683     (bottled water)
82   0.060349           (sausage)
22   0.053131      (citrus fruit)


In [15]:
#Derive association rules from the frequent itemsets, keeping those with lift >= 1.0
#(lift above 1 means the items co-occur more often than independence would predict)
rules = association_rules(frequent_itemsets, min_threshold=1.0, metric="lift")

#Rank the rules: highest confidence first, ties broken by highest lift.
#Sorting only reorders the rules; it does not drop any of them.
rules_sorted = rules.sort_values(by=['confidence', 'lift'], ascending=[False, False])

print(rules_sorted.head(n=10))

                antecedents         consequents  antecedent support  \
4             (frankfurter)  (other vegetables)            0.037760   
6   (fruit/vegetable juice)        (rolls/buns)            0.034017   
11                (sausage)              (soda)            0.060349   
12                (sausage)            (yogurt)            0.060349   
2            (citrus fruit)            (yogurt)            0.053131   
1            (bottled beer)           (sausage)            0.045312   
8           (shopping bags)   (root vegetables)            0.047584   
13                 (yogurt)           (sausage)            0.085879   
10                   (soda)           (sausage)            0.097106   
0                 (sausage)      (bottled beer)            0.060349   

    consequent support   support  confidence      lift  representativity  \
4             0.122101  0.005146    0.136283  1.116150               1.0   
6             0.110005  0.003743    0.110020  1.000136            

In [20]:
#Ask for one item and look up the rules whose antecedent is exactly that single item
item_in = input("Enter an item to find rules for (e.g., whole milk): ").strip()

#The lookup is an exact match on the 'antecedents' column, so wrap the item in a frozenset
item_set = frozenset([item_in])

matching_rules = rules_sorted.loc[rules_sorted['antecedents'] == item_set]

if not matching_rules.empty:
    print(f"\n--- Rules for 'IF customer buys {item_in}' ---")
    print(matching_rules)
else:
    print(f"\nNo rules found where '{item_in}' is the only item.")

Enter an item to find rules for (e.g., whole milk):  sausage



--- Rules for 'IF customer buys sausage' ---
   antecedents     consequents  antecedent support  consequent support  \
11   (sausage)          (soda)            0.060349            0.097106   
12   (sausage)        (yogurt)            0.060349            0.085879   
0    (sausage)  (bottled beer)            0.060349            0.045312   

     support  confidence      lift  representativity  leverage  conviction  \
11  0.005948    0.098560  1.014975               1.0  0.000088    1.001613   
12  0.005748    0.095238  1.108986               1.0  0.000565    1.010345   
0   0.003342    0.055371  1.222000               1.0  0.000607    1.010649   

    zhangs_metric   jaccard  certainty  kulczynski  
11       0.015702  0.039259   0.001611    0.079906  
12       0.104587  0.040913   0.010239    0.081082  
0        0.193337  0.032658   0.010537    0.064559  
