In [36]:
import numpy as np
import pandas as pd

DataSet Loading And Inspection

In [37]:
df=pd.read_csv('Groceries_dataset.csv')

In [38]:
print(df.shape)
print(df.head())
print(df.info())
print(df.describe())

(38765, 3)
   Member_number        Date   itemDescription
0           1808  21-07-2015    tropical fruit
1           2552  05-01-2015        whole milk
2           2300  19-09-2015         pip fruit
3           1187  12-12-2015  other vegetables
4           3037  01-02-2015        whole milk
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB
None
       Member_number
count   38765.000000
mean     3003.641868
std      1153.611031
min      1000.000000
25%      2002.000000
50%      3005.000000
75%      4007.000000
max      5000.000000


Normalise item names

In [39]:
import re

In [40]:
def clean_item(item):
    if pd.isna(item):
        return ""
    item = str(item).lower()                 # lowercase
    item = re.sub(r'[^a-z\s]', '', item)     # keep only letters & spaces
    item = re.sub(r'\s+', ' ', item).strip() # collapse multiple spaces
    return item


In [41]:
df['cleaned'] = df['itemDescription'].apply(clean_item)
print(df[['itemDescription', 'cleaned']].head(10))

    itemDescription           cleaned
0    tropical fruit    tropical fruit
1        whole milk        whole milk
2         pip fruit         pip fruit
3  other vegetables  other vegetables
4        whole milk        whole milk
5        rolls/buns         rollsbuns
6  other vegetables  other vegetables
7        pot plants        pot plants
8        whole milk        whole milk
9    tropical fruit    tropical fruit


Manual mapping

In [42]:
mapping = {
    # Dairy
    "milk": "dairy",
    "butter milk": "dairy",
    "cream": "dairy",
    "cream cheese": "dairy",
    "curd cheese": "dairy",
    "processed cheese": "dairy",
    "soft cheese": "dairy",
    "spread cheese": "dairy",
    "whippedsour cream": "dairy",
    
    # Beverages
    "beverages": "beverages",
    "cocoa drinks": "beverages",
    "instant coffee": "beverages",
    "liqueur": "beverages",
    "liquor": "beverages",
    "rum": "beverages",
    "whisky": "beverages",
    "sparkling wine": "beverages",
    "redblush wine": "beverages",
    "white wine": "beverages",
    "bottled beer": "beverages",
    "canned beer": "beverages",
    
    # Fruits & Vegetables
    "fruits": "fruits",
    "grapes": "fruits",
    "frozen fruits": "fruits",
    "vegetables": "vegetables",
    "onions": "vegetables",
    "potato products": "vegetables",
    "packaged fruitvegetables": "fruits_vegetables",
    
    # Meat & Fish
    "meat": "meat",
    "chicken": "meat",
    "frozen chicken": "meat",
    "pork": "meat",
    "ham": "meat",
    "hamburger meat": "meat",
    "frankfurter": "meat",
    "fish": "fish",
    "frozen fish": "fish",
    "turkey": "meat",
    
    # Bakery & Sweets
    "bakery": "bakery",
    "long life bakery product": "bakery",
    "white bread": "bakery",
    "roll products": "bakery",
    "rollsbuns": "bakery",
    "waffles": "bakery",
    "cake bar": "sweets",
    "chocolate marshmallow": "sweets",
    "chocolate": "sweets",
    "candy": "sweets",
    "sweets": "sweets",
    "specialty chocolate": "sweets",
    "tidbits": "sweets",
    
    # Snacks
    "snack products": "snacks",
    "salty snack": "snacks",
    "nut snack": "snacks",
    "popcorn": "snacks",
    
    # Pantry / Cooking Ingredients
    "sugar": "sweeteners",
    "salt": "spices",
    "spices": "spices",
    "sauces": "condiments",
    "ketchup": "condiments",
    "mayonnaise": "condiments",
    "salad dressing": "condiments",
    "oil": "oils",
    "mustard": "oils",
    "vinegar": "condiments",
    "baking powder": "baking_ingredients",
    "pasta": "grains",
    "rice": "grains",
    "flour": "grains",
    "pudding powder": "baking_ingredients",
    "ready soups": "ready_meals",
    "instant food products": "ready_meals",
    
    # Household / Cleaning
    "abrasive cleaner": "cleaning",
    "bathroom cleaner": "cleaning",
    "dish cleaner": "cleaning",
    "cleaner": "cleaning",
    "toilet cleaner": "cleaning",
    "detergent": "cleaning",
    "softener": "cleaning",
    "kitchen towels": "household",
    "house keeping products": "household",
    
    # Personal Care
    "skin care": "personal_care",
    "hair spray": "personal_care",
    "male cosmetics": "personal_care",
    "female sanitary products": "personal_care",
    "make up remover": "personal_care",
    
    # Pets
    "cat food": "pet_care",
    "dog food": "pet_care",
    "pet care": "pet_care",
    
    # Misc
    "bags": "bags",
    "candles": "decor",
    "flower seeds": "gardening",
    "flower soilfertilizer": "gardening",
    "pot plants": "gardening",
    "photofilm": "electronics",
    "newspapers": "stationery",
    "shopping bags": "bags",
    "cling filmbags": "bags",
    "finished products": "misc",
    "organic products": "organic",
    "organic sausage": "meat",
    "organic foods": "organic"  # for any other remaining organic
}



In [43]:
def map_item(item):
    return mapping.get(item, item)  # Default to original 
df['mapped'] = df['cleaned'].apply(map_item)
print(df[['cleaned', 'mapped']].head(10))

            cleaned            mapped
0    tropical fruit    tropical fruit
1        whole milk        whole milk
2         pip fruit         pip fruit
3  other vegetables  other vegetables
4        whole milk        whole milk
5         rollsbuns            bakery
6  other vegetables  other vegetables
7        pot plants         gardening
8        whole milk        whole milk
9    tropical fruit    tropical fruit


Group items by Member

In [44]:
baskets=df.groupby('Member_number')['mapped'].apply(lambda x:list(set(x)))
baskets.head()

Member_number
1000    [soda, yogurt, sausage, snacks, whole milk, mi...
1001    [soda, sausage, whole milk, bakery, beef, curd...
1002    [sweeteners, other vegetables, whole milk, tro...
1003    [sausage, cleaning, bakery, dental care, root ...
1004    [bags, other vegetables, bakery, whole milk, c...
Name: mapped, dtype: object

In [45]:
all_items = [item for sublist in baskets for item in sublist]

# See unique items
unique_items = sorted(set(all_items))
print(len(unique_items))

99


In [46]:
# Drop rare item categories before encoding
value_counts = df['mapped'].value_counts()
common_items = value_counts[value_counts >= 10].index 
filtered_baskets = baskets.apply(lambda items: [x for x in items if x in common_items])



In [47]:
from mlxtend.preprocessing import TransactionEncoder

In [48]:
transaction = filtered_baskets.tolist()

In [49]:
t = TransactionEncoder()

In [50]:
to_array = t.fit(transaction).transform(transaction)
encoded = pd.DataFrame(to_array, columns = t.columns_)

In [51]:
print(encoded.head())

   artif sweetener   bags  bakery  baking_ingredients   beef  berries  \
0            False  False   False               False  False    False   
1            False  False    True               False   True    False   
2            False  False   False               False  False    False   
3            False  False    True               False  False    False   
4            False   True    True               False  False    False   

   beverages  bottled water  brandy  brown bread  ...  sweeteners  sweets  \
0       True          False   False        False  ...       False   False   
1      False          False   False        False  ...       False   False   
2      False          False   False        False  ...        True    True   
3      False          False   False        False  ...       False   False   
4       True          False   False        False  ...       False    True   

   syrup    tea  tropical fruit  uhtmilk  vegetables  whole milk  yogurt  \
0  False  False       

In [52]:
df1 = pd.DataFrame(to_array.astype(int),columns = t.columns_)

In [53]:
print(df1.head())

   artif sweetener  bags  bakery  baking_ingredients  beef  berries  \
0                0     0       0                   0     0        0   
1                0     0       1                   0     1        0   
2                0     0       0                   0     0        0   
3                0     0       1                   0     0        0   
4                0     1       1                   0     0        0   

   beverages  bottled water  brandy  brown bread  ...  sweeteners  sweets  \
0          1              0       0            0  ...           0       0   
1          0              0       0            0  ...           0       0   
2          0              0       0            0  ...           1       1   
3          0              0       0            0  ...           0       0   
4          1              0       0            0  ...           0       1   

   syrup  tea  tropical fruit  uhtmilk  vegetables  whole milk  yogurt  \
0      0    0               0       

Apriori

In [54]:
from mlxtend.frequent_patterns import fpgrowth

In [55]:
print(encoded.columns.tolist())


['artif sweetener', 'bags', 'bakery', 'baking_ingredients', 'beef', 'berries', 'beverages', 'bottled water', 'brandy', 'brown bread', 'butter', 'canned fish', 'canned fruit', 'canned vegetables', 'cereals', 'chewing gum', 'citrus fruit', 'cleaning', 'coffee', 'condensed milk', 'condiments', 'cooking chocolate', 'cookware', 'curd', 'dairy', 'decor', 'dental care', 'dessert', 'dishes', 'domestic eggs', 'electronics', 'fish', 'frozen dessert', 'frozen meals', 'frozen potato products', 'frozen vegetables', 'fruits', 'fruits_vegetables', 'fruitvegetable juice', 'gardening', 'grains', 'hard cheese', 'herbs', 'honey', 'household', 'hygiene articles', 'ice cream', 'jam', 'light bulbs', 'liquor appetizer', 'liver loaf', 'margarine', 'meat', 'meat spreads', 'misc', 'misc beverages', 'napkins', 'nutsprunes', 'oils', 'organic', 'other vegetables', 'pastry', 'personal_care', 'pet_care', 'pickled vegetables', 'pip fruit', 'prosecco', 'ready_meals', 'root vegetables', 'sausage', 'seasonal products', 

In [56]:
print(encoded.shape)

(3898, 94)


In [57]:
freq = fpgrowth(subset_encoded, min_support=0.1, use_colnames=True)


In [58]:
print(encoded.columns.tolist())


['artif sweetener', 'bags', 'bakery', 'baking_ingredients', 'beef', 'berries', 'beverages', 'bottled water', 'brandy', 'brown bread', 'butter', 'canned fish', 'canned fruit', 'canned vegetables', 'cereals', 'chewing gum', 'citrus fruit', 'cleaning', 'coffee', 'condensed milk', 'condiments', 'cooking chocolate', 'cookware', 'curd', 'dairy', 'decor', 'dental care', 'dessert', 'dishes', 'domestic eggs', 'electronics', 'fish', 'frozen dessert', 'frozen meals', 'frozen potato products', 'frozen vegetables', 'fruits', 'fruits_vegetables', 'fruitvegetable juice', 'gardening', 'grains', 'hard cheese', 'herbs', 'honey', 'household', 'hygiene articles', 'ice cream', 'jam', 'light bulbs', 'liquor appetizer', 'liver loaf', 'margarine', 'meat', 'meat spreads', 'misc', 'misc beverages', 'napkins', 'nutsprunes', 'oils', 'organic', 'other vegetables', 'pastry', 'personal_care', 'pet_care', 'pickled vegetables', 'pip fruit', 'prosecco', 'ready_meals', 'root vegetables', 'sausage', 'seasonal products', 

In [59]:
print(encoded.columns.tolist())


['artif sweetener', 'bags', 'bakery', 'baking_ingredients', 'beef', 'berries', 'beverages', 'bottled water', 'brandy', 'brown bread', 'butter', 'canned fish', 'canned fruit', 'canned vegetables', 'cereals', 'chewing gum', 'citrus fruit', 'cleaning', 'coffee', 'condensed milk', 'condiments', 'cooking chocolate', 'cookware', 'curd', 'dairy', 'decor', 'dental care', 'dessert', 'dishes', 'domestic eggs', 'electronics', 'fish', 'frozen dessert', 'frozen meals', 'frozen potato products', 'frozen vegetables', 'fruits', 'fruits_vegetables', 'fruitvegetable juice', 'gardening', 'grains', 'hard cheese', 'herbs', 'honey', 'household', 'hygiene articles', 'ice cream', 'jam', 'light bulbs', 'liquor appetizer', 'liver loaf', 'margarine', 'meat', 'meat spreads', 'misc', 'misc beverages', 'napkins', 'nutsprunes', 'oils', 'organic', 'other vegetables', 'pastry', 'personal_care', 'pet_care', 'pickled vegetables', 'pip fruit', 'prosecco', 'ready_meals', 'root vegetables', 'sausage', 'seasonal products', 

In [60]:
from mlxtend.frequent_patterns import association_rules


In [61]:
rules = association_rules(freq,metric='lift',min_threshold=1)
rules = rules[['antecedents','consequents','support','confidence','lift']]

In [62]:
filtering = rules[(rules['confidence']>0.6) & (rules['lift']>1.2)]
print(filtering)

                         antecedents                      consequents  \
0                             (soda)                         (sweets)   
1                           (sweets)                           (soda)   
11                    (soda, bakery)                         (sweets)   
17              (bakery, whole milk)               (other vegetables)   
23                          (pastry)                         (sweets)   
..                               ...                              ...   
946                         (coffee)                   (citrus fruit)   
948        (coffee, root vegetables)                   (citrus fruit)   
949           (coffee, citrus fruit)                (root vegetables)   
950  (root vegetables, citrus fruit)                         (coffee)   
951                         (coffee)  (root vegetables, citrus fruit)   

     support  confidence      lift  
0       0.25    0.625000  1.562500  
1       0.25    0.625000  1.562500  
11      0.10