In [1]:
import numpy as np
import pandas as pd

DataSet Loading And Inspection

In [2]:
df=pd.read_csv('Groceries_dataset.csv')

In [3]:
print(df.shape)
print(df.info())

(38765, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB
None


Normalise item names

In [4]:
import re

In [5]:
def clean_item(item):
    if pd.isna(item):
        return ""
    item = str(item).lower()                 # lowercase
    item = re.sub(r'[^a-z\s]', '', item)     # keep only letters & spaces
    item = re.sub(r'\s+', ' ', item).strip() # collapse multiple spaces
    return item


In [6]:
df['cleaned'] = df['itemDescription'].apply(clean_item)
print(df[['itemDescription', 'cleaned']].head(2))

  itemDescription         cleaned
0  tropical fruit  tropical fruit
1      whole milk      whole milk


Manual mapping

In [7]:
mapping = {
    # Dairy
    "milk": "dairy",
    "butter milk": "dairy",
    "cream": "dairy",
    "cream cheese": "dairy",
    "curd cheese": "dairy",
    "processed cheese": "dairy",
    "soft cheese": "dairy",
    "spread cheese": "dairy",
    "whippedsour cream": "dairy",
    
    # Beverages
    "beverages": "beverages",
    "cocoa drinks": "beverages",
    "instant coffee": "beverages",
    "liqueur": "beverages",
    "liquor": "beverages",
    "rum": "beverages",
    "whisky": "beverages",
    "sparkling wine": "beverages",
    "redblush wine": "beverages",
    "white wine": "beverages",
    "bottled beer": "beverages",
    "canned beer": "beverages",
    
    # Fruits & Vegetables
    "fruits": "fruits",
    "grapes": "fruits",
    "frozen fruits": "fruits",
    "vegetables": "vegetables",
    "other vegetables": "vegetables",
    "frozen vegetables": "vegetables",
    "onions": "vegetables",
    "potato products": "vegetables",
    "packaged fruitvegetables": "fruits_vegetables",
    
    # Meat & Fish
    "meat": "meat",
    "chicken": "meat",
    "frozen chicken": "meat",
    "pork": "meat",
    "ham": "meat",
    "hamburger meat": "meat",
    "frankfurter": "meat",
    "fish": "fish",
    "frozen fish": "fish",
    "turkey": "meat",
    
    # Bakery & Sweets
    "bakery": "bakery",
    "long life bakery product": "bakery",
    "white bread": "bakery",
    "roll products": "bakery",
    "rollsbuns": "bakery",
    "waffles": "bakery",
    "cake bar": "sweets",
    "chocolate marshmallow": "sweets",
    "chocolate": "sweets",
    "candy": "sweets",
    "sweets": "sweets",
    "specialty chocolate": "sweets",
    "tidbits": "sweets",
    
    # Snacks
    "snack products": "snacks",
    "salty snack": "snacks",
    "nut snack": "snacks",
    "popcorn": "snacks",
    
    # Pantry / Cooking Ingredients
    "sugar": "sweeteners",
    "salt": "spices",
    "spices": "spices",
    "sauces": "condiments",
    "ketchup": "condiments",
    "mayonnaise": "condiments",
    "salad dressing": "condiments",
    "oil": "oils",
    "mustard": "oils",
    "vinegar": "condiments",
    "baking powder": "baking_ingredients",
    "pasta": "grains",
    "rice": "grains",
    "flour": "grains",
    "pudding powder": "baking_ingredients",
    "ready soups": "ready_meals",
    "instant food products": "ready_meals",
    
    # Household / Cleaning
    "abrasive cleaner": "cleaning",
    "bathroom cleaner": "cleaning",
    "dish cleaner": "cleaning",
    "cleaner": "cleaning",
    "toilet cleaner": "cleaning",
    "detergent": "cleaning",
    "softener": "cleaning",
    "kitchen towels": "household",
    "house keeping products": "household",
    
    # Personal Care
    "skin care": "personal_care",
    "hair spray": "personal_care",
    "male cosmetics": "personal_care",
    "female sanitary products": "personal_care",
    "make up remover": "personal_care",
    
    # Pets
    "cat food": "pet_care",
    "dog food": "pet_care",
    "pet care": "pet_care",
    
    # Misc
    "bags": "bags",
    "candles": "decor",
    "flower seeds": "gardening",
    "flower soilfertilizer": "gardening",
    "pot plants": "gardening",
    "photofilm": "electronics",
    "newspapers": "stationery",
    "shopping bags": "bags",
    "cling filmbags": "bags",
    "finished products": "misc",
    "organic products": "organic",
    "organic sausage": "meat",
    "organic foods": "organic"  # for any other remaining organic
}



In [8]:
def map_item(item):
    return mapping.get(item, item)  # Default to original 
df['mapped'] = df['cleaned'].apply(map_item)
print(df[['cleaned', 'mapped']].head(2))

          cleaned          mapped
0  tropical fruit  tropical fruit
1      whole milk      whole milk


Group items by Member

In [9]:
baskets=df.groupby('Member_number')['mapped'].apply(lambda x:list(set(x)))
baskets.head(2)

Member_number
1000    [whole milk, pastry, snacks, pickled vegetable...
1001    [beef, dairy, whole milk, curd, soda, meat, ba...
Name: mapped, dtype: object

In [10]:
all_items = [item for sublist in baskets for item in sublist]

# See unique items
unique_items = sorted(set(all_items))
print(len(unique_items))

97


In [11]:
# Drop rare item categories before encoding
value_counts = df['mapped'].value_counts()
common_items = value_counts[value_counts >= 10].index 
filtered_baskets = baskets.apply(lambda items: [x for x in items if x in common_items])



Transforming

In [12]:
from mlxtend.preprocessing import TransactionEncoder

In [13]:
transaction = filtered_baskets.tolist()

In [14]:
t = TransactionEncoder()

In [15]:
to_array = t.fit(transaction).transform(transaction)
encoded = pd.DataFrame(to_array, columns = t.columns_)

In [16]:
print(encoded.head(2))

   artif sweetener   bags  bakery  baking_ingredients   beef  berries  \
0            False  False   False               False  False    False   
1            False  False    True               False   True    False   

   beverages  bottled water  brandy  brown bread  ...  sweeteners  sweets  \
0       True          False   False        False  ...       False   False   
1      False          False   False        False  ...       False   False   

   syrup    tea  tropical fruit  uhtmilk  vegetables  whole milk  yogurt  \
0  False  False           False    False       False        True    True   
1  False  False           False    False       False        True   False   

   zwieback  
0     False  
1     False  

[2 rows x 92 columns]


In [17]:
df1 = pd.DataFrame(to_array.astype(int),columns = t.columns_)

In [18]:
print(df1.head(2))

   artif sweetener  bags  bakery  baking_ingredients  beef  berries  \
0                0     0       0                   0     0        0   
1                0     0       1                   0     1        0   

   beverages  bottled water  brandy  brown bread  ...  sweeteners  sweets  \
0          1              0       0            0  ...           0       0   
1          0              0       0            0  ...           0       0   

   syrup  tea  tropical fruit  uhtmilk  vegetables  whole milk  yogurt  \
0      0    0               0        0           0           1       1   
1      0    0               0        0           0           1       0   

   zwieback  
0         0  
1         0  

[2 rows x 92 columns]


Fpgrowth

In [19]:
from mlxtend.frequent_patterns import fpgrowth

In [20]:
print(encoded.columns.tolist())


['artif sweetener', 'bags', 'bakery', 'baking_ingredients', 'beef', 'berries', 'beverages', 'bottled water', 'brandy', 'brown bread', 'butter', 'canned fish', 'canned fruit', 'canned vegetables', 'cereals', 'chewing gum', 'citrus fruit', 'cleaning', 'coffee', 'condensed milk', 'condiments', 'cooking chocolate', 'cookware', 'curd', 'dairy', 'decor', 'dental care', 'dessert', 'dishes', 'domestic eggs', 'electronics', 'fish', 'frozen dessert', 'frozen meals', 'frozen potato products', 'fruits', 'fruits_vegetables', 'fruitvegetable juice', 'gardening', 'grains', 'hard cheese', 'herbs', 'honey', 'household', 'hygiene articles', 'ice cream', 'jam', 'light bulbs', 'liquor appetizer', 'liver loaf', 'margarine', 'meat', 'meat spreads', 'misc', 'misc beverages', 'napkins', 'nutsprunes', 'oils', 'organic', 'pastry', 'personal_care', 'pet_care', 'pickled vegetables', 'pip fruit', 'prosecco', 'ready_meals', 'root vegetables', 'sausage', 'seasonal products', 'semifinished bread', 'sliced cheese', 's

In [21]:
print(encoded.shape)

(3898, 92)


In [22]:
freq = fpgrowth(encoded, min_support=0.01, use_colnames=True)

In [23]:
print(encoded.columns.tolist())

['artif sweetener', 'bags', 'bakery', 'baking_ingredients', 'beef', 'berries', 'beverages', 'bottled water', 'brandy', 'brown bread', 'butter', 'canned fish', 'canned fruit', 'canned vegetables', 'cereals', 'chewing gum', 'citrus fruit', 'cleaning', 'coffee', 'condensed milk', 'condiments', 'cooking chocolate', 'cookware', 'curd', 'dairy', 'decor', 'dental care', 'dessert', 'dishes', 'domestic eggs', 'electronics', 'fish', 'frozen dessert', 'frozen meals', 'frozen potato products', 'fruits', 'fruits_vegetables', 'fruitvegetable juice', 'gardening', 'grains', 'hard cheese', 'herbs', 'honey', 'household', 'hygiene articles', 'ice cream', 'jam', 'light bulbs', 'liquor appetizer', 'liver loaf', 'margarine', 'meat', 'meat spreads', 'misc', 'misc beverages', 'napkins', 'nutsprunes', 'oils', 'organic', 'pastry', 'personal_care', 'pet_care', 'pickled vegetables', 'pip fruit', 'prosecco', 'ready_meals', 'root vegetables', 'sausage', 'seasonal products', 'semifinished bread', 'sliced cheese', 's

Association rules

In [24]:
from mlxtend.frequent_patterns import association_rules

In [25]:
rules = association_rules(freq,metric='lift',min_threshold=1)
rules = rules[['antecedents','consequents','support','confidence','lift']]

In [26]:
filtering = rules[(rules['confidence']>0.6) & (rules['lift']>1.2)]

In [27]:
filtering.sort_values(by='lift', ascending=False).head(2)

Unnamed: 0,antecedents,consequents,support,confidence,lift
44842,"(bags, dairy, meat, yogurt)","(bakery, whole milk)",0.010005,0.661017,2.73239
44836,"(bags, dairy, yogurt, whole milk)","(meat, bakery)",0.010005,0.619048,2.548097


Recommendation

In [28]:
def recommend_for_basket(user_basket, recommendation_dict, top_n=5):
    recommended_items = []
    for item in user_basket:
        recs = recommendation_dict.get(item, [])
        recommended_items.extend(recs)
    recommended_items = list(dict.fromkeys(recommended_items))
    recommended_items = [item for item in recommended_items if item not in user_basket]
    return recommended_items[:top_n]


In [29]:
# Build recommendation dictionary from rules
recommendation_dict = {}
for _, row in filtering.iterrows():
    for antecedent in row['antecedents']:
        if antecedent not in recommendation_dict:
            recommendation_dict[antecedent] = []
        recommendation_dict[antecedent].extend(list(row['consequents']))

for key in recommendation_dict:
    recommendation_dict[key] = list(set(recommendation_dict[key]))


Sample usage

In [30]:
user_basket = ['vegetables', 'dairy']
recommendations = recommend_for_basket(user_basket, recommendation_dict, top_n=3)

print("Your Basket:")
for item in user_basket:
    print(f"   - {item}")

print("\nRecommended Items:")
for idx, rec in enumerate(recommendations, 1):
    print(f"   {idx}. {rec}")


Your Basket:
   - vegetables
   - dairy

Recommended Items:
   1. whole milk
   2. beverages
   3. meat


In [None]:
# Save encoded matrix
df1.to_csv("encoded_baskets.csv", index=False)

# Also save mapping info 
df.to_csv("cleaned_groceries.csv", index=False)